/*
 * Copyright (c) 2010 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the Cortex-A9 processor:
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All three copy n successive bytes from source to destination.  Memmove and
 * memcpy return destination, whereas bcopy has no return value.  Copying takes
 * place as if it were through a temporary buffer -- after the call returns,
 * destination contains exactly the bytes from source, even if the buffers
 * overlap (this is not required of memcpy by the C standard; its behavior is
 * undefined if the buffers overlap, but we are holding ourselves to the
 * historical behavior of memcpy on OS X and iOS).
 */
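
/*
 *  For illustration only (not assembled): the shared semantics of the three
 *  entry points can be sketched in C roughly as below.  "copy_model" is a
 *  hypothetical name; the real code below chooses the copy direction with the
 *  same unsigned pointer-difference test described before the entry points.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      void *copy_model(void *dst, const void *src, size_t n) {
 *          unsigned char *d = dst;
 *          const unsigned char *s = src;
 *          if (d == s || n == 0)
 *              return dst;
 *          if ((uintptr_t)d - (uintptr_t)s >= n)   // forward copy is safe
 *              for (size_t i = 0; i < n; ++i) d[i] = s[i];
 *          else                                    // overlap hazard: copy backward
 *              for (size_t i = n; i > 0; --i) d[i - 1] = s[i - 1];
 *          return dst;
 *      }
 */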

#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD

/*****************************************************************************
 * Macros                                                                    *
 *****************************************************************************/

#define A9_ENTRY(name) \
	.align 2;\
	.globl _ ## name ## $VARIANT$CortexA9;\
	_ ## name ## $VARIANT$CortexA9:

#define ESTABLISH_FRAME \
	push   {r0,r4,r7,lr};\
	add     r7,     sp, #8
    
#define CLEAR_FRAME_AND_RETURN \
	pop    {r0,r4,r7,pc}
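
//  Note that ESTABLISH_FRAME pushes the incoming destination pointer (r0)
//  along with the callee-saved registers, and CLEAR_FRAME_AND_RETURN pops it
//  straight back into r0; this is how memcpy and memmove return the
//  destination pointer without any additional instructions.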
    
#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}

/*****************************************************************************
 *  entry points                                                             *
 *****************************************************************************/

.text
.syntax unified
.code 32

A9_ENTRY(bcopy)
//  Translate bcopy calls into memcpy calls by swapping the first and second
//  arguments.
	mov     r3,     r0
	mov     r0,     r1
	mov     r1,     r3

A9_ENTRY(memcpy)
A9_ENTRY(memmove)
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics.  We detect this case with the
//  test:
//
//      destination - source < length    (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
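//
//  For example, with source 0x1000, destination 0x0F00, and length 0x200, the
//  buffers overlap, but the difference wraps to 0xFFFFFF00, which is not below
//  the length, so the ascending copy is (correctly) chosen.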
	subs    r3,     r0, r1
	bxeq    lr
	ESTABLISH_FRAME
	cmp     r3,     r2
	blo     L_descendingCopy

/*****************************************************************************
 *  ascending copy                                                           *
 *****************************************************************************/

//  The layout of the two buffers is such that we can use our preferred
//  (ascending address order) copy implementation.  Throughout this copy,
//  registers are used as follows:
//
//      r0  lowest unwritten address in the destination buffer.
//      r1  lowest unread address in the source buffer.
//      r2  number of bytes remaining to copy less an offset that varies
//          with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//          temporary registers used to hold the data during copies.
//      r12 also used as a scratch register for alignment / length calculations

L_ascendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return.  Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
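//
//  (The head copy moves 4 - (destination & 3) bytes: the first byte is copied
//  unconditionally, the second only if the misalignment is two or less, and
//  the third only if it is exactly one.)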
	subs    r2,         #4
	blo     L_ascendingLengthLessThanFour
	ands    ip,     r0, #0x3
	beq     L_ascendingDestinationWordAligned
	ldrb    r3,    [r1],#1
	cmp     ip,         #2
	ldrbls  r4,    [r1],#1
	strb    r3,    [r0],#1
	ldrblo  r3,    [r1],#1
	add     r2,         ip
	strbls  r4,    [r0],#1
	strblo  r3,    [r0],#1
	subs    r2,         #4
	bhs     L_ascendingDestinationWordAligned
    
L_ascendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment.  This is
//  only used if the original length of the buffer is smaller than four.
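//
//  (The lsls moves bit one of the remaining length into the carry flag and bit
//  zero into the sign flag, so the "cs" loads and stores copy a two-byte tail
//  and the "mi" ones a one-byte tail.)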
	lsls    ip,     r2, #31
	ldrbcs  r3,    [r1],#1
	ldrbcs  ip,    [r1],#1
	ldrbmi  r4,    [r1]
	strbcs  r3,    [r0],#1
	strbcs  ip,    [r0],#1
	strbmi  r4,    [r0]
	CLEAR_FRAME_AND_RETURN
    
L_ascendingDestinationWordAligned:
//  We know that the destination has word alignment.  If the source is not
//  similarly aligned, jump to an unaligned copy loop.
	tst     r1,         #0x3
	bne     L_ascendingUnalignedCopy

/*****************************************************************************
 *  ascending copy, both buffers have word alignment                         *
 *****************************************************************************/
    
//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path.  Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
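//
//  (tst with #0x1c examines bits two through four of the destination address;
//  since the destination is already word aligned, at most seven word copies
//  (28 bytes) are needed to reach a 32-byte cacheline boundary.)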
	subs    r2,     r2, #0x3c
	blo     L_ascendingLengthLessThanSixtyFour
0:  tst     r0,         #0x1c
	beq     L_ascendingDestinationCachelineAligned
	ldr     r3,    [r1],#4
	subs    r2,         #4
	str     r3,    [r0],#4
	bhs     0b
	b       L_ascendingLengthLessThanSixtyFour

L_ascendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that 0x60 is the optimal lookahead for preload,
//  though anything between 0x40 and 0x100 seems to be "acceptable".
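//
//  Each ldm/stm of COPY_REGISTERS transfers eight words (32 bytes), so the two
//  load/store pairs move 64 bytes per iteration; each pld prefetches the data
//  that the source pointer will reach 0x60 bytes later.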
	push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:	ldm     r1!,    COPY_REGISTERS
	subs    r2,     r2, #0x40
	stm     r0!,    COPY_REGISTERS
	pld    [r1, #0x60]
	ldm     r1!,    COPY_REGISTERS
	pld    [r1, #0x60]
	stm     r0!,    COPY_REGISTERS
	bhs     0b
	pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_ascendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes.  We can assume that both the source and
//  destination addresses have word alignment here.
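//
//  The cleanup proceeds by binary decomposition of the count (whose low six
//  bits survive in r2): 16-byte ldm/stm chunks first, then the first lsls
//  routes bits three and two into the carry and sign flags to select the
//  eight- and four-byte copies, and the second lsls does the same with bits
//  one and zero for the two- and one-byte tail.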
    tst     r2,         #0x30
    beq     1f
0:  ldm     r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stm     r0!,   {r3,r4,r9,ip}
    tst     r2,         #0x30
    bne     0b
1:  tst     r2,         #0xf
    beq     2f
    lsls    ip,     r2, #29
    ldmcs   r1!,   {r3,ip}
    stmcs   r0!,   {r3,ip}
    ldrmi   r3,    [r1],#4
    strmi   r3,    [r0],#4
	lsls    ip,     r2, #31
	ldrhcs  r3,    [r1],#2
	strhcs  r3,    [r0],#2
	ldrbmi  ip,    [r1]
	strbmi  ip,    [r0]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  ascending copy, source buffer is not word aligned                        *
 *****************************************************************************/

L_ascendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not.  Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,         #4
    blo     L_ascendingUnalignedByteCleanup
0:  tst     r0,         #0x7
    beq     L_ascendingUnalignedVectorCopy
    ldrb    r3,    [r1],#1
    subs    r2,         #1
    strb    r3,    [r0],#1
    bhs     0b
L_ascendingUnalignedByteCleanup:
    adds    r2,         #8
    beq     1f
0:  ldrb    r3,    [r1],#1
    subs    r2,         #1
    strb    r3,    [r0],#1
    bne     0b
1:  CLEAR_FRAME_AND_RETURN
    
L_ascendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned.  Source buffer has unknown
//  alignment.  Use NEON to handle the misaligned copies.  We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
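//
//  The ":64" qualifier on the vector stores is an alignment hint; it is safe
//  because the destination is eight-byte aligned here.  The main loop below
//  uses ":256" once the destination has been brought to 32-byte alignment.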
    subs    r2,         #0x18
    blo     L_ascendingUnalignedVectorCleanup
0:  tst     r0,         #0x18
    beq     L_ascendingUnalignedCachelineCopy
    vld1.8 {d0},   [r1]!
    subs    r2,         #8
    vst1.8 {d0},   [r0,:64]!
    bhs     0b
L_ascendingUnalignedVectorCleanup:
    adds    r2,         #0x18
    blo     L_ascendingUnalignedByteCleanup
0:  vld1.8 {d0},   [r1]!
    subs    r2,         #8
    vst1.8 {d0},   [r0,:64]!
    bhs     0b
    b       L_ascendingUnalignedByteCleanup
    
L_ascendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration.  Requires only byte alignment
//  of the source address.
    vld1.8 {q0,q1},[r1]!
    pld    [r1, #0x60]
    vst1.8 {q0,q1},[r0,:256]!
    subs    r2,         #0x20
    bhs     L_ascendingUnalignedCachelineCopy
    b       L_ascendingUnalignedVectorCleanup

/*****************************************************************************
 *  descending copy                                                          *
 *****************************************************************************/

//  The layout of the two buffers is such that we must copy in descending-
//  address order.  Throughout this copy, registers are used as follows:
//
//      r0  lowest address in the destination buffer that has been written to.
//      r1  lowest address in the source buffer that has been read from.
//      r2  number of bytes remaining to copy less an offset that varies
//          with the size of the copies that are being made.
//      r3, r4, r5, r6, r8, r9, r10, r12
//          temporary registers used to hold the data during copies.
//      r12 also used as a scratch register for alignment / length calculations

L_descendingCopy:
//  We begin by checking if less than four bytes are to be copied; if so, we
//  branch directly to a small-buffer copy and return.  Otherwise, we copy up
//  to three bytes if needed to make the destination pointer have word (four
//  byte) alignment.
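//
//  The source and destination pointers are first advanced to one byte past the
//  end of their buffers; every subsequent access uses pre-decrement addressing.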
    add     r1,     r2
    add     r0,     r2
    subs    r2,         #4
	blo     L_descendingLengthLessThanFour
	ands    ip,     r0, #0x3
	beq     L_descendingDestinationWordAligned
	ldrb    r3,    [r1, #-1]!
	cmp     ip,         #2
	ldrbhs  r4,    [r1, #-1]!
	strb    r3,    [r0, #-1]!
	ldrbhi  r3,    [r1, #-1]!
	strbhs  r4,    [r0, #-1]!
	strbhi  r3,    [r0, #-1]!
	subs    r2,         ip
	bhs     L_descendingDestinationWordAligned
        
L_descendingLengthLessThanFour:
//  Conditionally copies up to three bytes, assuming no alignment.  This is
//  only used if the original length of the buffer is smaller than four.
	lsls    ip,     r2, #31
	ldrbcs  r3,    [r1, #-1]!
	ldrbcs  ip,    [r1, #-1]!
	ldrbmi  r4,    [r1, #-1]
	strbcs  r3,    [r0, #-1]!
	strbcs  ip,    [r0, #-1]!
	strbmi  r4,    [r0, #-1]
	CLEAR_FRAME_AND_RETURN
    
L_descendingDestinationWordAligned:
//  We know that the destination has word alignment.  If the source is not
//  similarly aligned, jump to an unaligned copy loop.
	tst     r1,         #0x3
	bne     L_descendingUnalignedCopy

/*****************************************************************************
 *  descending copy, both buffers have word alignment                        *
 *****************************************************************************/
    
//  If less than sixty-four bytes remain to be copied, jump directly to the
//  word-aligned cleanup path.  Otherwise, we copy up to 28 bytes as needed
//  to make the destination pointer have cacheline alignment.
	subs    r2,     r2, #0x3c
	blo     L_descendingLengthLessThanSixtyFour
0:  tst     r0,         #0x1c
	beq     L_descendingDestinationCachelineAligned
	ldr     r3,    [r1, #-4]!
	subs    r2,         #4
	str     r3,    [r0, #-4]!
	bhs     0b
	b       L_descendingLengthLessThanSixtyFour

L_descendingDestinationCachelineAligned:
//  Unrolled main copy loop; copies two cachelines (64 bytes) per iteration.
//  Empirical testing suggests that -0x80 is the optimal lookahead for preload,
//  though anything between -0x40 and -0x100 seems to be "acceptable".
	push    ADDITIONAL_CALLEE_SAVE_REGISTERS
0:	ldmdb   r1!,    COPY_REGISTERS
	subs    r2,     r2, #0x40
	stmdb   r0!,    COPY_REGISTERS
	pld    [r1, #-0x80]
	ldmdb   r1!,    COPY_REGISTERS
	pld    [r1, #-0x80]
	stmdb   r0!,    COPY_REGISTERS
	bhs     0b
	pop     ADDITIONAL_CALLEE_SAVE_REGISTERS

L_descendingLengthLessThanSixtyFour:
//  Cleanup copy of up to 63 bytes.  We can assume that both the source and
//  destination addresses have word alignment here.
    tst     r2,         #0x30
    beq     1f
0:  ldmdb   r1!,   {r3,r4,r9,ip}
    sub     r2,     r2, #0x10
    stmdb   r0!,   {r3,r4,r9,ip}
    tst     r2,         #0x30
    bne     0b
1:  tst     r2,         #0xf
    beq     2f
    lsls    ip,     r2, #29
    ldmdbcs r1!,   {r3,ip}
    stmdbcs r0!,   {r3,ip}
    ldrmi   r3,    [r1, #-4]!
    strmi   r3,    [r0, #-4]!
	lsls    ip,     r2, #31
	ldrhcs  r3,    [r1, #-2]!
	strhcs  r3,    [r0, #-2]!
	ldrbmi  ip,    [r1, #-1]
	strbmi  ip,    [r0, #-1]
2:  CLEAR_FRAME_AND_RETURN

/*****************************************************************************
 *  descending copy, source buffer is not word aligned                       *
 *****************************************************************************/

L_descendingUnalignedCopy:
//  Destination buffer is word aligned, but source buffer is not.  Copy
//  byte-by-byte until the destination buffer has eight-byte alignment.
    subs    r2,         #4
    blo     L_descendingUnalignedByteCleanup
0:  tst     r0,         #0x7
    beq     L_descendingUnalignedVectorCopy
    ldrb    r3,    [r1, #-1]!
    subs    r2,         #1
    strb    r3,    [r0, #-1]!
    bhs     0b
L_descendingUnalignedByteCleanup:
    adds    r2,         #8
    beq     1f
0:  ldrb    r3,    [r1, #-1]!
    subs    r2,         #1
    strb    r3,    [r0, #-1]!
    bne     0b
1:  CLEAR_FRAME_AND_RETURN
    
L_descendingUnalignedVectorCopy:
//  Destination buffer is eight-byte aligned.  Source buffer has unknown
//  alignment.  Use NEON to handle the misaligned copies.  We begin by copying
//  up to 24 bytes to get cacheline alignment of the destination buffer.
    subs    r2,         #0x18
    blo     L_descendingUnalignedVectorCleanup
0:  tst     r0,         #0x18
    beq     L_descendingUnalignedCachelineCopy
    sub     r1,         #8
    vld1.8 {d0},   [r1]
    sub     r0,         #8
    vst1.8 {d0},   [r0,:64]
    subs    r2,         #8
    bhs     0b
L_descendingUnalignedVectorCleanup:
    adds    r2,         #0x18
    blo     L_descendingUnalignedByteCleanup
0:  sub     r1,         #8
    vld1.8 {d0},   [r1]
    sub     r0,         #8
    vst1.8 {d0},   [r0,:64]
    subs    r2,         #8
    bhs     0b
    b       L_descendingUnalignedByteCleanup
    
L_descendingUnalignedCachelineCopy:
//  Main copy loop; moves 32 bytes per iteration.  Requires only byte alignment
//  of the source address.
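//
//  vld1/vst1 can only post-index by a register, so r4 (saved by
//  ESTABLISH_FRAME) holds a stride of -32; both pointers are biased down by 32
//  bytes before the loop and re-biased afterwards so that each iteration
//  touches the 32-byte block just below the previous one.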
    sub     r1,         #32
    sub     r0,         #32
    mov     r4,         #-32
0:  vld1.8 {q0,q1},[r1], r4
    pld    [r1, #-0x60]
    vst1.8 {q0,q1},[r0,:256], r4
    subs    r2,         #0x20
    bhs     0b
    add     r1,         #32
    add     r0,         #32
    b       L_descendingUnalignedVectorCleanup

#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD