/*
* Copyright (c) 2010 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*
* This file implements the following functions for the Cortex-A9 processor:
*
* void bcopy(const void * source,
*            void * destination,
*            size_t length);
*
* void *memmove(void * destination,
*               const void * source,
*               size_t n);
*
* void *memcpy(void * restrict destination,
*              const void * restrict source,
*              size_t n);
*
* All copy n successive bytes from source to destination. Memmove and memcpy
* return destination, whereas bcopy has no return value. Copying takes place
* as if it were through a temporary buffer -- after return destination
* contains exactly the bytes from source, even if the buffers overlap (this is
* not required of memcpy by the C standard, but it is the behavior provided by
* this function on OS X and iOS).
*/
#include <arm/arch.h>
#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
/*****************************************************************************
* Macros *
*****************************************************************************/
// A9_ENTRY(name): word-align the location counter, export the symbol
// _<name>$VARIANT$CortexA9 and define it at the current address. The macro
// body collapses onto one logical line after preprocessing, so the three
// assembler statements must be separated with ';'. The .globl is needed
// because nothing in this file references the entry points; they can only be
// reached from outside this object.
#define A9_ENTRY(name) \
	.align 2;\
	.globl _ ## name ## $VARIANT$CortexA9;\
	_ ## name ## $VARIANT$CortexA9:

// ESTABLISH_FRAME: save r0 (the destination pointer, which is the return
// value of memcpy/memmove), r4 (used as a temporary by the copy loops), and
// the r7/lr pair. After the push the saved r7 sits at sp+8 with the saved lr
// directly above it, so r7 is pointed there to link a frame record for this
// function.
#define ESTABLISH_FRAME \
	push {r0,r4,r7,lr};\
	add r7, sp, #8

// CLEAR_FRAME_AND_RETURN: undo ESTABLISH_FRAME. Restores r0 (so the original
// destination pointer is returned), r4 and r7, and returns by loading the
// saved lr into pc.
#define CLEAR_FRAME_AND_RETURN \
	pop {r0,r4,r7,pc}

// Registers in COPY_REGISTERS that are not already saved by ESTABLISH_FRAME
// and are pushed around the 64-byte main copy loops.
#define ADDITIONAL_CALLEE_SAVE_REGISTERS {r5,r6,r8,r10}

// The eight registers (32 bytes) moved by each ldm/stm in the main loops.
#define COPY_REGISTERS {r3,r4,r5,r6,r8,r9,r10,r12}
/*****************************************************************************
* entry points *
*****************************************************************************/
.text
// Unified assembler syntax, ARM (32-bit) instruction encoding.
.syntax unified
.code 32
// bcopy(source, destination, length): r0 = source, r1 = destination,
// r2 = length on entry.
A9_ENTRY(bcopy)
// Translate bcopy calls into memcpy calls by swapping the first and second
// arguments (r3 is the temporary), then fall through into the shared
// memcpy/memmove entry point below.
mov r3, r0
mov r0, r1
mov r1, r3
// memcpy and memmove share one implementation:
// r0 = destination, r1 = source, r2 = length.
A9_ENTRY(memcpy)
A9_ENTRY(memmove)
// Our preference is to copy the data in ascending address order, but if the
// buffers overlap such that the beginning of the destination buffer aliases
// the end of the source buffer, we need to copy in descending address order
// instead to preserve the memmove semantics. We detect this case with the
// test:
//
// destination - source < length (unsigned compare)
//
// If the address of the source buffer is higher than the address of the
// destination buffer, this arithmetic can overflow, but the overflowed value
// can only be smaller than length if the buffers do not overlap, so we don't
// need to worry about false positives due to the overflow (they happen, but
// only in cases where copying in either order is correct).
//
// r3 = destination - source. If it is zero the two buffers are the same and
// there is nothing to copy, so return at once with r0 (destination) untouched.
subs r3, r0, r1
bxeq lr
// Save r0, r4, r7 and lr. The saved r0 is what CLEAR_FRAME_AND_RETURN hands
// back as the return value, which lets the copy code advance r0 freely.
ESTABLISH_FRAME
// Unsigned (destination - source) < length selects the descending copy;
// otherwise execution falls through into the ascending copy.
cmp r3, r2
blo L_descendingCopy
/*****************************************************************************
* ascending copy *
*****************************************************************************/
// The layout of the two buffers is such that we can use our preferred
// (ascending address order) copy implementation. Throughout this copy,
// registers are used as follows:
//
// r0 lowest unwritten address in the destination buffer.
// r1 lowest unread address in the source buffer.
// r2 number of bytes remaining to copy less an offset that varies
// with the size of the copies that are being made.
// r3, r4, r5, r6, r8, r9, r10, r12
// temporary registers used to hold the data during copies.
// r12 also used as a scratch register for alignment / length calculations
L_ascendingCopy:
// We begin by checking if less than four bytes are to be copied; if so, we
// branch directly to the small-buffer copy below. Otherwise, we copy up
// to three bytes if needed to make the destination pointer have word (four
// byte) alignment.
//
// From here on r2 = (bytes remaining) - 4.
subs r2, #4
blo L_ascendingLengthLessThanFour
// ip = destination & 3; zero means the destination is already word aligned.
ands ip, r0, #0x3
beq L_ascendingDestinationWordAligned
// ip is 1, 2 or 3 here, so 4 - ip bytes are needed to reach word alignment.
// The first byte is always copied, the second when ip <= 2 (ls) and the
// third when ip < 2 (lo). Loads and stores are interleaved; the add in the
// middle does not set flags, so the ls/lo conditions after it still refer to
// the cmp.
ldrb r3, [r1],#1
cmp ip, #2
ldrbls r4, [r1],#1
strb r3, [r0],#1
ldrblo r3, [r1],#1
// r2 = (length - 4) + ip = bytes remaining after the alignment copy.
add r2, ip
strbls r4, [r0],#1
strblo r3, [r0],#1
// Restore the "remaining - 4" bias; continue with the word-aligned code if
// at least four bytes remain, otherwise fall through.
subs r2, #4
bhs L_ascendingDestinationWordAligned
L_ascendingLengthLessThanFour:
// Conditionally copies up to three bytes, assuming no alignment. This is
// reached when fewer than four bytes remain: either the original length of
// the buffer was smaller than four, or the alignment copy above left fewer
// than four bytes.
//
// r2 = remaining - 4, so its two low bits equal those of the remaining count.
// The shift places bit 1 in the carry flag (copy two bytes when set) and
// bit 0 in the sign flag (copy one byte when set).
lsls ip, r2, #31
ldrbcs r3, [r1],#1
ldrbcs ip, [r1],#1
ldrbmi r4, [r1]
strbcs r3, [r0],#1
strbcs ip, [r0],#1
strbmi r4, [r0]
CLEAR_FRAME_AND_RETURN
L_ascendingDestinationWordAligned:
// We know that the destination has word alignment. If the source is not
// similarly aligned, jump to an unaligned copy loop.
// On entry r2 = (bytes remaining) - 4, and at least four bytes remain.
tst r1, #0x3
bne L_ascendingUnalignedCopy
/*****************************************************************************
* ascending copy, both buffers have word alignment *
*****************************************************************************/
// If less than sixty-four bytes remain to be copied, jump directly to the
// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
// to make the destination pointer have cacheline alignment.
//
// After this subtraction r2 = (bytes remaining) - 64.
subs r2, r2, #0x3c
blo L_ascendingLengthLessThanSixtyFour
// Copy one word at a time until bits 2-4 of the destination address are
// clear (32-byte alignment). If fewer than 64 bytes remain before that
// point is reached, go to the cleanup path instead.
0: tst r0, #0x1c
beq L_ascendingDestinationCachelineAligned
ldr r3, [r1],#4
subs r2, #4
str r3, [r0],#4
bhs 0b
b L_ascendingLengthLessThanSixtyFour
L_ascendingDestinationCachelineAligned:
// Unrolled main copy loop; copies 64 bytes per iteration as two 32-byte
// ldm/stm pairs. The preload lookahead used here is 0x60,
// though anything between 0x40 and 0x100 seems to be "acceptable".
//
// COPY_REGISTERS includes r5, r6, r8 and r10, which are saved around the
// loop. On entry r2 = (bytes remaining) - 64 and is non-negative.
push ADDITIONAL_CALLEE_SAVE_REGISTERS
// The flags set by subs are consumed by the bhs at the bottom of the loop;
// none of the intervening ldm/stm/pld instructions modify them.
0: ldm r1!, COPY_REGISTERS
subs r2, r2, #0x40
stm r0!, COPY_REGISTERS
pld [r1, #0x60]
ldm r1!, COPY_REGISTERS
pld [r1, #0x60]
stm r0!, COPY_REGISTERS
bhs 0b
pop ADDITIONAL_CALLEE_SAVE_REGISTERS
L_ascendingLengthLessThanSixtyFour:
// Cleanup copy of up to 63 bytes. We can assume that both the source and
// destination addresses have word alignment here.
//
// r2 = (bytes remaining) - 64, so its low six bits equal those of the
// remaining count; only those bits are examined below.
// Bits 4-5: number of 16-byte chunks still to copy.
tst r2, #0x30
beq 1f
0: ldm r1!, {r3,r4,r9,ip}
sub r2, r2, #0x10
stm r0!, {r3,r4,r9,ip}
tst r2, #0x30
bne 0b
// Bits 0-3: final 0-15 bytes.
1: tst r2, #0xf
beq 2f
// Shift bit 3 into carry (copy eight bytes) and bit 2 into the sign flag
// (copy four bytes). ip is overwritten by the ldm, but the flags are
// already set.
lsls ip, r2, #29
ldmcs r1!, {r3,ip}
stmcs r0!, {r3,ip}
ldrmi r3, [r1],#4
strmi r3, [r0],#4
// Shift bit 1 into carry (copy a halfword) and bit 0 into the sign flag
// (copy a final byte).
lsls ip, r2, #31
ldrhcs r3, [r1],#2
strhcs r3, [r0],#2
ldrbmi ip, [r1]
strbmi ip, [r0]
2: CLEAR_FRAME_AND_RETURN
/*****************************************************************************
* ascending copy, source buffer is not word aligned *
*****************************************************************************/
L_ascendingUnalignedCopy:
// Destination buffer is word aligned, but source buffer is not. Copy
// byte-by-byte until the destination buffer has eightbyte alignment.
//
// On entry r2 = (bytes remaining) - 4; after this subtraction
// r2 = (bytes remaining) - 8.
subs r2, #4
blo L_ascendingUnalignedByteCleanup
// Copy single bytes until the low three bits of the destination are clear,
// or until fewer than eight bytes remain.
0: tst r0, #0x7
beq L_ascendingUnalignedVectorCopy
ldrb r3, [r1],#1
subs r2, #1
strb r3, [r0],#1
bhs 0b
L_ascendingUnalignedByteCleanup:
// Byte-at-a-time tail copy. On entry r2 = (bytes remaining) - 8; adding 8
// gives the true remaining count, and nothing is copied if that is zero.
adds r2, #8
beq 1f
0: ldrb r3, [r1],#1
subs r2, #1
strb r3, [r0],#1
bne 0b
1: CLEAR_FRAME_AND_RETURN
L_ascendingUnalignedVectorCopy:
// Destination buffer is eightbyte aligned. Source buffer has unknown
// alignment. Use NEON to handle the misaligned copies. We begin by copying
// up to 24 bytes to get cacheline alignment of the destination buffer.
//
// On entry r2 = (bytes remaining) - 8; after this subtraction
// r2 = (bytes remaining) - 32.
subs r2, #0x18
blo L_ascendingUnalignedVectorCleanup
// Copy eight bytes at a time until bits 3-4 of the destination are clear
// (32-byte alignment), or until fewer than 32 bytes remain. The store
// carries a 64-bit alignment hint for the destination; the load has none.
0: tst r0, #0x18
beq L_ascendingUnalignedCachelineCopy
vld1.8 {d0}, [r1]!
subs r2, #8
vst1.8 {d0}, [r0,:64]!
bhs 0b
L_ascendingUnalignedVectorCleanup:
// On entry r2 = (bytes remaining) - 32 and is negative. Adding 0x18 rebiases
// it to (bytes remaining) - 8, which is the bias expected by
// L_ascendingUnalignedByteCleanup. The carry out of the addition is clear
// exactly when fewer than eight bytes remain.
adds r2, #0x18
blo L_ascendingUnalignedByteCleanup
// Copy eight bytes at a time while at least eight remain.
0: vld1.8 {d0}, [r1]!
subs r2, #8
vst1.8 {d0}, [r0,:64]!
bhs 0b
b L_ascendingUnalignedByteCleanup
L_ascendingUnalignedCachelineCopy:
// Main copy loop: moves 32 bytes per iteration. The destination is 32-byte
// aligned here (hence the :256 hint on the store); the source is read with
// byte-element loads and needs no particular alignment.
//
// On entry r2 = (bytes remaining) - 32 and is non-negative. The load must
// come first in each iteration: it fills q0/q1 and advances r1, the store
// then writes those 32 bytes and advances r0.
    vld1.8 {q0,q1},[r1]!
    pld [r1, #0x60]
    vst1.8 {q0,q1},[r0,:256]!
    subs r2, #0x20
    bhs L_ascendingUnalignedCachelineCopy
// Fewer than 32 bytes remain; r2 still carries the "remaining - 32" bias
// that the vector cleanup path expects.
    b L_ascendingUnalignedVectorCleanup
/*****************************************************************************
* descending copy *
*****************************************************************************/
// The layout of the two buffers is such that we must copy in descending-
// address order. Throughout this copy, registers are used as follows:
//
// r0 lowest address in the destination buffer that has been written to.
// r1 lowest address in the source buffer that has been read from.
// r2 number of bytes remaining to copy less an offset that varies
// with the size of the copies that are being made.
// r3, r4, r5, r6, r8, r9, r10, r12
// temporary registers used to hold the data during copies.
// r12 also used as a scratch register for alignment / length calculations
L_descendingCopy:
// We begin by checking if less than four bytes are to be copied; if so, we
// branch directly to the small-buffer copy below. Otherwise, we copy up
// to three bytes if needed to make the destination pointer have word (four
// byte) alignment.
//
// Advance both pointers by the length so that they point one past the end of
// their buffers; all accesses below step downwards from there.
add r1, r2
add r0, r2
// From here on r2 = (bytes remaining) - 4.
subs r2, #4
blo L_descendingLengthLessThanFour
// ip = (end of destination) & 3; zero means it is already word aligned.
ands ip, r0, #0x3
beq L_descendingDestinationWordAligned
// ip is 1, 2 or 3 here and is exactly the number of bytes that must be
// copied (downwards) to reach word alignment. The first byte is always
// copied, the second when ip >= 2 (hs) and the third when ip > 2 (hi).
ldrb r3, [r1, #-1]!
cmp ip, #2
ldrbhs r4, [r1, #-1]!
strb r3, [r0, #-1]!
ldrbhi r3, [r1, #-1]!
strbhs r4, [r0, #-1]!
strbhi r3, [r0, #-1]!
// Account for the ip bytes just copied, keeping the "remaining - 4" bias;
// continue with the word-aligned code if at least four bytes remain,
// otherwise fall through.
subs r2, ip
bhs L_descendingDestinationWordAligned
L_descendingLengthLessThanFour:
// Conditionally copies up to three bytes, assuming no alignment. This is
// reached when fewer than four bytes remain: either the original length of
// the buffer was smaller than four, or the alignment copy above left fewer
// than four bytes.
//
// r2 = remaining - 4, so its two low bits equal those of the remaining count.
// The shift places bit 1 in the carry flag (copy two bytes when set) and
// bit 0 in the sign flag (copy one byte when set).
lsls ip, r2, #31
ldrbcs r3, [r1, #-1]!
ldrbcs ip, [r1, #-1]!
ldrbmi r4, [r1, #-1]
strbcs r3, [r0, #-1]!
strbcs ip, [r0, #-1]!
strbmi r4, [r0, #-1]
CLEAR_FRAME_AND_RETURN
L_descendingDestinationWordAligned:
// We know that the destination has word alignment. If the source is not
// similarly aligned, jump to an unaligned copy loop.
// On entry r2 = (bytes remaining) - 4, and at least four bytes remain.
tst r1, #0x3
bne L_descendingUnalignedCopy
/*****************************************************************************
* descending copy, both buffers have word alignment *
*****************************************************************************/
// If less than sixty-four bytes remain to be copied, jump directly to the
// word-aligned cleanup path. Otherwise, we copy up to 28 bytes as needed
// to make the destination pointer have cacheline alignment.
//
// After this subtraction r2 = (bytes remaining) - 64.
subs r2, r2, #0x3c
blo L_descendingLengthLessThanSixtyFour
// Copy one word at a time (pre-decrementing both pointers) until bits 2-4
// of the destination address are clear (32-byte alignment). If fewer than
// 64 bytes remain before that point is reached, go to the cleanup path.
0: tst r0, #0x1c
beq L_descendingDestinationCachelineAligned
ldr r3, [r1, #-4]!
subs r2, #4
str r3, [r0, #-4]!
bhs 0b
b L_descendingLengthLessThanSixtyFour
L_descendingDestinationCachelineAligned:
// Unrolled main copy loop; copies 64 bytes per iteration as two 32-byte
// ldmdb/stmdb pairs. The preload lookahead used here is -0x80,
// though anything between -0x40 and -0x100 seems to be "acceptable".
//
// COPY_REGISTERS includes r5, r6, r8 and r10, which are saved around the
// loop. On entry r2 = (bytes remaining) - 64 and is non-negative.
push ADDITIONAL_CALLEE_SAVE_REGISTERS
// The flags set by subs are consumed by the bhs at the bottom of the loop;
// none of the intervening ldmdb/stmdb/pld instructions modify them.
0: ldmdb r1!, COPY_REGISTERS
subs r2, r2, #0x40
stmdb r0!, COPY_REGISTERS
pld [r1, #-0x80]
ldmdb r1!, COPY_REGISTERS
pld [r1, #-0x80]
stmdb r0!, COPY_REGISTERS
bhs 0b
pop ADDITIONAL_CALLEE_SAVE_REGISTERS
L_descendingLengthLessThanSixtyFour:
// Cleanup copy of up to 63 bytes. We can assume that both the source and
// destination addresses have word alignment here.
//
// r2 = (bytes remaining) - 64, so its low six bits equal those of the
// remaining count; only those bits are examined below.
// Bits 4-5: number of 16-byte chunks still to copy.
tst r2, #0x30
beq 1f
0: ldmdb r1!, {r3,r4,r9,ip}
sub r2, r2, #0x10
stmdb r0!, {r3,r4,r9,ip}
tst r2, #0x30
bne 0b
// Bits 0-3: final 0-15 bytes.
1: tst r2, #0xf
beq 2f
// Shift bit 3 into carry (copy eight bytes) and bit 2 into the sign flag
// (copy four bytes). ip is overwritten by the ldmdb, but the flags are
// already set.
lsls ip, r2, #29
ldmdbcs r1!, {r3,ip}
stmdbcs r0!, {r3,ip}
ldrmi r3, [r1, #-4]!
strmi r3, [r0, #-4]!
// Shift bit 1 into carry (copy a halfword) and bit 0 into the sign flag
// (copy a final byte).
lsls ip, r2, #31
ldrhcs r3, [r1, #-2]!
strhcs r3, [r0, #-2]!
ldrbmi ip, [r1, #-1]
strbmi ip, [r0, #-1]
2: CLEAR_FRAME_AND_RETURN
/*****************************************************************************
* descending copy, source buffer is not word aligned *
*****************************************************************************/
L_descendingUnalignedCopy:
// Destination buffer is word aligned, but source buffer is not. Copy
// byte-by-byte until the destination buffer has eightbyte alignment.
//
// On entry r2 = (bytes remaining) - 4; after this subtraction
// r2 = (bytes remaining) - 8.
subs r2, #4
blo L_descendingUnalignedByteCleanup
// Copy single bytes (downwards) until the low three bits of the destination
// are clear, or until fewer than eight bytes remain.
0: tst r0, #0x7
beq L_descendingUnalignedVectorCopy
ldrb r3, [r1, #-1]!
subs r2, #1
strb r3, [r0, #-1]!
bhs 0b
L_descendingUnalignedByteCleanup:
// Byte-at-a-time tail copy. On entry r2 = (bytes remaining) - 8; adding 8
// gives the true remaining count, and nothing is copied if that is zero.
adds r2, #8
beq 1f
0: ldrb r3, [r1, #-1]!
subs r2, #1
strb r3, [r0, #-1]!
bne 0b
1: CLEAR_FRAME_AND_RETURN
L_descendingUnalignedVectorCopy:
// Destination buffer is eightbyte aligned. Source buffer has unknown
// alignment. Use NEON to handle the misaligned copies. We begin by copying
// up to 24 bytes to get cacheline alignment of the destination buffer.
//
// On entry r2 = (bytes remaining) - 8; after this subtraction
// r2 = (bytes remaining) - 32.
subs r2, #0x18
blo L_descendingUnalignedVectorCleanup
// Copy eight bytes at a time until bits 3-4 of the destination are clear
// (32-byte alignment), or until fewer than 32 bytes remain. Both pointers
// are stepped down by eight explicitly before each access, and the vector
// load/store is issued without writeback.
0: tst r0, #0x18
beq L_descendingUnalignedCachelineCopy
sub r1, #8
vld1.8 {d0}, [r1]
sub r0, #8
vst1.8 {d0}, [r0,:64]
subs r2, #8
bhs 0b
L_descendingUnalignedVectorCleanup:
// On entry r2 = (bytes remaining) - 32 and is negative. Adding 0x18 rebiases
// it to (bytes remaining) - 8, which is the bias expected by
// L_descendingUnalignedByteCleanup. The carry out of the addition is clear
// exactly when fewer than eight bytes remain.
adds r2, #0x18
blo L_descendingUnalignedByteCleanup
// Copy eight bytes at a time while at least eight remain.
0: sub r1, #8
vld1.8 {d0}, [r1]
sub r0, #8
vst1.8 {d0}, [r0,:64]
subs r2, #8
bhs 0b
b L_descendingUnalignedByteCleanup
L_descendingUnalignedCachelineCopy:
// Main copy loop: moves 32 bytes per iteration. The destination is 32-byte
// aligned here (hence the :256 hint on the store); the source is read with
// byte-element loads and needs no particular alignment.
//
// The vector load and store access the 32 bytes starting AT the base address
// and then post-increment the base by r4 (-32). r1 and r0 point one past the
// next block to copy, so BOTH must be biased down by 32 before the loop, and
// the bias is removed from both afterwards. r4 was saved by ESTABLISH_FRAME,
// so it is free to hold the stride.
//
// On entry r2 = (bytes remaining) - 32 and is non-negative.
    sub r1, #32
    sub r0, #32
    mov r4, #-32
0:  vld1.8 {q0,q1},[r1], r4
    pld [r1, #-0x60]
    vst1.8 {q0,q1},[r0,:256], r4
    subs r2, #0x20
    bhs 0b
// Undo the bias so that r1/r0 again point one past the next byte to copy.
    add r1, #32
    add r0, #32
// Fewer than 32 bytes remain; r2 still carries the "remaining - 32" bias
// that the vector cleanup path expects.
    b L_descendingUnalignedVectorCleanup
#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD