/*
* Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
* The bcopy/memcpy loops, tuned for Pentium-M class processors with
* Supplemental SSE3 and 64-byte cache lines.
*
* The following #defines are tightly coupled to the u-architecture:
*/
#define kShort 80 // too short to bother with SSE (must be >=80)
#define kVeryLong (500*1024) // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
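/*
 * Informational sketch (comment only, not assembled): roughly how the three
 * cutoffs above route a forward copy, written as C-style pseudocode.  The
 * helper names (copy_word_loop, copy_sse_chunks, copy_rep_movsl, longcopy)
 * are hypothetical stand-ins for the Lshort, LMod0..LMod15, Lfastpath and
 * LVeryLong paths below; the real code makes these checks in assembly, in a
 * slightly different order.
 *
 *	void dispatch(void *dst, const void *src, size_t len)
 *	{
 *		if (len <= kShort)			// short: simple word/byte loop
 *			copy_word_loop(dst, src, len);
 *		else if (len >= kVeryLong)		// huge: non-temporal stores
 *			longcopy(dst, src, len);
 *		else if (len >= kFastUCode && (((long)src - (long)dst) & 7) == 0)
 *			copy_rep_movsl(dst, src, len);	// "rep/movsl" microcode fastpath
 *		else
 *			copy_sse_chunks(dst, src, len);	// 64-byte SSE loops (LMod0..LMod15)
 *	}
 */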
// void bcopy(const void *src, void *dst, size_t len)
.text
.align 5, 0x90
LZero:
Lbcopy_sse3x: // void bcopy(const void *src, void *dst, size_t len)
pushl %ebp // set up a frame for backtraces
movl %esp,%ebp
pushl %esi
pushl %edi
movl 8(%ebp),%esi // get source ptr
movl 12(%ebp),%edi // get dest ptr
movl 16(%ebp),%ecx // get length
movl %edi,%edx
subl %esi,%edx // (dest - source)
cmpl %ecx,%edx // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpl $(kShort),%ecx // long enough to bother with SSE?
jbe Lshort // no
jmp LNotShort
//
// void *memcpy(void *dst, const void *src, size_t len)
// void *memmove(void *dst, const void *src, size_t len)
// NB: These need to be 32 bytes from bcopy():
//
.align 5, 0x90
Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove: // void *memmove(void *dst, const void *src, size_t len)
pushl %ebp // set up a frame for backtraces
movl %esp,%ebp
pushl %esi
pushl %edi
movl 8(%ebp),%edi // get dest ptr
movl 12(%ebp),%esi // get source ptr
movl 16(%ebp),%ecx // get length
movl %edi,%edx
subl %esi,%edx // (dest - source)
cmpl %ecx,%edx // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpl $(kShort),%ecx // long enough to bother with SSE?
ja LNotShort // yes
// Handle short forward copies. As the most common case, this is the fall-through path.
// ecx = length (<= kShort)
// esi = source ptr
// edi = dest ptr
Lshort:
movl %ecx,%edx // copy length
shrl $2,%ecx // get #doublewords
jz LLeftovers
2: // loop copying doublewords
movl (%esi),%eax
addl $4,%esi
movl %eax,(%edi)
addl $4,%edi
dec %ecx
jnz 2b
LLeftovers: // handle leftover bytes (0..3) in last word
andl $3,%edx // any leftover bytes?
jz Lexit
4: // loop copying bytes
movb (%esi),%al
inc %esi
movb %al,(%edi)
inc %edi
dec %edx
jnz 4b
Lexit:
movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove
popl %edi
popl %esi
popl %ebp
ret
LReverseIsland: // keep the "jb" above a short branch...
jmp LReverse // ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
// ecx = length (> kShort)
// esi = source ptr
// edi = dest ptr
LNotShort:
cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops?
movl %edi,%edx // copy destination
jae LVeryLong // use very-long-operand path
negl %edx
andl $15,%edx // get #bytes to align destination
jz LDestAligned // already aligned
subl %edx,%ecx // decrement length
1: // loop copying 1..15 bytes
movb (%esi),%al
inc %esi
movb %al,(%edi)
inc %edi
dec %edx
jnz 1b
// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
// ecx = residual length (0..63)
// edx = -(length to move), a multiple of 64
// esi = ptr to 1st source byte not to move (unaligned)
// edi = ptr to 1st dest byte not to move (aligned)
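/*
 * Informational sketch of the setup performed at LDestAligned, as C-style
 * pseudocode (variable names are illustrative only):
 *
 *	size_t	residual = len & 63;		// -> %ecx, finished later by Lshort
 *	size_t	chunks   = len & ~(size_t)63;	// bytes moved by the 64-byte loops
 *	int	mod      = (long)src & 15;	// selects LMod0..LMod15 via LTable
 *	src += chunks;				// %esi: 1st source byte NOT moved by the loop
 *	dst += chunks;				// %edi: 1st dest byte NOT moved (16-byte aligned)
 *	long	offset   = -(long)chunks;	// %edx: negative index, counts up by 64 to 0
 *	goto *LTable[mod];			// "jmp *%eax" below
 */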
LDestAligned:
movl %ecx,%edx // copy length
movl %esi,%eax // copy source address
andl $63,%ecx // get remaining bytes for Lshort
andl $-64,%edx // get number of bytes we will copy in inner loop
andl $15,%eax // mask to low 4 bits of source address
addl %edx,%esi // point to 1st byte not copied
addl %edx,%edi
negl %edx // now generate offset to 1st byte to be copied
movl (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
jmp *%eax
.align 2
LTable: // table of copy loop addresses
.long LMod0 + _COMM_PAGE_BCOPY - LZero
.long LMod1 + _COMM_PAGE_BCOPY - LZero
.long LMod2 + _COMM_PAGE_BCOPY - LZero
.long LMod3 + _COMM_PAGE_BCOPY - LZero
.long LMod4 + _COMM_PAGE_BCOPY - LZero
.long LMod5 + _COMM_PAGE_BCOPY - LZero
.long LMod6 + _COMM_PAGE_BCOPY - LZero
.long LMod7 + _COMM_PAGE_BCOPY - LZero
.long LMod8 + _COMM_PAGE_BCOPY - LZero
.long LMod9 + _COMM_PAGE_BCOPY - LZero
.long LMod10 + _COMM_PAGE_BCOPY - LZero
.long LMod11 + _COMM_PAGE_BCOPY - LZero
.long LMod12 + _COMM_PAGE_BCOPY - LZero
.long LMod13 + _COMM_PAGE_BCOPY - LZero
.long LMod14 + _COMM_PAGE_BCOPY - LZero
.long LMod15 + _COMM_PAGE_BCOPY - LZero
// Very long forward moves. These are at least kVeryLong bytes (many pages). They are
// special cased and aggressively optimized, not so much because they are common or
// useful, but because they show up in benchmarks. There isn't enough room for them in
// the area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.
LVeryLong:
pushl %ecx // length (>= kVeryLong)
pushl %esi // source ptr
pushl %edi // dest ptr
movl $(_COMM_PAGE_LONGCOPY),%eax
call *%eax // do the long copy
addl $12,%esp // pop off our parameters
jmp Lexit
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do not
// know whether the destination is in cache or not.
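/*
 * Informational sketch of Lfastpath below, as C-style pseudocode.  Note that
 * ORing the chunk count back into the residual works because the chunk count
 * is a multiple of 64 and the residual is 0..63, so their bits are disjoint
 * and OR equals ADD:
 *
 *	src += offset;  dst += offset;			// back to the 1st unmoved byte
 *	size_t remaining = (size_t)(-offset) | residual;	// == chunks + residual
 *	size_t words = remaining >> 2;			// "rep/movsl" moves this many doublewords
 *	// ...then the 0..3 leftover bytes are handled by LLeftovers
 */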
Lfastpath:
addl %edx,%esi // restore ptrs to 1st byte of source and dest
addl %edx,%edi
negl %edx // make length positive
orl %edx,%ecx // restore total #bytes remaining to move
cld // we'll move forward
movl %ecx,%edx // copy total length to move
shrl $2,%ecx // compute #words to move
rep // the u-code will optimize this
movsl
jmp LLeftovers // handle 0..3 leftover bytes
// Forward loop for medium length operands in which low four bits of %esi == 0000
LMod0:
cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
jmp 1f
.align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
movdqa (%esi,%edx),%xmm0
movdqa 16(%esi,%edx),%xmm1
movdqa 32(%esi,%edx),%xmm2
movdqa 48(%esi,%edx),%xmm3
movdqa %xmm0,(%edi,%edx)
movdqa %xmm1,16(%edi,%edx)
movdqa %xmm2,32(%edi,%edx)
movdqa %xmm3,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0001
LMod1:
movdqa -1(%esi,%edx),%xmm0 // prime the loop by loading 1st quadword
1: // loop over 64-byte chunks
movdqa 15(%esi,%edx),%xmm1
movdqa 31(%esi,%edx),%xmm2
movdqa 47(%esi,%edx),%xmm3
movdqa 63(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $1,%xmm2,%xmm3
palignr $1,%xmm1,%xmm2
palignr $1,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
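/*
 * Worked example of the palignr shuffles in LMod1 (informational comment).
 * "palignr $1,%xmm5,%xmm1" concatenates %xmm1 (high) with %xmm5 (low), shifts
 * the 32-byte value right by 1 byte, and keeps the low 16 bytes.  With the
 * aligned loads above (byte offsets relative to the 1st source byte):
 *
 *	%xmm5 = src[-1..14]	(aligned block holding the 1st wanted byte)
 *	%xmm1 = src[15..30]	(next aligned block)
 *	palignr $1,%xmm5,%xmm1  =>  %xmm1 = src[0..15]
 *
 * i.e. each palignr assembles one properly shifted 16-byte group, which is then
 * stored with an aligned movdqa.  The other LModN loops that use palignr differ
 * only in the load offsets and the shift count.
 */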
// Forward loop for medium length operands in which low four bits of %esi == 0010
LMod2:
movdqa -2(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 14(%esi,%edx),%xmm1
movdqa 30(%esi,%edx),%xmm2
movdqa 46(%esi,%edx),%xmm3
movdqa 62(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $2,%xmm2,%xmm3
palignr $2,%xmm1,%xmm2
palignr $2,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0011
LMod3:
movdqa -3(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 13(%esi,%edx),%xmm1
movdqa 29(%esi,%edx),%xmm2
movdqa 45(%esi,%edx),%xmm3
movdqa 61(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $3,%xmm2,%xmm3
palignr $3,%xmm1,%xmm2
palignr $3,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.
LMod4:
movaps -4(%esi,%edx),%xmm0 // 4-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movaps 12(%esi,%edx),%xmm1
movaps 28(%esi,%edx),%xmm2
movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
movaps 44(%esi,%edx),%xmm3
movss %xmm2,%xmm1
pshufd $(0x39),%xmm1,%xmm1
movaps 60(%esi,%edx),%xmm4
movss %xmm3,%xmm2
pshufd $(0x39),%xmm2,%xmm2
movaps %xmm0,(%edi,%edx)
movss %xmm4,%xmm3
pshufd $(0x39),%xmm3,%xmm3
movaps %xmm1,16(%edi,%edx)
movaps %xmm2,32(%edi,%edx)
movaps %xmm4,%xmm0
movaps %xmm3,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
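/*
 * Worked example of the movss/pshufd pairing in LMod4 (informational comment).
 * Dwords are listed low to high; offsets are relative to the 1st source byte,
 * and %xmm0 was primed with the aligned block src[-4..11]:
 *
 *	%xmm1 = { src[12..15], src[16..19], src[20..23], src[24..27] }
 *	movss  %xmm1,%xmm0        =>  %xmm0 = { src[12..15], src[0..3], src[4..7], src[8..11] }
 *	pshufd $0x39,%xmm0,%xmm0  =>  %xmm0 = { src[0..3], src[4..7], src[8..11], src[12..15] }
 *
 * so the store of %xmm0 writes the 16 unaligned source bytes at offset 0 with
 * a single aligned movaps.
 */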
// Forward loop for medium length operands in which low four bits of %esi == 0101
LMod5:
movdqa -5(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 11(%esi,%edx),%xmm1
movdqa 27(%esi,%edx),%xmm2
movdqa 43(%esi,%edx),%xmm3
movdqa 59(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $5,%xmm2,%xmm3
palignr $5,%xmm1,%xmm2
palignr $5,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0110
LMod6:
movdqa -6(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 10(%esi,%edx),%xmm1
movdqa 26(%esi,%edx),%xmm2
movdqa 42(%esi,%edx),%xmm3
movdqa 58(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $6,%xmm2,%xmm3
palignr $6,%xmm1,%xmm2
palignr $6,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 0111
LMod7:
movdqa -7(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 9(%esi,%edx),%xmm1
movdqa 25(%esi,%edx),%xmm2
movdqa 41(%esi,%edx),%xmm3
movdqa 57(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $7,%xmm2,%xmm3
palignr $7,%xmm1,%xmm2
palignr $7,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
LMod8:
cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
movapd -8(%esi,%edx),%xmm0 // 8-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movapd 8(%esi,%edx),%xmm1
movapd 24(%esi,%edx),%xmm2
shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
movapd 40(%esi,%edx),%xmm3
shufpd $01,%xmm2,%xmm1
movapd 56(%esi,%edx),%xmm4
shufpd $01,%xmm3,%xmm2
movapd %xmm0,(%edi,%edx)
shufpd $01,%xmm4,%xmm3
movapd %xmm1,16(%edi,%edx)
movapd %xmm2,32(%edi,%edx)
movapd %xmm4,%xmm0
movapd %xmm3,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
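/*
 * Worked example of the shufpd step in LMod8 (informational comment).  With
 * imm8 = 01, "shufpd $01,%xmm1,%xmm0" sets the low qword of %xmm0 to its own
 * old high qword and the high qword to the low qword of %xmm1.  Offsets are
 * relative to the 1st source byte; %xmm0 was primed with src[-8..7]:
 *
 *	%xmm0 = { src[-8..-1], src[0..7] }	(low qword listed first)
 *	%xmm1 = { src[8..15],  src[16..23] }
 *	shufpd $01,%xmm1,%xmm0  =>  %xmm0 = { src[0..7], src[8..15] }
 *
 * i.e. each shufpd produces one 16-byte group shifted by 8 bytes, stored with
 * an aligned movapd.
 */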
// Forward loop for medium length operands in which low four bits of %esi == 1001
LMod9:
movdqa -9(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 7(%esi,%edx),%xmm1
movdqa 23(%esi,%edx),%xmm2
movdqa 39(%esi,%edx),%xmm3
movdqa 55(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $9,%xmm2,%xmm3
palignr $9,%xmm1,%xmm2
palignr $9,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1010
LMod10:
movdqa -10(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 6(%esi,%edx),%xmm1
movdqa 22(%esi,%edx),%xmm2
movdqa 38(%esi,%edx),%xmm3
movdqa 54(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $10,%xmm2,%xmm3
palignr $10,%xmm1,%xmm2
palignr $10,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1011
LMod11:
movdqa -11(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 5(%esi,%edx),%xmm1
movdqa 21(%esi,%edx),%xmm2
movdqa 37(%esi,%edx),%xmm3
movdqa 53(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $11,%xmm2,%xmm3
palignr $11,%xmm1,%xmm2
palignr $11,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.
LMod12:
movss (%esi,%edx),%xmm0 // prefetch 1st four bytes of source, right justified
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
pshufd $(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
pshufd $(0x93),20(%esi,%edx),%xmm2
pshufd $(0x93),36(%esi,%edx),%xmm3
pshufd $(0x93),52(%esi,%edx),%xmm4
movaps %xmm4,%xmm5
movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
movss %xmm2,%xmm3
movss %xmm1,%xmm2
movss %xmm0,%xmm1
movaps %xmm1,(%edi,%edx)
movaps %xmm2,16(%edi,%edx)
movaps %xmm5,%xmm0
movaps %xmm3,32(%edi,%edx)
movaps %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
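/*
 * Worked example of the pshufd/movss pairing in LMod12 (informational
 * comment).  Dwords are listed low to high; offsets are relative to the 1st
 * source byte, and the low dword of %xmm0 holds src[0..3] from the movss
 * above (or from the previous iteration):
 *
 *	pshufd $0x93,4(%esi,%edx),%xmm1
 *		=>  %xmm1 = { src[16..19], src[4..7], src[8..11], src[12..15] }
 *	movss  %xmm0,%xmm1
 *		=>  %xmm1 = { src[0..3],   src[4..7], src[8..11], src[12..15] }
 *
 * so the aligned load at offset 4 is rotated until only its lowest dword is
 * out of place, and movss drops in the missing four bytes before the aligned
 * store.
 */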
// Forward loop for medium length operands in which low four bits of %esi == 1101
LMod13:
movdqa -13(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 3(%esi,%edx),%xmm1
movdqa 19(%esi,%edx),%xmm2
movdqa 35(%esi,%edx),%xmm3
movdqa 51(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $13,%xmm2,%xmm3
palignr $13,%xmm1,%xmm2
palignr $13,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1110
LMod14:
movdqa -14(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 2(%esi,%edx),%xmm1
movdqa 18(%esi,%edx),%xmm2
movdqa 34(%esi,%edx),%xmm3
movdqa 50(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $14,%xmm2,%xmm3
palignr $14,%xmm1,%xmm2
palignr $14,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %esi == 1111
LMod15:
movdqa -15(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 1(%esi,%edx),%xmm1
movdqa 17(%esi,%edx),%xmm2
movdqa 33(%esi,%edx),%xmm3
movdqa 49(%esi,%edx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $15,%xmm2,%xmm3
palignr $15,%xmm1,%xmm2
palignr $15,%xmm5,%xmm1
movdqa %xmm1,(%edi,%edx)
movdqa %xmm2,16(%edi,%edx)
movdqa %xmm3,32(%edi,%edx)
movdqa %xmm4,48(%edi,%edx)
addl $64,%edx
jnz 1b
jmp Lshort // copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
// ecx = length
// esi = source ptr
// edi = dest ptr
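/*
 * Example of why the reverse path exists (informational comment):
 * bcopy(src=0x1000, dst=0x1004, len=16) overlaps destructively for a forward
 * copy -- the byte at 0x1004 would be overwritten before it is read as a
 * source byte -- so the move must run from the high addresses down.  The
 * unsigned "(dest - source) < length" test at the top catches this case; when
 * the destination is below the source, the subtraction wraps to a very large
 * unsigned value and the (always safe) forward path is taken.
 */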
LReverse:
addl %ecx,%esi // point to end of strings
addl %ecx,%edi
cmpl $(kShort),%ecx // long enough to bother with SSE?
ja LReverseNotShort // yes
// Handle reverse short copies.
// ecx = length
// esi = one byte past end of source
// edi = one byte past end of dest
LReverseShort:
movl %ecx,%edx // copy length
shrl $2,%ecx // #words
jz 3f
1:
subl $4,%esi
movl (%esi),%eax
subl $4,%edi
movl %eax,(%edi)
dec %ecx
jnz 1b
3:
andl $3,%edx // bytes?
jz 5f
4:
dec %esi
movb (%esi),%al
dec %edi
movb %al,(%edi)
dec %edx
jnz 4b
5:
movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove
popl %edi
popl %esi
popl %ebp
ret
// Handle a reverse move long enough to justify using SSE.
// ecx = length
// esi = one byte past end of source
// edi = one byte past end of dest
LReverseNotShort:
movl %edi,%edx // copy destination
andl $15,%edx // get #bytes to align destination
je LReverseDestAligned // already aligned
subl %edx,%ecx // adjust length
1: // loop copying 1..15 bytes
dec %esi
movb (%esi),%al
dec %edi
movb %al,(%edi)
dec %edx
jnz 1b
// Destination is now aligned. Prepare for reverse loops.
LReverseDestAligned:
movl %ecx,%edx // copy length
andl $63,%ecx // get remaining bytes for Lshort
andl $-64,%edx // get number of bytes we will copy in inner loop
subl %edx,%esi // point to endpoint of copy
subl %edx,%edi
testl $15,%esi // is source aligned too?
jnz LReverseUnalignedLoop // no
LReverseAlignedLoop: // loop over 64-byte chunks
movdqa -16(%esi,%edx),%xmm0
movdqa -32(%esi,%edx),%xmm1
movdqa -48(%esi,%edx),%xmm2
movdqa -64(%esi,%edx),%xmm3
movdqa %xmm0,-16(%edi,%edx)
movdqa %xmm1,-32(%edi,%edx)
movdqa %xmm2,-48(%edi,%edx)
movdqa %xmm3,-64(%edi,%edx)
subl $64,%edx
jne LReverseAlignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
LReverseUnalignedLoop: // loop over 64-byte chunks
movdqu -16(%esi,%edx),%xmm0
movdqu -32(%esi,%edx),%xmm1
movdqu -48(%esi,%edx),%xmm2
movdqu -64(%esi,%edx),%xmm3
movdqa %xmm0,-16(%edi,%edx)
movdqa %xmm1,-32(%edi,%edx)
movdqa %xmm2,-48(%edi,%edx)
movdqa %xmm3,-64(%edi,%edx)
subl $64,%edx
jne LReverseUnalignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0)