/*
* Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* The contents of this file constitute Original Code as defined in and
* are subject to the Apple Public Source License Version 1.1 (the
* "License"). You may not use this file except in compliance with the
* License. Please obtain a copy of the License at
* http://www.apple.com/publicsource and read it before using this file.
*
* This Original Code and all software distributed under the License are
* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
* License for the specific language governing rights and limitations
* under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
* The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
* Supplemental SSE3 (called "SSE4" in the names used here; "palignr" is required)
* and 64-byte cache lines. This is the 64-bit version.
*
* The following #defines are tightly coupled to the u-architecture:
*/
#define kShort 80 // too short to bother with SSE (must be >=80)
#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
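//
// A rough C-level sketch of how these cutoffs steer a copy (illustrative only; all
// names are hypothetical, the real dispatch is the assembly below, and the kFastUCode
// test is only made once the destination is 16-byte aligned and the source is 0 or 8
// mod 16):
//
//     void copy_dispatch(void *dst, const void *src, size_t len) {
//         if (len <= kShort)
//             copy_short(dst, src, len);          // 4-byte/1-byte loops (LShort)
//         else if (len >= kVeryLong)
//             copy_longcopy(dst, src, len);       // non-temporal path (LVeryLong)
//         else if (len >= kFastUCode && nicely_aligned(dst, src))
//             copy_rep_movsl(dst, src, len);      // microcode fastpath (Lfastpath)
//         else
//             copy_sse_chunks(dst, src, len);     // 64-byte SSE loops (LMod0..LMod15)
//     }
//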
// void bcopy(const void *src, void *dst, size_t len)

.text
.code64
.align 5, 0x90
LZero:
Lbcopy_sse4_64: // void bcopy(const void *src, void *dst, size_t len)
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rsi,%rax // copy dest ptr
movq %rdi,%rsi // xchange source and dest ptrs
movq %rax,%rdi
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
jbe LShort // no
jmp LNotShort
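//
// Note on the exchange above: bcopy() takes (src, dst, len) while memcpy()/memmove()
// take (dst, src, len), so this entry swaps %rdi/%rsi and then shares the memmove
// body. A C-level equivalent (illustrative only):
//
//     void bcopy(const void *src, void *dst, size_t len) {
//         memmove(dst, src, len);     // same overlap-safe semantics as the code above
//     }
//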
//
// void *memcpy(void *dst, const void *src, size_t len)
// void *memmove(void *dst, const void *src, size_t len)
//
// NB: These need to be 32 bytes from bcopy():
//
.align 5, 0x90
Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove: // void *memmove(void *dst, const void *src, size_t len)
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rdi,%r11 // save return value here
movq %rdi,%rax
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LNotShort // yes
// Handle short forward copies. As the most common case, this is the fall-through path.
// rdx = length (<= kShort)
// rsi = source ptr
// rdi = dest ptr
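//
// C-level equivalent of the short path below (illustrative only; the 32-bit loads
// and stores may be unaligned, which the hardware handles):
//
//     void copy_short(unsigned char *d, const unsigned char *s, size_t len) {
//         size_t words = len >> 2;                // whole 32-bit words
//         while (words--) {
//             d[0]=s[0]; d[1]=s[1]; d[2]=s[2]; d[3]=s[3];
//             d += 4; s += 4;
//         }
//         len &= 3;                               // 0..3 leftover bytes (LLeftovers)
//         while (len--) *d++ = *s++;
//     }
//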
LShort:
movl %edx,%ecx // copy length using 32-bit operation
shrl $2,%ecx // get #doublewords
jz LLeftovers
2: // loop copying doublewords
movl (%rsi),%eax
addq $4,%rsi
movl %eax,(%rdi)
addq $4,%rdi
decl %ecx
jnz 2b
LLeftovers: // handle leftover bytes (0..3) in last word
andl $3,%edx // any leftover bytes?
jz 5f
4: // loop copying bytes
movb (%rsi),%al
incq %rsi
movb %al,(%rdi)
incq %rdi
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
LReverseIsland: // keep the "jb" above a short branch...
jmp LReverse // ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
// rdx = length (> kShort)
// rsi = source ptr
// rdi = dest ptr
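//
// The destination-alignment computation below in C (illustrative only): the number
// of bytes needed to reach the next 16-byte boundary is (-dst) & 15, which is 0 when
// the destination is already aligned; "rep/movsb" then copies that many head bytes.
//
//     size_t head = (size_t)(-(uintptr_t)dst) & 15;
//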
LNotShort:
cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops?
jae LVeryLong // use very-long-operand path
movl %edi,%ecx // copy low half of destination ptr
negl %ecx
andl $15,%ecx // get #bytes to align destination
jz LDestAligned // already aligned
subl %ecx,%edx // decrement length
rep // align destination
movsb
// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
// rdx = residual length (0..63)
// rcx = -(length to move), a multiple of 64 less than 2GB
// rsi = ptr to 1st source byte not to move (unaligned)
// rdi = ptr to 1st dest byte not to move (aligned)
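//
// Every loop below uses the same negative-index idiom, sketched in C (illustrative
// only): both pointers are advanced past the chunked region and a negative byte
// offset counts up to zero, so the single "addq $64,%rcx" both advances and tests.
//
//     void copy_chunks(unsigned char *d_end, const unsigned char *s_end, long chunked) {
//         // d_end/s_end point to the 1st byte not moved; chunked is a multiple of 64
//         for (long i = -chunked; i != 0; i += 64)     // i plays the role of %rcx
//             memcpy(d_end + i, s_end + i, 64);        // one 64-byte chunk (4 vectors)
//     }
//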
LDestAligned:
movl %edx,%ecx // copy length
movl %esi,%eax // copy low half of source address
andl $63,%edx // get remaining bytes for LShort
andl $15,%eax // mask to low 4 bits of source address
andl $-64,%ecx // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
// lea LTable(%rip),%r8 // point to dispatch table
movq $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8 // work around 4586528
addq $(LTable-LZero),%r8 // work around 4586528
addq %rcx,%rsi // point to 1st byte not copied
addq %rcx,%rdi
movl (%r8,%rax,4),%eax // get offset of routine
negq %rcx // now generate offset to 1st byte to be copied
addq %r8,%rax // generate address of copy loop
jmp *%rax // enter copy loop, selected by source alignment
.align 2
LTable: // table of copy loop addresses
.long (LMod0 - LTable)
.long (LMod1 - LTable)
.long (LMod2 - LTable)
.long (LMod3 - LTable)
.long (LMod4 - LTable)
.long (LMod5 - LTable)
.long (LMod6 - LTable)
.long (LMod7 - LTable)
.long (LMod8 - LTable)
.long (LMod9 - LTable)
.long (LMod10 - LTable)
.long (LMod11 - LTable)
.long (LMod12 - LTable)
.long (LMod13 - LTable)
.long (LMod14 - LTable)
.long (LMod15 - LTable)
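//
// The table above stores each loop's offset from LTable rather than an absolute
// address, which keeps the commpage copy position-independent. A C-level sketch of
// the dispatch above (illustrative only; names are hypothetical):
//
//     extern const int LTable_offsets[16];        // .long (LModN - LTable)
//     void dispatch(unsigned src_low4) {
//         char *base = (char *)LTable_offsets;    // %r8: runtime address of LTable
//         void (*loop)(void) = (void (*)(void))(base + LTable_offsets[src_low4]);
//         loop();                                 // jmp *%rax
//     }
//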
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
// rdi = dest
// rsi = source
// rdx = length (>= kVeryLong bytes)
LVeryLong:
pushq %r11 // save return value
movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
call *%rax // call very long operand routine
popq %rax // pop return value
popq %rbp
ret
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know whether the destination is in cache or not.
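//
// A note on the count recombination below (illustrative C, writing the registers as
// variables): %rcx holds -(chunked length), a multiple of 64, and %rdx holds the
// 0..63 residual, so after negation an "or" touches disjoint bits and equals addition:
//
//     size_t chunked  = -rcx;                     // multiple of 64
//     size_t residual = rdx;                      // 0..63
//     size_t total    = chunked | residual;       // == chunked + residual
//     size_t words    = total >> 2;               // "rep/movsl" moves this many 4-byte words
//                                                 // LLeftovers then copies total & 3 bytes
//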
Lfastpath:
addq %rcx,%rsi // restore ptrs to 1st byte of source and dest
addq %rcx,%rdi
negl %ecx // make length positive (known to be < 2GB)
orl %edx,%ecx // restore total #bytes remaining to move
cld // we'll move forward
shrl $2,%ecx // compute #words to move
rep // the u-code will optimize this
movsl
jmp LLeftovers // handle 0..3 leftover bytes
// Forward loop for medium length operands in which low four bits of %rsi == 0000
LMod0:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
jmp 1f
.align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
movdqa (%rsi,%rcx),%xmm0
movdqa 16(%rsi,%rcx),%xmm1
movdqa 32(%rsi,%rcx),%xmm2
movdqa 48(%rsi,%rcx),%xmm3
movdqa %xmm0,(%rdi,%rcx)
movdqa %xmm1,16(%rdi,%rcx)
movdqa %xmm2,32(%rdi,%rcx)
movdqa %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
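//
// The LMod1..LMod15 loops below all follow the same pattern: every load and store is
// 16-byte aligned, and "palignr $N" splices adjacent aligned vectors back into the
// misaligned source stream. A C-level model of palignr (illustrative only):
//
//     // palignr $n, %xmm_src, %xmm_dst  =>  dst <- low 16 bytes of ((dst:src) >> n*8)
//     void palignr_model(unsigned char r[16], const unsigned char dst[16],
//                        const unsigned char src[16], int n) {
//         unsigned char cat[32];
//         memcpy(cat, src, 16);                   // low half  = source operand
//         memcpy(cat + 16, dst, 16);              // high half = destination operand
//         memcpy(r, cat + n, 16);                 // bytes n..n+15 of the concatenation
//     }
//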
// Forward loop for medium length operands in which low four bits of %rsi == 0001
LMod1:
movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st quadword
1: // loop over 64-byte chunks
movdqa 15(%rsi,%rcx),%xmm1
movdqa 31(%rsi,%rcx),%xmm2
movdqa 47(%rsi,%rcx),%xmm3
movdqa 63(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $1,%xmm2,%xmm3
palignr $1,%xmm1,%xmm2
palignr $1,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0010
LMod2:
movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 14(%rsi,%rcx),%xmm1
movdqa 30(%rsi,%rcx),%xmm2
movdqa 46(%rsi,%rcx),%xmm3
movdqa 62(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $2,%xmm2,%xmm3
palignr $2,%xmm1,%xmm2
palignr $2,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0011
LMod3:
movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 13(%rsi,%rcx),%xmm1
movdqa 29(%rsi,%rcx),%xmm2
movdqa 45(%rsi,%rcx),%xmm3
movdqa 61(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $3,%xmm2,%xmm3
palignr $3,%xmm1,%xmm2
palignr $3,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the single-precision float data type so that we can use "movss" to merge vectors.
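//
// The merge-and-rotate idiom below, modeled at the 32-bit-word level (illustrative
// only): with the source 4-byte aligned, each output vector is three words from one
// aligned load plus the first word of the next; "movss" patches that word in and
// "pshufd $0x39" rotates it into place.
//
//     void merge_rotate(unsigned out[4], const unsigned prev[4], unsigned next0) {
//         unsigned t[4] = { next0, prev[1], prev[2], prev[3] };   // movss
//         out[0]=t[1]; out[1]=t[2]; out[2]=t[3]; out[3]=t[0];     // pshufd $0x39
//     }
//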
LMod4:
movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movaps 12(%rsi,%rcx),%xmm1
movaps 28(%rsi,%rcx),%xmm2
movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
movaps 44(%rsi,%rcx),%xmm3
movss %xmm2,%xmm1
pshufd $(0x39),%xmm1,%xmm1
movaps 60(%rsi,%rcx),%xmm4
movss %xmm3,%xmm2
pshufd $(0x39),%xmm2,%xmm2
movaps %xmm0,(%rdi,%rcx)
movss %xmm4,%xmm3
pshufd $(0x39),%xmm3,%xmm3
movaps %xmm1,16(%rdi,%rcx)
movaps %xmm2,32(%rdi,%rcx)
movaps %xmm4,%xmm0
movaps %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0101
LMod5:
movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 11(%rsi,%rcx),%xmm1
movdqa 27(%rsi,%rcx),%xmm2
movdqa 43(%rsi,%rcx),%xmm3
movdqa 59(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $5,%xmm2,%xmm3
palignr $5,%xmm1,%xmm2
palignr $5,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0110
LMod6:
movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 10(%rsi,%rcx),%xmm1
movdqa 26(%rsi,%rcx),%xmm2
movdqa 42(%rsi,%rcx),%xmm3
movdqa 58(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $6,%xmm2,%xmm3
palignr $6,%xmm1,%xmm2
palignr $6,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0111
LMod7:
movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 9(%rsi,%rcx),%xmm1
movdqa 25(%rsi,%rcx),%xmm2
movdqa 41(%rsi,%rcx),%xmm3
movdqa 57(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $7,%xmm2,%xmm3
palignr $7,%xmm1,%xmm2
palignr $7,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the double-precision float data type so that we can use "shufpd" to shift by 8 bytes.
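//
// The 8-byte splice below at the quadword level (illustrative only): the source is
// 8-byte aligned, so each output vector is the high quadword of one aligned load
// followed by the low quadword of the next; "shufpd $01" builds exactly that.
//
//     void splice8(unsigned long long out[2], const unsigned long long prev[2],
//                  const unsigned long long next[2]) {
//         out[0] = prev[1];     // high qword of the earlier aligned load
//         out[1] = next[0];     // low  qword of the following aligned load
//     }
//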
LMod8:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movapd 8(%rsi,%rcx),%xmm1
movapd 24(%rsi,%rcx),%xmm2
shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
movapd 40(%rsi,%rcx),%xmm3
shufpd $01,%xmm2,%xmm1
movapd 56(%rsi,%rcx),%xmm4
shufpd $01,%xmm3,%xmm2
movapd %xmm0,(%rdi,%rcx)
shufpd $01,%xmm4,%xmm3
movapd %xmm1,16(%rdi,%rcx)
movapd %xmm2,32(%rdi,%rcx)
movapd %xmm4,%xmm0
movapd %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1001
LMod9:
movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 7(%rsi,%rcx),%xmm1
movdqa 23(%rsi,%rcx),%xmm2
movdqa 39(%rsi,%rcx),%xmm3
movdqa 55(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $9,%xmm2,%xmm3
palignr $9,%xmm1,%xmm2
palignr $9,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1010
LMod10:
movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 6(%rsi,%rcx),%xmm1
movdqa 22(%rsi,%rcx),%xmm2
movdqa 38(%rsi,%rcx),%xmm3
movdqa 54(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $10,%xmm2,%xmm3
palignr $10,%xmm1,%xmm2
palignr $10,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1011
LMod11:
movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 5(%rsi,%rcx),%xmm1
movdqa 21(%rsi,%rcx),%xmm2
movdqa 37(%rsi,%rcx),%xmm3
movdqa 53(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $11,%xmm2,%xmm3
palignr $11,%xmm1,%xmm2
palignr $11,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the single-precision float data type so that we can use "movss" to merge vectors.
LMod12:
movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
pshufd $(0x93),20(%rsi,%rcx),%xmm2
pshufd $(0x93),36(%rsi,%rcx),%xmm3
pshufd $(0x93),52(%rsi,%rcx),%xmm4
movaps %xmm4,%xmm5
movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
movss %xmm2,%xmm3
movss %xmm1,%xmm2
movss %xmm0,%xmm1
movaps %xmm1,(%rdi,%rcx)
movaps %xmm2,16(%rdi,%rcx)
movaps %xmm5,%xmm0
movaps %xmm3,32(%rdi,%rcx)
movaps %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1101
LMod13:
movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 3(%rsi,%rcx),%xmm1
movdqa 19(%rsi,%rcx),%xmm2
movdqa 35(%rsi,%rcx),%xmm3
movdqa 51(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $13,%xmm2,%xmm3
palignr $13,%xmm1,%xmm2
palignr $13,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1110
LMod14:
movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 2(%rsi,%rcx),%xmm1
movdqa 18(%rsi,%rcx),%xmm2
movdqa 34(%rsi,%rcx),%xmm3
movdqa 50(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $14,%xmm2,%xmm3
palignr $14,%xmm1,%xmm2
palignr $14,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1111
LMod15:
movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 1(%rsi,%rcx),%xmm1
movdqa 17(%rsi,%rcx),%xmm2
movdqa 33(%rsi,%rcx),%xmm3
movdqa 49(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $15,%xmm2,%xmm3
palignr $15,%xmm1,%xmm2
palignr $15,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
// rdx = length
// rsi = source ptr
// rdi = dest ptr
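//
// C-level sketch of why this path exists (illustrative only): when the destination
// starts inside the source (dst - src < len), a forward copy would overwrite source
// bytes before they are read, so the copy runs from the end toward the start:
//
//     void copy_reverse(unsigned char *d, const unsigned char *s, size_t len) {
//         d += len; s += len;                     // point one past the ends, as below
//         while (len--) *--d = *--s;              // (the real code moves 8/16 bytes at a time)
//     }
//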
LReverse:
addq %rdx,%rsi // point to end of strings
addq %rdx,%rdi
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LReverseNotShort // yes
// Handle reverse short copies.
// edx = length (<= kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseShort:
movl %edx,%ecx // copy length
shrl $3,%ecx // #quadwords
jz 3f
1:
subq $8,%rsi
movq (%rsi),%rax
subq $8,%rdi
movq %rax,(%rdi)
decl %ecx
jnz 1b
3:
andl $7,%edx // bytes?
jz 5f
4:
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
// Handle a reverse move long enough to justify using SSE.
// rdx = length (> kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseNotShort:
movl %edi,%ecx // copy destination
andl $15,%ecx // get #bytes to align destination
je LReverseDestAligned // already aligned
subq %rcx,%rdx // adjust length
1: // loop copying 1..15 bytes
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %ecx
jnz 1b
// Destination is now aligned. Prepare for reverse loops.
LReverseDestAligned:
movq %rdx,%rcx // copy length
andl $63,%edx // get remaining bytes for LReverseShort
andq $-64,%rcx // get number of bytes we will copy in inner loop
subq %rcx,%rsi // point to endpoint of copy
subq %rcx,%rdi
testl $15,%esi // is source aligned too?
jnz LReverseUnalignedLoop // no
LReverseAlignedLoop: // loop over 64-byte chunks
movdqa -16(%rsi,%rcx),%xmm0
movdqa -32(%rsi,%rcx),%xmm1
movdqa -48(%rsi,%rcx),%xmm2
movdqa -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseAlignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
LReverseUnalignedLoop: // loop over 64-byte chunks
movdqu -16(%rsi,%rcx),%xmm0
movdqu -32(%rsi,%rcx),%xmm1
movdqu -48(%rsi,%rcx),%xmm2
movdqu -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseUnalignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)