/*
* Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
* The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
* Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
*
* The following #defines are tightly coupled to the u-architecture:
*/
#define kShort 80 // too short to bother with SSE (must be >=80)
#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
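/*
 * A rough, self-contained C model (hypothetical, for illustration only) of how
 * these cutoffs steer the dispatch below, reusing the three #defines above.
 * The helper bodies are plain byte loops standing in for the tuned paths:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void byte_copy_fwd(uint8_t *d, const uint8_t *s, size_t n) {
 *		while (n--) *d++ = *s++;
 *	}
 *
 *	void bcopy_model(const void *src, void *dst, size_t len) {
 *		uint8_t *d = dst;
 *		const uint8_t *s = src;
 *		if ((uintptr_t)d - (uintptr_t)s < len) {
 *			while (len--) d[len] = s[len]; // destructive overlap: copy in reverse
 *		} else if (len <= kShort) {
 *			byte_copy_fwd(d, s, len);      // short: dword/byte loops
 *		} else if (len >= kVeryLong) {
 *			byte_copy_fwd(d, s, len);      // huge: non-temporal longcopy routine
 *		} else {
 *			byte_copy_fwd(d, s, len);      // medium: 64-byte SSE chunk loops
 *		}                                      // (or rep/movsl above kFastUCode)
 *	}
 */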
// void bcopy(const void *src, void *dst, size_t len)
COMMPAGE_FUNCTION_START(bcopy_sse3x_64, 64, 5)
LZero:
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rsi,%rax // copy dest ptr
movq %rdi,%rsi // exchange source and dest ptrs
movq %rax,%rdi
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
jbe LShort // no
jmp LNotShort
//
// void *memcpy(void *dst, const void *src, size_t len)
// void *memmove(void *dst, const void *src, size_t len)
//
// NB: These need to be 32 bytes from bcopy():
//
.align 5, 0x90
Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove: // void *memmove(void *dst, const void *src, size_t len)
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rdi,%r11 // save return value here
movq %rdi,%rax
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LNotShort // yes
// Handle short forward copies. As the most common case, this is the fall-through path.
// rdx = length (<= kShort)
// rsi = source ptr
// rdi = dest ptr
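/*
 * Hypothetical C equivalent of this short path (illustration only): copy as
 * many 4-byte doublewords as possible, then the 0..3 trailing bytes.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void short_copy(uint8_t *d, const uint8_t *s, uint32_t len) {
 *		for (uint32_t n = len >> 2; n != 0; n--) {   // doubleword loop
 *			uint32_t w;
 *			memcpy(&w, s, 4); s += 4;
 *			memcpy(d, &w, 4); d += 4;
 *		}
 *		for (uint32_t n = len & 3; n != 0; n--)      // leftover bytes
 *			*d++ = *s++;
 *	}
 */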
LShort:
movl %edx,%ecx // copy length using 32-bit operation
shrl $2,%ecx // get #doublewords
jz LLeftovers
2: // loop copying doublewords
movl (%rsi),%eax
addq $4,%rsi
movl %eax,(%rdi)
addq $4,%rdi
decl %ecx
jnz 2b
LLeftovers: // handle leftover bytes (0..3) in last word
andl $3,%edx // any leftover bytes?
jz 5f
4: // loop copying bytes
movb (%rsi),%al
incq %rsi
movb %al,(%rdi)
incq %rdi
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
LReverseIsland: // keep the "jb" above a short branch...
jmp LReverse // ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
// rdx = length (> kShort)
// rsi = source ptr
// rdi = dest ptr
LNotShort:
cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops?
jae LVeryLong // use very-long-operand path
movl %edi,%ecx // copy low half of destination ptr
negl %ecx
andl $15,%ecx // get #bytes to align destination
jz LDestAligned // already aligned
subl %ecx,%edx // decrement length
rep // align destination
movsb
// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
// rdx = residual length (0..63)
// rcx = -(length to move), a multiple of 64 less than 2GB
// rsi = ptr to 1st source byte not to move (unaligned)
// rdi = ptr to 1st dest byte not to move (aligned)
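/*
 * A rough, self-contained C analogue (hypothetical, for illustration only) of
 * this negative-index loop structure: the base pointers are advanced past the
 * chunked region, and a negative offset counts up to zero so the loop test is
 * a single "jnz" on the index register.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void chunk_copy(uint8_t *dst, const uint8_t *src, size_t len) {
 *		long rcx = -(long)(len & ~(size_t)63);   // -(bytes moved in chunks)
 *		const uint8_t *rsi = src - rcx;          // 1st source byte not to move
 *		uint8_t       *rdi = dst - rcx;          // 1st dest byte not to move
 *		while (rcx != 0) {                       // one 64-byte chunk per pass
 *			for (int i = 0; i < 64; i++)
 *				rdi[rcx + i] = rsi[rcx + i];
 *			rcx += 64;
 *		}
 *		// the remaining (len & 63) bytes are handled by the LShort path
 *	}
 */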
LDestAligned:
movq %rdx,%rcx // copy length
movl %esi,%eax // copy low half of source address
andl $63,%edx // get remaining bytes for LShort
andl $15,%eax // mask to low 4 bits of source address
andq $-64,%rcx // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block in
// a 32-bit object file (4586528). Generate the leaq opcode manually.
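// (The bytes 0x4c 0x8d 0x05 + disp32 encode "leaq disp32(%rip),%r8".  The
// displacement is LTable-LRIP because RIP-relative offsets are measured from
// the end of the instruction, which is where LRIP is defined.)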
#if defined(__i386__)
.byte 0x4c
.byte 0x8d
.byte 0x05
.long LTable-LRIP
LRIP:
#elif defined(__x86_64__)
leaq LTable(%rip), %r8
#else
#error Unsupported architecture
#endif
addq %rcx,%rsi // point to 1st byte not copied
addq %rcx,%rdi
movl (%r8,%rax,4),%eax // get offset of routine
negq %rcx // now generate offset to 1st byte to be copied
addq %r8,%rax // generate address of copy loop
jmp *%rax // enter copy loop, selected by source alignment
.align 2
LTable: // table of copy loop addresses
// force generation of assembly-time constants. Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
.set LMod0Offset, LMod0 - LTable
.set LMod1Offset, LMod1 - LTable
.set LMod2Offset, LMod2 - LTable
.set LMod3Offset, LMod3 - LTable
.set LMod4Offset, LMod4 - LTable
.set LMod5Offset, LMod5 - LTable
.set LMod6Offset, LMod6 - LTable
.set LMod7Offset, LMod7 - LTable
.set LMod8Offset, LMod8 - LTable
.set LMod9Offset, LMod9 - LTable
.set LMod10Offset, LMod10 - LTable
.set LMod11Offset, LMod11 - LTable
.set LMod12Offset, LMod12 - LTable
.set LMod13Offset, LMod13 - LTable
.set LMod14Offset, LMod14 - LTable
.set LMod15Offset, LMod15 - LTable
.long LMod0Offset
.long LMod1Offset
.long LMod2Offset
.long LMod3Offset
.long LMod4Offset
.long LMod5Offset
.long LMod6Offset
.long LMod7Offset
.long LMod8Offset
.long LMod9Offset
.long LMod10Offset
.long LMod11Offset
.long LMod12Offset
.long LMod13Offset
.long LMod14Offset
.long LMod15Offset
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
// rdi = dest
// rsi = source
// rdx = length (>= kVeryLong bytes)
LVeryLong:
pushq %r11 // save return value
movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
call *%rax // call very long operand routine
popq %rax // pop return value
popq %rbp
ret
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since
// we do not know if the destination is in cache or not.
Lfastpath:
addq %rcx,%rsi // restore ptrs to 1st byte of source and dest
addq %rcx,%rdi
negl %ecx // make length positive (known to be < 2GB)
orl %edx,%ecx // restore total #bytes remaining to move
cld // we'll move forward
shrl $2,%ecx // compute #words to move
rep // the u-code will optimize this
movsl
jmp LLeftovers // handle 0..3 leftover bytes
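/*
 * The "orl %edx,%ecx" above recombines the chunked byte count with the
 * residual: the former is a multiple of 64 and the latter is 0..63, so their
 * set bits never overlap and OR equals ADD.  A self-contained check
 * (hypothetical, for illustration only):
 *
 *	#include <assert.h>
 *
 *	int main(void) {
 *		for (unsigned len = 80; len < (500*1024); len++) {
 *			unsigned chunks   = len & ~63u;  // multiple of 64
 *			unsigned residual = len &  63u;  // 0..63
 *			assert((chunks | residual) == chunks + residual);
 *		}
 *		return 0;
 *	}
 */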
// Forward loop for medium length operands in which low four bits of %rsi == 0000
LMod0:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
jmp 1f
.align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
movdqa (%rsi,%rcx),%xmm0
movdqa 16(%rsi,%rcx),%xmm1
movdqa 32(%rsi,%rcx),%xmm2
movdqa 48(%rsi,%rcx),%xmm3
movdqa %xmm0,(%rdi,%rcx)
movdqa %xmm1,16(%rdi,%rcx)
movdqa %xmm2,32(%rdi,%rcx)
movdqa %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
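/*
 * The loops below for source alignments 1..15 (other than the movss/shufpd
 * variants at LMod4, LMod8 and LMod12) keep every vector load and store
 * 16-byte aligned and splice adjacent aligned loads with "palignr".  A rough,
 * self-contained C model of the instruction (hypothetical, illustration only):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// palignr $shift, src, dst : dst = low 16 bytes of (dst:src) >> (shift*8)
 *	static void palignr_model(uint8_t dst[16], const uint8_t src[16], int shift) {
 *		uint8_t cat[32];
 *		memcpy(cat, src, 16);        // low half  = source operand
 *		memcpy(cat + 16, dst, 16);   // high half = destination operand
 *		memcpy(dst, cat + shift, 16);
 *	}
 *
 *	// e.g. in LMod1, an aligned load covering source bytes [-1..14] spliced
 *	// (shift = 1) with the next aligned load covering [15..30] yields the
 *	// unaligned bytes [0..15], which are then stored with an aligned movdqa.
 */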
// Forward loop for medium length operands in which low four bits of %rsi == 0001
LMod1:
movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st quadword
1: // loop over 64-byte chunks
movdqa 15(%rsi,%rcx),%xmm1
movdqa 31(%rsi,%rcx),%xmm2
movdqa 47(%rsi,%rcx),%xmm3
movdqa 63(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $1,%xmm2,%xmm3
palignr $1,%xmm1,%xmm2
palignr $1,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0010
LMod2:
movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 14(%rsi,%rcx),%xmm1
movdqa 30(%rsi,%rcx),%xmm2
movdqa 46(%rsi,%rcx),%xmm3
movdqa 62(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $2,%xmm2,%xmm3
palignr $2,%xmm1,%xmm2
palignr $2,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0011
LMod3:
movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 13(%rsi,%rcx),%xmm1
movdqa 29(%rsi,%rcx),%xmm2
movdqa 45(%rsi,%rcx),%xmm3
movdqa 61(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $3,%xmm2,%xmm3
palignr $3,%xmm1,%xmm2
palignr $3,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the single-precision float data type in order to use "movss" to merge vectors.
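/*
 * Hypothetical C model (illustration only) of the movss/pshufd trick used
 * below: merge the low dword of the next aligned load, then rotate the dwords
 * right by one to recover the four unaligned source dwords in order.
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint32_t d[4]; } vec4;   // d[0] = lowest address
 *
 *	static vec4 merge_rotate(vec4 lo, vec4 hi) {  // lo = load at -4, hi = load at +12
 *		lo.d[0] = hi.d[0];                    // movss: copy low 4 bytes
 *		vec4 r = { { lo.d[1], lo.d[2], lo.d[3], lo.d[0] } };  // pshufd $0x39
 *		return r;                             // = unaligned source dwords 0..3
 *	}
 */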
LMod4:
movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movaps 12(%rsi,%rcx),%xmm1
movaps 28(%rsi,%rcx),%xmm2
movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
movaps 44(%rsi,%rcx),%xmm3
movss %xmm2,%xmm1
pshufd $(0x39),%xmm1,%xmm1
movaps 60(%rsi,%rcx),%xmm4
movss %xmm3,%xmm2
pshufd $(0x39),%xmm2,%xmm2
movaps %xmm0,(%rdi,%rcx)
movss %xmm4,%xmm3
pshufd $(0x39),%xmm3,%xmm3
movaps %xmm1,16(%rdi,%rcx)
movaps %xmm2,32(%rdi,%rcx)
movaps %xmm4,%xmm0
movaps %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0101
LMod5:
movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 11(%rsi,%rcx),%xmm1
movdqa 27(%rsi,%rcx),%xmm2
movdqa 43(%rsi,%rcx),%xmm3
movdqa 59(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $5,%xmm2,%xmm3
palignr $5,%xmm1,%xmm2
palignr $5,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0110
LMod6:
movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 10(%rsi,%rcx),%xmm1
movdqa 26(%rsi,%rcx),%xmm2
movdqa 42(%rsi,%rcx),%xmm3
movdqa 58(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $6,%xmm2,%xmm3
palignr $6,%xmm1,%xmm2
palignr $6,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0111
LMod7:
movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 9(%rsi,%rcx),%xmm1
movdqa 25(%rsi,%rcx),%xmm2
movdqa 41(%rsi,%rcx),%xmm3
movdqa 57(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $7,%xmm2,%xmm3
palignr $7,%xmm1,%xmm2
palignr $7,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the double-precision float data type in order to use "shufpd" to shift by 8 bytes.
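/*
 * Hypothetical C model (illustration only) of the "shufpd $01" splice used
 * below: it behaves like an 8-byte palignr, pairing the high quadword of one
 * aligned load with the low quadword of the next.
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint64_t q[2]; } vec2;   // q[0] = lower address
 *
 *	static vec2 shufpd_01(vec2 lo, vec2 hi) { // lo = load at -8, hi = load at +8
 *		vec2 r = { { lo.q[1], hi.q[0] } };// = the two unaligned source qwords
 *		return r;
 *	}
 */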
LMod8:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movapd 8(%rsi,%rcx),%xmm1
movapd 24(%rsi,%rcx),%xmm2
shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
movapd 40(%rsi,%rcx),%xmm3
shufpd $01,%xmm2,%xmm1
movapd 56(%rsi,%rcx),%xmm4
shufpd $01,%xmm3,%xmm2
movapd %xmm0,(%rdi,%rcx)
shufpd $01,%xmm4,%xmm3
movapd %xmm1,16(%rdi,%rcx)
movapd %xmm2,32(%rdi,%rcx)
movapd %xmm4,%xmm0
movapd %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1001
LMod9:
movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 7(%rsi,%rcx),%xmm1
movdqa 23(%rsi,%rcx),%xmm2
movdqa 39(%rsi,%rcx),%xmm3
movdqa 55(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $9,%xmm2,%xmm3
palignr $9,%xmm1,%xmm2
palignr $9,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1010
LMod10:
movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 6(%rsi,%rcx),%xmm1
movdqa 22(%rsi,%rcx),%xmm2
movdqa 38(%rsi,%rcx),%xmm3
movdqa 54(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $10,%xmm2,%xmm3
palignr $10,%xmm1,%xmm2
palignr $10,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1011
LMod11:
movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 5(%rsi,%rcx),%xmm1
movdqa 21(%rsi,%rcx),%xmm2
movdqa 37(%rsi,%rcx),%xmm3
movdqa 53(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $11,%xmm2,%xmm3
palignr $11,%xmm1,%xmm2
palignr $11,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the single-precision float data type in order to use "movss" to merge vectors.
LMod12:
movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
pshufd $(0x93),20(%rsi,%rcx),%xmm2
pshufd $(0x93),36(%rsi,%rcx),%xmm3
pshufd $(0x93),52(%rsi,%rcx),%xmm4
movaps %xmm4,%xmm5
movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
movss %xmm2,%xmm3
movss %xmm1,%xmm2
movss %xmm0,%xmm1
movaps %xmm1,(%rdi,%rcx)
movaps %xmm2,16(%rdi,%rcx)
movaps %xmm5,%xmm0
movaps %xmm3,32(%rdi,%rcx)
movaps %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1101
LMod13:
movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 3(%rsi,%rcx),%xmm1
movdqa 19(%rsi,%rcx),%xmm2
movdqa 35(%rsi,%rcx),%xmm3
movdqa 51(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $13,%xmm2,%xmm3
palignr $13,%xmm1,%xmm2
palignr $13,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1110
LMod14:
movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 2(%rsi,%rcx),%xmm1
movdqa 18(%rsi,%rcx),%xmm2
movdqa 34(%rsi,%rcx),%xmm3
movdqa 50(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $14,%xmm2,%xmm3
palignr $14,%xmm1,%xmm2
palignr $14,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1111
LMod15:
movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 1(%rsi,%rcx),%xmm1
movdqa 17(%rsi,%rcx),%xmm2
movdqa 33(%rsi,%rcx),%xmm3
movdqa 49(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $15,%xmm2,%xmm3
palignr $15,%xmm1,%xmm2
palignr $15,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
// rdx = length
// rsi = source ptr
// rdi = dest ptr
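/*
 * Hypothetical C outline of the reverse path (illustration only): both
 * pointers are first advanced one past the end, then bytes are copied from
 * high addresses toward low, so overlapping source data is read before it is
 * overwritten.  The plain byte loop stands in for the tuned loops below.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void reverse_copy(uint8_t *dst, const uint8_t *src, size_t len) {
 *		const uint8_t *s = src + len;   // one past end of source
 *		uint8_t       *d = dst + len;   // one past end of dest
 *		while (len--) *--d = *--s;
 *	}
 */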
LReverse:
addq %rdx,%rsi // point to end of strings
addq %rdx,%rdi
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LReverseNotShort // yes
// Handle reverse short copies.
// edx = length (<= kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseShort:
movl %edx,%ecx // copy length
shrl $3,%ecx // #quadwords
jz 3f
1:
subq $8,%rsi
movq (%rsi),%rax
subq $8,%rdi
movq %rax,(%rdi)
decl %ecx
jnz 1b
3:
andl $7,%edx // bytes?
jz 5f
4:
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
// Handle a reverse move long enough to justify using SSE.
// rdx = length (> kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseNotShort:
movl %edi,%ecx // copy destination
andl $15,%ecx // get #bytes to align destination
je LReverseDestAligned // already aligned
subq %rcx,%rdx // adjust length
1: // loop copying 1..15 bytes
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %ecx
jnz 1b
// Destination is now aligned. Prepare for reverse loops.
LReverseDestAligned:
movq %rdx,%rcx // copy length
andl $63,%edx // get remaining bytes for LReverseShort
andq $-64,%rcx // get number of bytes we will copy in inner loop
subq %rcx,%rsi // point to endpoint of copy
subq %rcx,%rdi
testl $15,%esi // is source aligned too?
jnz LReverseUnalignedLoop // no
LReverseAlignedLoop: // loop over 64-byte chunks
movdqa -16(%rsi,%rcx),%xmm0
movdqa -32(%rsi,%rcx),%xmm1
movdqa -48(%rsi,%rcx),%xmm2
movdqa -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseAlignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
LReverseUnalignedLoop: // loop over 64-byte chunks
movdqu -16(%rsi,%rcx),%xmm0
movdqu -32(%rsi,%rcx),%xmm1
movdqu -48(%rsi,%rcx),%xmm2
movdqu -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseUnalignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)