/*
* Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
* The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
* Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
*
* The following #defines are tightly coupled to the u-architecture:
*/
#define kShort 80 // too short to bother with SSE (must be >=80)
#define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
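/*
 * A rough, self-contained C model (hypothetical, for illustration only) of how
 * these cutoffs steer the dispatch below, reusing the three #defines above.
 * The helper bodies are plain byte loops standing in for the tuned paths:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void byte_copy_fwd(uint8_t *d, const uint8_t *s, size_t n) {
 *		while (n--) *d++ = *s++;
 *	}
 *
 *	void bcopy_model(const void *src, void *dst, size_t len) {
 *		uint8_t *d = dst;
 *		const uint8_t *s = src;
 *		if ((uintptr_t)d - (uintptr_t)s < len) {
 *			while (len--) d[len] = s[len]; // destructive overlap: copy in reverse
 *		} else if (len <= kShort) {
 *			byte_copy_fwd(d, s, len);      // short: dword/byte loops
 *		} else if (len >= kVeryLong) {
 *			byte_copy_fwd(d, s, len);      // huge: non-temporal longcopy routine
 *		} else {
 *			byte_copy_fwd(d, s, len);      // medium: 64-byte SSE chunk loops
 *		}                                      // (or rep/movsl above kFastUCode)
 *	}
 */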
// void bcopy(const void *src, void *dst, size_t len)
COMMPAGE_FUNCTION_START(bcopy_sse3x_64, 64, 5)
LZero:
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rsi,%rax // copy dest ptr
movq %rdi,%rsi // exchange source and dest ptrs
movq %rax,%rdi
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
jbe LShort // no
jmp LNotShort
//
// void *memcpy(void *dst, const void *src, size_t len)
// void *memmove(void *dst, const void *src, size_t len)
//
// NB: These need to be 32 bytes from bcopy():
//
.align 5, 0x90
Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove: // void *memmove(void *dst, const void *src, size_t len)
pushq %rbp // set up a frame for backtraces
movq %rsp,%rbp
movq %rdi,%r11 // save return value here
movq %rdi,%rax
subq %rsi,%rax // (dest - source)
cmpq %rdx,%rax // must move in reverse if (dest - source) < length
jb LReverseIsland
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LNotShort // yes
// Handle short forward copies. As the most common case, this is the fall-through path.
// rdx = length (<= kShort)
// rsi = source ptr
// rdi = dest ptr
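/*
 * Hypothetical C equivalent of this short path (illustration only): copy as
 * many 4-byte doublewords as possible, then the 0..3 trailing bytes.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void short_copy(uint8_t *d, const uint8_t *s, uint32_t len) {
 *		for (uint32_t n = len >> 2; n != 0; n--) {   // doubleword loop
 *			uint32_t w;
 *			memcpy(&w, s, 4); s += 4;
 *			memcpy(d, &w, 4); d += 4;
 *		}
 *		for (uint32_t n = len & 3; n != 0; n--)      // leftover bytes
 *			*d++ = *s++;
 *	}
 */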
LShort:
movl %edx,%ecx // copy length using 32-bit operation
shrl $2,%ecx // get #doublewords
jz LLeftovers
2: // loop copying doublewords
movl (%rsi),%eax
addq $4,%rsi
movl %eax,(%rdi)
addq $4,%rdi
decl %ecx
jnz 2b
LLeftovers: // handle leftover bytes (0..3) in last word
andl $3,%edx // any leftover bytes?
jz 5f
4: // loop copying bytes
movb (%rsi),%al
incq %rsi
movb %al,(%rdi)
incq %rdi
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
LReverseIsland: // keep the "jb" above a short branch...
jmp LReverse // ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
// rdx = length (> kShort)
// rsi = source ptr
// rdi = dest ptr
LNotShort:
cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops?
jae LVeryLong // use very-long-operand path
movl %edi,%ecx // copy low half of destination ptr
negl %ecx
andl $15,%ecx // get #bytes to align destination
jz LDestAligned // already aligned
subl %ecx,%edx // decrement length
rep // align destination
movsb
// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
// rdx = residual length (0..63)
// rcx = -(length to move), a multiple of 64 less than 2GB
// rsi = ptr to 1st source byte not to move (unaligned)
// rdi = ptr to 1st dest byte not to move (aligned)
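/*
 * A rough, self-contained C analogue (hypothetical, for illustration only) of
 * this negative-index loop structure: the base pointers are advanced past the
 * chunked region, and a negative offset counts up to zero so the loop test is
 * a single "jnz" on the index register.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void chunk_copy(uint8_t *dst, const uint8_t *src, size_t len) {
 *		long rcx = -(long)(len & ~(size_t)63);   // -(bytes moved in chunks)
 *		const uint8_t *rsi = src - rcx;          // 1st source byte not to move
 *		uint8_t       *rdi = dst - rcx;          // 1st dest byte not to move
 *		while (rcx != 0) {                       // one 64-byte chunk per pass
 *			for (int i = 0; i < 64; i++)
 *				rdi[rcx + i] = rsi[rcx + i];
 *			rcx += 64;
 *		}
 *		// the remaining (len & 63) bytes are handled by the LShort path
 *	}
 */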
LDestAligned:
movq %rdx,%rcx // copy length
movl %esi,%eax // copy low half of source address
andl $63,%edx // get remaining bytes for LShort
andl $15,%eax // mask to low 4 bits of source address
andq $-64,%rcx // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block in
// a 32-bit object file (4586528). Generate the leaq opcode manually.
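// (The bytes 0x4c 0x8d 0x05 + disp32 encode "leaq disp32(%rip),%r8".  The
// displacement is LTable-LRIP because RIP-relative offsets are measured from
// the end of the instruction, which is where LRIP is defined.)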
#if defined(__i386__)
.byte 0x4c
.byte 0x8d
.byte 0x05
.long LTable-LRIP
LRIP:
#elif defined(__x86_64__)
leaq LTable(%rip), %r8
#else
#error Unsupported architecture
#endif
addq %rcx,%rsi // point to 1st byte not copied
addq %rcx,%rdi
movl (%r8,%rax,4),%eax // get offset of routine
negq %rcx // now generate offset to 1st byte to be copied
addq %r8,%rax // generate address of copy loop
jmp *%rax // enter copy loop, selected by source alignment
.align 2
LTable: // table of copy loop addresses
// force generation of assembly-time constants. Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
.set LMod0Offset, LMod0 - LTable
.set LMod1Offset, LMod1 - LTable
.set LMod2Offset, LMod2 - LTable
.set LMod3Offset, LMod3 - LTable
.set LMod4Offset, LMod4 - LTable
.set LMod5Offset, LMod5 - LTable
.set LMod6Offset, LMod6 - LTable
.set LMod7Offset, LMod7 - LTable
.set LMod8Offset, LMod8 - LTable
.set LMod9Offset, LMod9 - LTable
.set LMod10Offset, LMod10 - LTable
.set LMod11Offset, LMod11 - LTable
.set LMod12Offset, LMod12 - LTable
.set LMod13Offset, LMod13 - LTable
.set LMod14Offset, LMod14 - LTable
.set LMod15Offset, LMod15 - LTable
.long LMod0Offset
.long LMod1Offset
.long LMod2Offset
.long LMod3Offset
.long LMod4Offset
.long LMod5Offset
.long LMod6Offset
.long LMod7Offset
.long LMod8Offset
.long LMod9Offset
.long LMod10Offset
.long LMod11Offset
.long LMod12Offset
.long LMod13Offset
.long LMod14Offset
.long LMod15Offset
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
// rdi = dest
// rsi = source
// rdx = length (>= kVeryLong bytes)
LVeryLong:
pushq %r11 // save return value
movq $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
call *%rax // call very long operand routine
popq %rax // pop return value
popq %rbp
ret
// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since
// we do not know if the destination is in cache or not.
Lfastpath:
addq %rcx,%rsi // restore ptrs to 1st byte of source and dest
addq %rcx,%rdi
negl %ecx // make length positive (known to be < 2GB)
orl %edx,%ecx // restore total #bytes remaining to move
cld // we'll move forward
shrl $2,%ecx // compute #words to move
rep // the u-code will optimize this
movsl
jmp LLeftovers // handle 0..3 leftover bytes
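/*
 * The "orl %edx,%ecx" above recombines the chunked byte count with the
 * residual: the former is a multiple of 64 and the latter is 0..63, so their
 * set bits never overlap and OR equals ADD.  A self-contained check
 * (hypothetical, for illustration only):
 *
 *	#include <assert.h>
 *
 *	int main(void) {
 *		for (unsigned len = 80; len < (500*1024); len++) {
 *			unsigned chunks   = len & ~63u;  // multiple of 64
 *			unsigned residual = len &  63u;  // 0..63
 *			assert((chunks | residual) == chunks + residual);
 *		}
 *		return 0;
 *	}
 */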
// Forward loop for medium length operands in which low four bits of %rsi == 0000
LMod0:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
jmp 1f
.align 4,0x90 // 16-byte align inner loops
1: // loop over 64-byte chunks
movdqa (%rsi,%rcx),%xmm0
movdqa 16(%rsi,%rcx),%xmm1
movdqa 32(%rsi,%rcx),%xmm2
movdqa 48(%rsi,%rcx),%xmm3
movdqa %xmm0,(%rdi,%rcx)
movdqa %xmm1,16(%rdi,%rcx)
movdqa %xmm2,32(%rdi,%rcx)
movdqa %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
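/*
 * The loops below for source alignments 1..15 (other than the movss/shufpd
 * variants at LMod4, LMod8 and LMod12) keep every vector load and store
 * 16-byte aligned and splice adjacent aligned loads with "palignr".  A rough,
 * self-contained C model of the instruction (hypothetical, illustration only):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// palignr $shift, src, dst : dst = low 16 bytes of (dst:src) >> (shift*8)
 *	static void palignr_model(uint8_t dst[16], const uint8_t src[16], int shift) {
 *		uint8_t cat[32];
 *		memcpy(cat, src, 16);        // low half  = source operand
 *		memcpy(cat + 16, dst, 16);   // high half = destination operand
 *		memcpy(dst, cat + shift, 16);
 *	}
 *
 *	// e.g. in LMod1, an aligned load covering source bytes [-1..14] spliced
 *	// (shift = 1) with the next aligned load covering [15..30] yields the
 *	// unaligned bytes [0..15], which are then stored with an aligned movdqa.
 */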
// Forward loop for medium length operands in which low four bits of %rsi == 0001
LMod1:
movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st quadword
1: // loop over 64-byte chunks
movdqa 15(%rsi,%rcx),%xmm1
movdqa 31(%rsi,%rcx),%xmm2
movdqa 47(%rsi,%rcx),%xmm3
movdqa 63(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $1,%xmm2,%xmm3
palignr $1,%xmm1,%xmm2
palignr $1,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0010
LMod2:
movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 14(%rsi,%rcx),%xmm1
movdqa 30(%rsi,%rcx),%xmm2
movdqa 46(%rsi,%rcx),%xmm3
movdqa 62(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $2,%xmm2,%xmm3
palignr $2,%xmm1,%xmm2
palignr $2,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0011
LMod3:
movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 13(%rsi,%rcx),%xmm1
movdqa 29(%rsi,%rcx),%xmm2
movdqa 45(%rsi,%rcx),%xmm3
movdqa 61(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $3,%xmm2,%xmm3
palignr $3,%xmm1,%xmm2
palignr $3,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the single-precision float data type in order to use "movss" to merge vectors.
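/*
 * Hypothetical C model (illustration only) of the movss/pshufd trick used
 * below: merge the low dword of the next aligned load, then rotate the dwords
 * right by one to recover the four unaligned source dwords in order.
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint32_t d[4]; } vec4;   // d[0] = lowest address
 *
 *	static vec4 merge_rotate(vec4 lo, vec4 hi) {  // lo = load at -4, hi = load at +12
 *		lo.d[0] = hi.d[0];                    // movss: copy low 4 bytes
 *		vec4 r = { { lo.d[1], lo.d[2], lo.d[3], lo.d[0] } };  // pshufd $0x39
 *		return r;                             // = unaligned source dwords 0..3
 *	}
 */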
LMod4:
movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movaps 12(%rsi,%rcx),%xmm1
movaps 28(%rsi,%rcx),%xmm2
movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
movaps 44(%rsi,%rcx),%xmm3
movss %xmm2,%xmm1
pshufd $(0x39),%xmm1,%xmm1
movaps 60(%rsi,%rcx),%xmm4
movss %xmm3,%xmm2
pshufd $(0x39),%xmm2,%xmm2
movaps %xmm0,(%rdi,%rcx)
movss %xmm4,%xmm3
pshufd $(0x39),%xmm3,%xmm3
movaps %xmm1,16(%rdi,%rcx)
movaps %xmm2,32(%rdi,%rcx)
movaps %xmm4,%xmm0
movaps %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0101
LMod5:
movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 11(%rsi,%rcx),%xmm1
movdqa 27(%rsi,%rcx),%xmm2
movdqa 43(%rsi,%rcx),%xmm3
movdqa 59(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $5,%xmm2,%xmm3
palignr $5,%xmm1,%xmm2
palignr $5,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0110
LMod6:
movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 10(%rsi,%rcx),%xmm1
movdqa 26(%rsi,%rcx),%xmm2
movdqa 42(%rsi,%rcx),%xmm3
movdqa 58(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $6,%xmm2,%xmm3
palignr $6,%xmm1,%xmm2
palignr $6,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 0111
LMod7:
movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 9(%rsi,%rcx),%xmm1
movdqa 25(%rsi,%rcx),%xmm2
movdqa 41(%rsi,%rcx),%xmm3
movdqa 57(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $7,%xmm2,%xmm3
palignr $7,%xmm1,%xmm2
palignr $7,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the double-precision float data type in order to use "shufpd" to shift by 8 bytes.
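/*
 * Hypothetical C model (illustration only) of the "shufpd $01" splice used
 * below: it behaves like an 8-byte palignr, pairing the high quadword of one
 * aligned load with the low quadword of the next.
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint64_t q[2]; } vec2;   // q[0] = lower address
 *
 *	static vec2 shufpd_01(vec2 lo, vec2 hi) { // lo = load at -8, hi = load at +8
 *		vec2 r = { { lo.q[1], hi.q[0] } };// = the two unaligned source qwords
 *		return r;
 *	}
 */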
LMod8:
cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong)
jle Lfastpath // long enough for fastpath in microcode
movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
movapd 8(%rsi,%rcx),%xmm1
movapd 24(%rsi,%rcx),%xmm2
shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
movapd 40(%rsi,%rcx),%xmm3
shufpd $01,%xmm2,%xmm1
movapd 56(%rsi,%rcx),%xmm4
shufpd $01,%xmm3,%xmm2
movapd %xmm0,(%rdi,%rcx)
shufpd $01,%xmm4,%xmm3
movapd %xmm1,16(%rdi,%rcx)
movapd %xmm2,32(%rdi,%rcx)
movapd %xmm4,%xmm0
movapd %xmm3,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1001
LMod9:
movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 7(%rsi,%rcx),%xmm1
movdqa 23(%rsi,%rcx),%xmm2
movdqa 39(%rsi,%rcx),%xmm3
movdqa 55(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $9,%xmm2,%xmm3
palignr $9,%xmm1,%xmm2
palignr $9,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1010
LMod10:
movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 6(%rsi,%rcx),%xmm1
movdqa 22(%rsi,%rcx),%xmm2
movdqa 38(%rsi,%rcx),%xmm3
movdqa 54(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $10,%xmm2,%xmm3
palignr $10,%xmm1,%xmm2
palignr $10,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1011
LMod11:
movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 5(%rsi,%rcx),%xmm1
movdqa 21(%rsi,%rcx),%xmm2
movdqa 37(%rsi,%rcx),%xmm3
movdqa 53(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $11,%xmm2,%xmm3
palignr $11,%xmm1,%xmm2
palignr $11,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the single-precision float data type in order to use "movss" to merge vectors.
LMod12:
movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified
jmp 1f
.align 4,0x90
1: // loop over 64-byte chunks
pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
pshufd $(0x93),20(%rsi,%rcx),%xmm2
pshufd $(0x93),36(%rsi,%rcx),%xmm3
pshufd $(0x93),52(%rsi,%rcx),%xmm4
movaps %xmm4,%xmm5
movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
movss %xmm2,%xmm3
movss %xmm1,%xmm2
movss %xmm0,%xmm1
movaps %xmm1,(%rdi,%rcx)
movaps %xmm2,16(%rdi,%rcx)
movaps %xmm5,%xmm0
movaps %xmm3,32(%rdi,%rcx)
movaps %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1101
LMod13:
movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 3(%rsi,%rcx),%xmm1
movdqa 19(%rsi,%rcx),%xmm2
movdqa 35(%rsi,%rcx),%xmm3
movdqa 51(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $13,%xmm2,%xmm3
palignr $13,%xmm1,%xmm2
palignr $13,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1110
LMod14:
movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 2(%rsi,%rcx),%xmm1
movdqa 18(%rsi,%rcx),%xmm2
movdqa 34(%rsi,%rcx),%xmm3
movdqa 50(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $14,%xmm2,%xmm3
palignr $14,%xmm1,%xmm2
palignr $14,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Forward loop for medium length operands in which low four bits of %rsi == 1111
LMod15:
movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq
1: // loop over 64-byte chunks
movdqa 1(%rsi,%rcx),%xmm1
movdqa 17(%rsi,%rcx),%xmm2
movdqa 33(%rsi,%rcx),%xmm3
movdqa 49(%rsi,%rcx),%xmm4
movdqa %xmm0,%xmm5
movdqa %xmm4,%xmm0
palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
palignr $15,%xmm2,%xmm3
palignr $15,%xmm1,%xmm2
palignr $15,%xmm5,%xmm1
movdqa %xmm1,(%rdi,%rcx)
movdqa %xmm2,16(%rdi,%rcx)
movdqa %xmm3,32(%rdi,%rcx)
movdqa %xmm4,48(%rdi,%rcx)
addq $64,%rcx
jnz 1b
jmp LShort // copy remaining 0..63 bytes and done
// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
// rdx = length
// rsi = source ptr
// rdi = dest ptr
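/*
 * Hypothetical C outline of the reverse path (illustration only): both
 * pointers are first advanced one past the end, then bytes are copied from
 * high addresses toward low, so overlapping source data is read before it is
 * overwritten.  The plain byte loop stands in for the tuned loops below.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static void reverse_copy(uint8_t *dst, const uint8_t *src, size_t len) {
 *		const uint8_t *s = src + len;   // one past end of source
 *		uint8_t       *d = dst + len;   // one past end of dest
 *		while (len--) *--d = *--s;
 *	}
 */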
LReverse:
addq %rdx,%rsi // point to end of strings
addq %rdx,%rdi
cmpq $(kShort),%rdx // long enough to bother with SSE?
ja LReverseNotShort // yes
// Handle reverse short copies.
// edx = length (<= kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseShort:
movl %edx,%ecx // copy length
shrl $3,%ecx // #quadwords
jz 3f
1:
subq $8,%rsi
movq (%rsi),%rax
subq $8,%rdi
movq %rax,(%rdi)
decl %ecx
jnz 1b
3:
andl $7,%edx // bytes?
jz 5f
4:
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %edx
jnz 4b
5:
movq %r11,%rax // get return value (dst ptr) for memcpy/memmove
popq %rbp
ret
// Handle a reverse move long enough to justify using SSE.
// rdx = length (> kShort)
// rsi = one byte past end of source
// rdi = one byte past end of dest
LReverseNotShort:
movl %edi,%ecx // copy destination
andl $15,%ecx // get #bytes to align destination
je LReverseDestAligned // already aligned
subq %rcx,%rdx // adjust length
1: // loop copying 1..15 bytes
decq %rsi
movb (%rsi),%al
decq %rdi
movb %al,(%rdi)
decl %ecx
jnz 1b
// Destination is now aligned. Prepare for reverse loops.
LReverseDestAligned:
movq %rdx,%rcx // copy length
andl $63,%edx // get remaining bytes for LReverseShort
andq $-64,%rcx // get number of bytes we will copy in inner loop
subq %rcx,%rsi // point to endpoint of copy
subq %rcx,%rdi
testl $15,%esi // is source aligned too?
jnz LReverseUnalignedLoop // no
LReverseAlignedLoop: // loop over 64-byte chunks
movdqa -16(%rsi,%rcx),%xmm0
movdqa -32(%rsi,%rcx),%xmm1
movdqa -48(%rsi,%rcx),%xmm2
movdqa -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseAlignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
LReverseUnalignedLoop: // loop over 64-byte chunks
movdqu -16(%rsi,%rcx),%xmm0
movdqu -32(%rsi,%rcx),%xmm1
movdqu -48(%rsi,%rcx),%xmm2
movdqu -64(%rsi,%rcx),%xmm3
movdqa %xmm0,-16(%rdi,%rcx)
movdqa %xmm1,-32(%rdi,%rcx)
movdqa %xmm2,-48(%rdi,%rcx)
movdqa %xmm3,-64(%rdi,%rcx)
subq $64,%rcx
jne LReverseUnalignedLoop
jmp LReverseShort // copy remaining 0..63 bytes and done
COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)