/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
        
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * (i.e. Supplemental SSE3: the loops below rely on "palignr", and the commpage
 * descriptor at the end requires kHasSupplementalSSE3) and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the microarchitecture:
 */

#define kShort  80			// too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)	// cutoff for microcode fastpath for "rep/movsl"
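
// Rough C sketch of how these thresholds steer the entry points below (illustration
// only; the helper names are hypothetical and the kFastUCode special case is omitted):
//
//	void *sketch_memcpy(void *dst, const void *src, size_t len) {
//		if ((size_t)((char *)dst - (const char *)src) < len)
//			return copy_reverse(dst, src, len);          // destructive overlap
//		if (len <= kShort)
//			return copy_forward_short(dst, src, len);    // not worth SSE setup
//		if (len >= kVeryLong)
//			return copy_forward_longcopy(dst, src, len); // non-temporal stores
//		return copy_forward_sse(dst, src, len);              // the loops in this file
//	}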


// void bcopy(const void *src, void *dst, size_t len);
 
        .text
        .align	5, 0x90
LZero:
Lbcopy_sse4:				// void bcopy(const void *src, void *dst, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi		// get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These entry points must start 32 bytes after bcopy():
//

        .align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi		// get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes
        
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
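
// Hedged C equivalent of this path (sketch only; assumes <stdint.h>, and the
// unaligned 32-bit accesses mirror the "movl" pairs below):
//
//	static void copy_short_forward(char *dst, const char *src, size_t len) {
//		size_t words = len >> 2;
//		while (words--) {                       // copy whole 32-bit words
//			*(uint32_t *)dst = *(const uint32_t *)src;
//			src += 4;  dst += 4;
//		}
//		for (len &= 3; len; len--)              // then 0..3 leftover bytes
//			*dst++ = *src++;
//	}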

Lshort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	LLeftovers
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	Lexit
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
Lexit:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong		// use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
	jz	LDestAligned		// already aligned
        subl    %edx,%ecx               // decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b
        
// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %ecx,%edx               // copy length
	movl	%esi,%eax		// copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
	andl	$15,%eax		// mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
	movl	(_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
	jmp	*%eax
	
	.align	2
LTable:					// table of copy loop addresses
	.long	LMod0 + _COMM_PAGE_BCOPY - LZero
	.long	LMod1 + _COMM_PAGE_BCOPY - LZero
	.long	LMod2 + _COMM_PAGE_BCOPY - LZero
	.long	LMod3 + _COMM_PAGE_BCOPY - LZero
	.long	LMod4 + _COMM_PAGE_BCOPY - LZero
	.long	LMod5 + _COMM_PAGE_BCOPY - LZero
	.long	LMod6 + _COMM_PAGE_BCOPY - LZero
	.long	LMod7 + _COMM_PAGE_BCOPY - LZero
	.long	LMod8 + _COMM_PAGE_BCOPY - LZero
	.long	LMod9 + _COMM_PAGE_BCOPY - LZero
	.long	LMod10 + _COMM_PAGE_BCOPY - LZero
	.long	LMod11 + _COMM_PAGE_BCOPY - LZero
	.long	LMod12 + _COMM_PAGE_BCOPY - LZero
	.long	LMod13 + _COMM_PAGE_BCOPY - LZero
	.long	LMod14 + _COMM_PAGE_BCOPY - LZero
	.long	LMod15 + _COMM_PAGE_BCOPY - LZero


// Very long forward moves.  These are at least kVeryLong (500KB) in length.  They are
// special cased and aggressively optimized, not so much because they are common or
// useful, but because they show up in benchmarks.  There isn't enough room for them
// in the area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.
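//
// In C terms, the call below is roughly equivalent to the following (a sketch; the
// pointer type is an assumption inferred from the push order, with dst pushed last
// so that it is the first argument):
//
//	typedef void (*longcopy_fn)(void *dst, const void *src, size_t len);
//	((longcopy_fn)(_COMM_PAGE_LONGCOPY))(dst, src, len);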

LVeryLong:
	pushl	%ecx			// length (>= kVeryLong)
	pushl	%esi			// source ptr
	pushl	%edi			// dest ptr
	movl	$(_COMM_PAGE_LONGCOPY),%eax
	call	*%eax			// do the long copy
	addl	$12,%esp		// pop off our parameters
	jmp	Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do not
// know whether the destination is in cache or not.
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
	negl	%edx			// make length positive
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	movl	%ecx,%edx		// copy total length to move
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	jmp	1f
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001
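//
// The same recombination, written as a hedged C/SSSE3-intrinsics sketch for this
// imm==1 case (illustration only, compiled separately with SSSE3 enabled, e.g.
// <tmmintrin.h> and -mssse3; the asm loop carries %xmm0 across iterations instead of
// reloading "prev", and like the asm this reads the aligned 16-byte blocks that
// straddle the chunk's edges):
//
//	static void copy64_srcmod1(char *dst, const char *src)  // src%16==1, dst%16==0
//	{
//		const __m128i *s = (const __m128i *)(src - 1);   // aligned base below src
//		__m128i prev = _mm_load_si128(s);                // bytes src[-1..14]
//		__m128i a = _mm_load_si128(s + 1);               // bytes src[15..30]
//		__m128i b = _mm_load_si128(s + 2);
//		__m128i c = _mm_load_si128(s + 3);
//		__m128i d = _mm_load_si128(s + 4);               // bytes src[63..78]
//		__m128i *o = (__m128i *)dst;
//		_mm_store_si128(o + 0, _mm_alignr_epi8(a, prev, 1));  // src[0..15]
//		_mm_store_si128(o + 1, _mm_alignr_epi8(b, a, 1));     // src[16..31]
//		_mm_store_si128(o + 2, _mm_alignr_epi8(c, b, 1));     // src[32..47]
//		_mm_store_si128(o + 3, _mm_alignr_epi8(d, c, 1));     // src[48..63]
//	}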

LMod1:
	movdqa	-1(%esi,%edx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
	movdqa	-2(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
	movdqa	-3(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	
	
// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the single-precision float data type so we can use "movss" to merge vectors.
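//
// Worked example for one output vector:  %xmm0 holds the aligned load at src-4
// (bytes src[-4..11]).  "movss" replaces its stale low dword with src[12..15] from
// the next aligned load, and "pshufd $0x39" rotates the register right one dword,
// leaving src[0..15] ready for the aligned store.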

LMod4:
	movaps	-4(%esi,%edx),%xmm0	// 4-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
	movss	%xmm2,%xmm1
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%esi,%edx),%xmm4
	movss	%xmm3,%xmm2
	pshufd	$(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
	movss	%xmm4,%xmm3
	pshufd	$(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
	movaps	%xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
	movdqa	-5(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
	movdqa	-6(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
	movdqa	-7(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	
	
// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the double-precision float data type so we can use "shufpd" to shift by 8 bytes.
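//
// Worked example for one output vector:  %xmm0 holds the aligned load at src-8
// (bytes src[-8..7]) and %xmm1 the load at src+8 (bytes src[8..23]).  "shufpd $01"
// keeps the high qword of %xmm0 and appends the low qword of %xmm1, producing
// src[0..15] in %xmm0 for the aligned store.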

LMod8:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%esi,%edx),%xmm0	// 8-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%esi,%edx),%xmm4
	shufpd	$01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
	shufpd	$01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
	movapd	%xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
	movdqa	-9(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
	movdqa	-10(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
	movdqa	-11(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	

// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the single-precision float data type so we can use "movss" to merge vectors.
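//
// Worked example for one output vector:  "pshufd $0x93" loads the aligned dq at
// src+4 (bytes src[4..19]) and rotates it right 12 bytes, giving
// [16..19, 4..7, 8..11, 12..15]; "movss" then overwrites the low dword with
// src[0..3], carried over in %xmm0 from the previous iteration (or from the
// priming load), yielding src[0..15].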

LMod12:
	movss	(%esi,%edx),%xmm0	// load 1st four bytes of source into low dword of %xmm0
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%esi,%edx),%xmm2
	pshufd	$(0x93),36(%esi,%edx),%xmm3
	pshufd	$(0x93),52(%esi,%edx),%xmm4
	
	movaps	%xmm4,%xmm5
	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination
	movss	%xmm2,%xmm3
	movss	%xmm1,%xmm2
	movss	%xmm0,%xmm1
	
        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
	movaps	%xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
	movdqa	-13(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
	movdqa	-14(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
	movdqa	-15(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
        

// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest
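
// Hedged C sketch of this path (pointers start one byte past the ends, as noted
// above; assumes <stdint.h>, and the unaligned 32-bit accesses mirror the "movl"
// pairs below):
//
//	while (len >= 4) {                      // whole 32-bit words, highest first
//		src -= 4;  dst -= 4;  len -= 4;
//		*(uint32_t *)dst = *(const uint32_t *)src;
//	}
//	while (len--)                           // then 0..3 remaining bytes
//		*--dst = *--src;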

LReverseShort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// #words
	jz	3f
1:
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:
	andl	$3,%edx			// bytes?
	jz	5f
4:
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b
        
// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
	testl	$15,%esi		// is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)
        
        subl    $64,%edx
        jne     LReverseAlignedLoop
        
        jmp     LReverseShort           // copy remaining 0..63 bytes and done

    
// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
        
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3
        
        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)
        
        subl    $64,%edx
        jne     LReverseUnalignedLoop
        
        jmp     LReverseShort           // copy remaining 0..63 bytes and done


	COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)