/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
        
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Core-class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80			// too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)	// cutoff for microcode fastpath for "rep/movsl"
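
// Roughly, these cutoffs select a strategy as follows (an illustrative C-like
// sketch, not part of the build; the fastpath test actually uses the inner-loop
// length after destination alignment):
//
//	if (len <= kShort)                copy with the scalar loops at Lshort;
//	else if (len >= kVeryLong)        call the commpage longcopy routine (LVeryLong);
//	else if (len >= kFastUCode && src/dst mutually 8-byte aligned)
//	                                  use the "rep/movsl" microcode fastpath (Lfastpath);
//	else                              run one of the 64-byte SSE loops (LMod0..LMod15).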

// void bcopy(const void *src, void *dst, size_t len);
 
COMMPAGE_FUNCTION_START(bcopy_sse3x, 32, 5)
LZero:
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi		// get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These must start exactly 32 bytes past bcopy():
//

        .align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi		// get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes
        
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
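//
// Roughly equivalent C for this path (illustrative sketch only, never assembled):
//
//	uint32_t *s4 = (uint32_t *)src, *d4 = (uint32_t *)dst;
//	for (size_t i = len >> 2; i != 0; i--) *d4++ = *s4++;	// whole doublewords
//	uint8_t *s1 = (uint8_t *)s4, *d1 = (uint8_t *)d4;
//	for (size_t i = len & 3;  i != 0; i--) *d1++ = *s1++;	// 0..3 leftover bytes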

Lshort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	LLeftovers
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
LLeftovers:				// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	Lexit
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
Lexit:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr
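//
// The byte count needed to 16-byte align the destination is (-dst) & 15;
// roughly, in C (illustrative only):
//
//	size_t head = (0 - (uintptr_t)dst) & 15;	// 0..15 bytes
//	len -= head;					// then copy "head" bytes one at a time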

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong		// use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
	jz	LDestAligned		// already aligned
        subl    %edx,%ecx               // decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b
        
// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
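//
// Roughly, the setup above the loops is (illustrative C-like sketch):
//
//	size_t residual = len & 63;		// ecx: finished later by Lshort
//	size_t chunks   = len & ~63;		// bytes handled by the 64-byte inner loop
//	src += chunks;  dst += chunks;		// esi/edi: first byte NOT moved by the loop
//	long  index     = -(long)chunks;	// edx: negative offset walked up to zero
//	dispatch to LMod[ (uintptr_t)src & 15 ];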

LDestAligned:
        movl    %ecx,%edx               // copy length
	movl	%esi,%eax		// copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
	andl	$15,%eax		// mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
.set LTableOffset, LTable - LZero
	leal	(LTableOffset)(,%eax,4), %eax	// load jump table entry address, relative to LZero
	movl	_COMM_PAGE_BCOPY(%eax), %eax	// load jump table entry
	addl	$(_COMM_PAGE_BCOPY), %eax	// add runtime address of LZero to get final function
	jmp	*%eax
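
// Because this code executes from the commpage, the table below stores offsets
// from LZero rather than absolute addresses; the computed target is, roughly
// (illustrative):
//
//	target = _COMM_PAGE_BCOPY + table[src & 15];	// table[i] == (LModN - LZero)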
	
	.align	2
LTable:					// table of copy loop addresses
// Force generation of assembly-time constants.  Otherwise the assembler
// creates subtractor relocations relative to the first external symbol, and
// this file has none.
.set LMod0Offset, LMod0 - LZero
.set LMod1Offset, LMod1 - LZero
.set LMod2Offset, LMod2 - LZero
.set LMod3Offset, LMod3 - LZero
.set LMod4Offset, LMod4 - LZero
.set LMod5Offset, LMod5 - LZero
.set LMod6Offset, LMod6 - LZero
.set LMod7Offset, LMod7 - LZero
.set LMod8Offset, LMod8 - LZero
.set LMod9Offset, LMod9 - LZero
.set LMod10Offset, LMod10 - LZero
.set LMod11Offset, LMod11 - LZero
.set LMod12Offset, LMod12 - LZero
.set LMod13Offset, LMod13 - LZero
.set LMod14Offset, LMod14 - LZero
.set LMod15Offset, LMod15 - LZero
	.long LMod0Offset
	.long LMod1Offset
	.long LMod2Offset
	.long LMod3Offset
	.long LMod4Offset
	.long LMod5Offset
	.long LMod6Offset
	.long LMod7Offset
	.long LMod8Offset
	.long LMod9Offset
	.long LMod10Offset
	.long LMod11Offset
	.long LMod12Offset
	.long LMod13Offset
	.long LMod14Offset
	.long LMod15Offset


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.
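//
// The pushes below follow the normal cdecl convention (arguments right to left),
// so this is roughly equivalent to the call (illustrative; "longcopy" here just
// names the routine reached through _COMM_PAGE_LONGCOPY):
//
//	longcopy(dst, src, len);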

LVeryLong:
	pushl	%ecx			// length (>= kVeryLong)
	pushl	%esi			// source ptr
	pushl	%edi			// dest ptr
	movl	$(_COMM_PAGE_LONGCOPY),%eax
	call	*%eax			// do the long copy
	addl	$12,%esp		// pop off our parameters
	jmp	Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do
// not know whether the destination is in cache or not.
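//
// Roughly, the test that routes control here from LMod0 and LMod8 is
// (illustrative sketch; %edx holds the negated inner-loop length):
//
//	if (inner_loop_len >= kFastUCode)	// i.e. %edx <= -kFastUCode
//		goto Lfastpath;			// dest 16-byte aligned, source at least 8-byte aligned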

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
	negl	%edx			// make length positive
	orl	%edx,%ecx		// restore total #bytes remaining to move
	cld				// we'll move forward
	movl	%ecx,%edx		// copy total length to move
	shrl	$2,%ecx			// compute #words to move
	rep				// the u-code will optimize this
	movsl
	jmp	LLeftovers		// handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	jmp	1f
	.align	4,0x90			// 16-byte align inner loops
1:					// loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001
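//
// This loop (and the other palignr loops below) carries the previous aligned
// load across iterations in %xmm0 and uses "palignr" to extract each unaligned
// 16-byte block from two adjacent aligned loads.  Roughly, per block
// (illustrative; "||" concatenates with the left operand in the high half):
//
//	out[i] = shr( load[i] || load[i-1], (src & 15) bytes )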

LMod1:
	movdqa	-1(%esi,%edx),%xmm0	// prime the loop by loading 1st quadword
1:					// loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$1,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$1,%xmm2,%xmm3
	palignr	$1,%xmm1,%xmm2
	palignr	$1,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
	movdqa	-2(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$2,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$2,%xmm2,%xmm3
	palignr	$2,%xmm1,%xmm2
	palignr	$2,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
	movdqa	-3(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$3,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$3,%xmm2,%xmm3
	palignr	$3,%xmm1,%xmm2
	palignr	$3,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	
	
// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the single-precision float data type so that we can use "movss" to merge vectors.
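//
// Here the 4-byte shift is built from a "movss" merge plus a "pshufd" rotate.
// Roughly, per 16-byte block (illustrative sketch):
//
//	t = prev_load;  t.dword[0] = next_load.dword[0];		// movss
//	out = { t.dword[1], t.dword[2], t.dword[3], t.dword[0] };	// pshufd $0x39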

LMod4:
	movaps	-4(%esi,%edx),%xmm0	// 4-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
	movss	%xmm1,%xmm0		// copy low 4 bytes of source into destination
	pshufd	$(0x39),%xmm0,%xmm0	// rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
	movss	%xmm2,%xmm1
	pshufd	$(0x39),%xmm1,%xmm1
	movaps	60(%esi,%edx),%xmm4
	movss	%xmm3,%xmm2
	pshufd	$(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
	movss	%xmm4,%xmm3
	pshufd	$(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
	movaps	%xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
	movdqa	-5(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$5,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$5,%xmm2,%xmm3
	palignr	$5,%xmm1,%xmm2
	palignr	$5,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
	movdqa	-6(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$6,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$6,%xmm2,%xmm3
	palignr	$6,%xmm1,%xmm2
	palignr	$6,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
	movdqa	-7(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$7,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$7,%xmm2,%xmm3
	palignr	$7,%xmm1,%xmm2
	palignr	$7,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	
	
// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the double-precision float data type so that we can use "shufpd" to shift by 8 bytes.
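//
// Each "shufpd $01" pairs the high quadword of the earlier load with the low
// quadword of the later one; roughly (illustrative, high half written first):
//
//	out = shr( later || earlier, 8 bytes )
//	    = { low: earlier.qword[1], high: later.qword[0] }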

LMod8:
	cmpl	$(-kFastUCode),%edx	// %edx == -length, where (length < kVeryLong)
	jle	Lfastpath		// long enough for fastpath in microcode
	movapd	-8(%esi,%edx),%xmm0	// 8-byte aligned: prime the loop
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
	shufpd	$01,%xmm1,%xmm0		// %xmm0 <- shr( %xmm1 || %xmm0, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
	shufpd	$01,%xmm2,%xmm1
	movapd	56(%esi,%edx),%xmm4
	shufpd	$01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
	shufpd	$01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
	movapd	%xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
	movdqa	-9(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$9,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$9,%xmm2,%xmm3
	palignr	$9,%xmm1,%xmm2
	palignr	$9,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
	movdqa	-10(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$10,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$10,%xmm2,%xmm3
	palignr	$10,%xmm1,%xmm2
	palignr	$10,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
	movdqa	-11(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$11,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$11,%xmm2,%xmm3
	palignr	$11,%xmm1,%xmm2
	palignr	$11,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
	

// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the single-precision float data type so that we can use "movss" to merge vectors.
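//
// Here each aligned load is pre-rotated with "pshufd" so a single "movss" can
// merge in the four bytes carried over from the preceding load.  Roughly, per
// 16-byte block (illustrative sketch):
//
//	t = { load.dword[3], load.dword[0], load.dword[1], load.dword[2] };	// pshufd $0x93
//	t.dword[0] = carry;			// movss: low 4 bytes from the previous load
//	out = t;  carry = load.dword[3];	// becomes the carry for the next block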

LMod12:
	movss	(%esi,%edx),%xmm0	// prefetch 1st four bytes of source, right justified
	jmp	1f
	.align	4,0x90
1:					// loop over 64-byte chunks
	pshufd	$(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
	pshufd	$(0x93),20(%esi,%edx),%xmm2
	pshufd	$(0x93),36(%esi,%edx),%xmm3
	pshufd	$(0x93),52(%esi,%edx),%xmm4
	
	movaps	%xmm4,%xmm5
	movss	%xmm3,%xmm4		// copy low 4 bytes of source into destination
	movss	%xmm2,%xmm3
	movss	%xmm1,%xmm2
	movss	%xmm0,%xmm1
	
        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
	movaps	%xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
	movdqa	-13(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$13,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$13,%xmm2,%xmm3
	palignr	$13,%xmm1,%xmm2
	palignr	$13,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
	movdqa	-14(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$14,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$14,%xmm2,%xmm3
	palignr	$14,%xmm1,%xmm2
	palignr	$14,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
	movdqa	-15(%esi,%edx),%xmm0	// prime the loop by loading 1st source dq
1:					// loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4
	
	movdqa	%xmm0,%xmm5
	movdqa	%xmm4,%xmm0

	palignr	$15,%xmm3,%xmm4		// dest <- shr( dest || source, imm*8 )
	palignr	$15,%xmm2,%xmm3
	palignr	$15,%xmm1,%xmm2
	palignr	$15,%xmm5,%xmm1
	
        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)
        
        addl    $64,%edx
        jnz     1b
        
        jmp     Lshort                  // copy remaining 0..63 bytes and done
        

// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr
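//
// Roughly, in C (illustrative only): both pointers start one byte past the end
// of their buffers and the copy walks downward, which is what makes overlapping
// moves with dst > src safe:
//
//	const uint8_t *s = (const uint8_t *)src + len;
//	uint8_t       *d = (uint8_t *)dst + len;
//	while (len--) *--d = *--s;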

LReverse:
        addl    %ecx,%esi               // point one byte past end of source and dest
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
	movl    %ecx,%edx		// copy length
	shrl	$2,%ecx			// #words
	jz	3f
1:
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:
	andl	$3,%edx			// bytes?
	jz	5f
4:
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
        movl    8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
	popl	%ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b
        
// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
	testl	$15,%esi		// is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)
        
        subl    $64,%edx
        jne     LReverseAlignedLoop
        
        jmp     LReverseShort           // copy remaining 0..63 bytes and done

    
// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
        
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3
        
        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)
        
        subl    $64,%edx
        jne     LReverseUnalignedLoop
        
        jmp     LReverseShort           // copy remaining 0..63 bytes and done

COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)