/*
* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* The contents of this file constitute Original Code as defined in and
* are subject to the Apple Public Source License Version 1.1 (the
* "License"). You may not use this file except in compliance with the
* License. Please obtain a copy of the License at
* http://www.apple.com/publicsource and read it before using this file.
*
* This Original Code and all software distributed under the License are
* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
* License for the specific language governing rights and limitations
* under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
#include <machine/cpu_capabilities.h>
/* This file contains the following functions:
*
* void *memset(void *b, int c, size_t len)
* void memset_pattern4(void *b, const void *c4, size_t len)
* void memset_pattern8(void *b, const void *c8, size_t len)
* void memset_pattern16(void *b, const void *c16, size_t len)
*
* Calls of memset() with c==0 are routed to the bzero() routine. Most of the
* others go to Lmemset_pattern, which is entered as follows:
* %rdi = ptr to memory to set (aligned)
* %rdx = length (which can be short, though we bias in favor of long operands)
* %xmm0 = the pattern to store
* Return conditions:
* %eax, %edi, %esi, %ecx, and %edx all trashed
*
* NB: we avoid "stos" family of instructions (stosl, stosb), as they are very slow
* on P4s and probably other processors.
*/
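// As a rough illustration only (not part of this file), the routing described
// above corresponds to the following C sketch; bzero() and the pattern-store
// engine below are the real implementations:
//
//	void *memset(void *b, int c, size_t len)
//	{
//		unsigned char byte = (unsigned char)c;
//		if (byte == 0) {
//			bzero(b, len);				// zero fills go to bzero()
//		} else {
//			uint32_t pat = 0x01010101u * byte;	// replicate byte into 4 bytes
//			// fill len bytes at b with pat: inline loop if short,
//			// SSE pattern-store engine if long
//		}
//		return b;
//	}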
#define kShort 255 // for nonzero memset(), too short for commpage
.text
.globl _memset
.align 2
_memset: // void *memset(void *b, int c, size_t len)
andl $0xFF,%esi // mask "c" down to a byte; is it zero?
jnz LNonzero // not a bzero
movq %rdx,%rsi // put count where bzero() expects it
jmp _bzero // enter _bzero
// Handle memset of a nonzero value.
LNonzero:
movq %rdi,%r8 // preserve the original pointer so we can return it
movl %esi,%eax // replicate byte in %esi into all four bytes
shll $8,%esi
orl %esi,%eax
movl %eax,%esi
shll $16,%esi
orl %esi,%eax // now %eax has "c" in all 4 bytes
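// (Roughly equivalent C, for illustration: pat = 0x01010101u * (uint8_t)c.)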
cmpq $(kShort),%rdx // is operand too short for SSE?
ja LCallCommpage // no
// Nonzero memset() too short to call commpage.
// %eax = replicated 4-byte pattern
// %rdi = ptr
// %edx = length (<= kShort)
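// The short path below is roughly this C sketch (illustrative only; p, byte,
// pat, and n are illustrative names):
//	if (len < 16) {						// too short to bother aligning
//		while (len--) *p++ = byte;
//	} else {
//		while ((uintptr_t)p & 3) { *p++ = byte; len--; }	// byte-align ptr
//		for (size_t n = len >> 2; n; n--, p += 4) *(uint32_t *)p = pat;
//		for (len &= 3; len; len--) *p++ = byte;			// trailing bytes
//	}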
cmpl $16,%edx // long enough to word align?
jge 3f // yes
test %edx,%edx // length==0?
jz 6f
1:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
jnz 1b
jmp 6f
2:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%edx
3:
test $3,%edi // is ptr doubleword aligned?
jnz 2b // no
movl %edx,%ecx // copy length
shrl $2,%edx // #doublewords to store
4:
movl %eax,(%rdi) // store aligned doubleword
addq $4,%rdi
subl $1,%edx
jnz 4b
andl $3,%ecx // any leftover bytes?
jz 6f // no
5:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 5b
6:
movq %r8,%rax // get return value (ie, original ptr)
ret
// Nonzero memset() is long enough to call commpage.
// %eax = replicated 4-byte pattern
// %rdi = ptr
// %rdx = length (> kShort)
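// The number of bytes needed to 16-byte align the pointer is computed below as
// (illustrative C): nalign = (-(uintptr_t)p) & 15; those bytes are stored one
// at a time, then the aligned remainder is handed to Lmemset_pattern.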
LCallCommpage:
movd %eax,%xmm0 // move %eax to low 4 bytes of %xmm0
pshufd $(0x00),%xmm0,%xmm0 // replicate across the vector
movq %rdi,%rcx // copy dest ptr
negl %ecx
andl $15,%ecx // get #bytes to align ptr
jz 2f // skip if already aligned
subq %rcx,%rdx // decrement length
1:
movb %al,(%rdi) // pack in a byte
addq $1,%rdi
subl $1,%ecx
jnz 1b
2: // ptr aligned, length long enough to justify calling Lmemset_pattern
call Lmemset_pattern // call commpage to do the heavy lifting
movq %r8,%rax // get return value (ie, original ptr)
ret
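// The memset_pattern4/8/16 entry points below fill a buffer with a repeating
// 4-, 8-, or 16-byte pattern.  Typical usage (illustrative only; these
// functions are declared in <string.h> on Darwin):
//
//	#include <string.h>
//	#include <stdint.h>
//	static const uint32_t pat = 0xDEADBEEF;
//	uint32_t buf[256];
//	memset_pattern4(buf, &pat, sizeof(buf));	// every buf[i] becomes 0xDEADBEEF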
// Handle memset of a 16-byte pattern.
.globl _memset_pattern16
.align 2, 0x90
_memset_pattern16: // void memset_pattern16(void *b, const void *c16, size_t len)
movdqu (%rsi),%xmm0 // load the (unaligned) 16-byte pattern
jmp LAlignPtr
// Handle memset of an 8-byte pattern.
.globl _memset_pattern8
.align 2, 0x90
_memset_pattern8: // void memset_pattern8(void *b, const void *c8, size_t len)
movq (%rsi),%xmm0 // load pattern into low 8 bytes
punpcklqdq %xmm0,%xmm0 // replicate into all 16
jmp LAlignPtr
// Handle memset of a 4-byte pattern.
.globl _memset_pattern4
.align 2, 0x90
_memset_pattern4: // void memset_pattern4(void *b, const void *c4, size_t len)
movd (%rsi),%xmm0 // load pattern into low 4 bytes
pshufd $(0x00),%xmm0,%xmm0 // replicate the 4 bytes across the vector
// (falls through to LAlignPtr)
// Align ptr if necessary. We must rotate the pattern right for each byte we
// store while aligning the ptr. Since there is no rotate instruction in SSE3,
// we have to synthesize the rotates.
// %rdi = ptr
// %rdx = length
// %xmm0 = pattern
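// For illustration, a right rotate of the 16-byte pattern by one byte can be
// synthesized from two byte-shifts and an OR, roughly like this SSE2
// intrinsics sketch (requires <emmintrin.h>; the code below does the same with
// psrldq/pslldq/por, and also recycles the low pattern bytes through %eax):
//
//	__m128i rotate_right_1(__m128i v)			// illustrative name
//	{
//		return _mm_or_si128(_mm_srli_si128(v, 1),	// shift right 1 byte
//				    _mm_slli_si128(v, 15));	// wrap low byte to top
//	}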
LAlignPtr: // NB: can drop down to here!
cmpq $100,%rdx // long enough to bother aligning ptr?
movq %rdi,%rcx // copy ptr
jb LReady // not long enough
negl %ecx
andl $15,%ecx // get #bytes to align ptr
jz LReady // already aligned
subq %rcx,%rdx // adjust length
test $1,%cl // 1-byte store required?
movd %xmm0,%eax // get 4 low bytes in %eax
jz 2f // no
movdqa %xmm0,%xmm1 // copy pattern so we can shift in both directions
movb %al,(%rdi) // pack in the low-order byte
psrldq $1,%xmm0 // shift pattern right 1 byte
addq $1,%rdi
pslldq $15,%xmm1 // shift pattern left 15 bytes
shrl $8,%eax // in case 2-byte store is required
por %xmm1,%xmm0 // complete right rotate of pattern by 1 byte
2:
test $2,%cl // 2-byte store required?
jz 4f // no
psrldq $2,%xmm0 // shift pattern down 2 bytes
movw %ax,(%rdi) // pack in next two bytes
pinsrw $7,%eax,%xmm0 // insert low word of %eax into high word of %xmm0
addq $2,%rdi // adjust ptr
4:
test $4,%cl // 4-byte store required?
jz 8f // no
movd %xmm0,(%rdi) // store low 4 bytes of %xmm0
pshufd $(0x39),%xmm0,%xmm0 // rotate %xmm0 right 4 bytes (mask == 00 11 10 01)
addq $4,%rdi // adjust ptr
8:
test $8,%cl // 8-byte store required?
jz LReady // no
movq %xmm0,(%rdi) // store low 8 bytes of %xmm0
pshufd $(0x4e),%xmm0,%xmm0 // rotate %xmm0 right 8 bytes (mask == 01 00 11 10)
addq $8,%rdi // adjust ptr
// Ptr is aligned if practical, we're ready to call commpage to do the heavy lifting.
LReady:
call Lmemset_pattern // call commpage to do the heavy lifting
ret
#define kLShort 63
#define kVeryLong (1024*1024)
Lmemset_pattern:
cmpq $(kLShort),%rdx // long enough to bother aligning?
ja LNotShort // yes
jmp LShort // no
// Here for short operands or the end of long ones.
// %rdx = length (<= kLShort)
// %rdi = ptr (may not be aligned)
// %xmm0 = pattern
LUnalignedStore16:
movdqu %xmm0,(%rdi) // stuff in another 16 bytes
subl $16,%edx
addq $16,%rdi
LShort:
cmpl $16,%edx // room for another vector?
jge LUnalignedStore16 // yes
LLessThan16: // here at end of copy with < 16 bytes remaining
test $8,%dl // 8-byte store required?
jz 2f // no
movq %xmm0,(%rdi) // pack in 8 low bytes
psrldq $8,%xmm0 // then shift vector down 8 bytes
addq $8,%rdi
2:
test $4,%dl // 4-byte store required?
jz 3f // no
movd %xmm0,(%rdi) // pack in 4 low bytes
psrldq $4,%xmm0 // then shift vector down 4 bytes
addq $4,%rdi
3:
andl $3,%edx // more to go?
jz 5f // no
movd %xmm0,%eax // move remainders out into %eax
4: // loop on up to three bytes
movb %al,(%rdi) // pack in next byte
shrl $8,%eax // shift next byte into position
incq %rdi
dec %edx
jnz 4b
5: ret
// Long enough to justify aligning ptr. Note that we have to rotate the
// pattern to account for any alignment. We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
// %rdx = length (> kLShort)
// %rdi = ptr (may not be aligned)
// %xmm0 = pattern
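// For illustration, the trick described above amounts to this C sketch (names
// are illustrative; 'pat' is the current 16-byte pattern, p the dest ptr):
//
//	memcpy(p, &pat, 16);				// two unaligned 16-byte stores
//	memcpy(p + 16, &pat, 16);			// guarantee one aligned 16-byte chunk
//	size_t skip = (-(uintptr_t)p) & 15;		// bytes until 16-byte alignment
//	__m128i rotated = _mm_load_si128((const __m128i *)(p + skip));	// aligned reload
//							// yields the pattern rotated by 'skip'
//
// The fill then continues from p + skip with the rotated pattern.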
LNotShort:
movl %edi,%ecx // copy low bits of dest ptr
negl %ecx
andl $15,%ecx // mask down to #bytes to 16-byte align
jz LAligned // skip if already aligned
movdqu %xmm0,(%rdi) // store 16 unaligned bytes
movdqu %xmm0,16(%rdi) // and 16 more, to be sure we have an aligned chunk
addq %rcx,%rdi // now point to the aligned chunk
subq %rcx,%rdx // adjust remaining count
movdqa (%rdi),%xmm0 // get the rotated pattern (probably stalling)
addq $16,%rdi // skip past the aligned chunk
subq $16,%rdx
// Set up for 64-byte loops.
// %rdx = length remaining
// %rdi = ptr (aligned)
// %xmm0 = rotated pattern
LAligned:
movq %rdx,%rcx // copy length remaining
andl $63,%edx // mask down to residual length (0..63)
andq $-64,%rcx // %rcx <- #bytes we will store in by-64 loop
jz LNoMoreChunks // no 64-byte chunks
addq %rcx,%rdi // increment ptr by length to move
cmpq $(kVeryLong),%rcx // long enough to justify non-temporal stores?
jge LVeryLong // yes
negq %rcx // negate length to move
jmp 1f
// Loop over 64-byte chunks, storing into cache.
.align 4,0x90 // keep inner loops 16-byte aligned
1:
movdqa %xmm0,(%rdi,%rcx)
movdqa %xmm0,16(%rdi,%rcx)
movdqa %xmm0,32(%rdi,%rcx)
movdqa %xmm0,48(%rdi,%rcx)
addq $64,%rcx
jne 1b
jmp LNoMoreChunks
// Very long operands: use non-temporal stores to bypass cache.
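// In intrinsics terms the loop below is roughly (illustrative sketch only):
//
//	for (size_t i = 0; i < chunk_bytes; i += 16)
//		_mm_stream_si128((__m128i *)(p + i), pat);	// non-temporal store
//	_mm_sfence();			// order the streamed stores before returning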
LVeryLong:
negq %rcx // negate length to move
jmp 1f
.align 4,0x90 // keep inner loops 16-byte aligned
1:
movntdq %xmm0,(%rdi,%rcx)
movntdq %xmm0,16(%rdi,%rcx)
movntdq %xmm0,32(%rdi,%rcx)
movntdq %xmm0,48(%rdi,%rcx)
addq $64,%rcx
jne 1b
sfence // required by non-temporal stores
jmp LNoMoreChunks
// Handle leftovers: loop by 16.
// %edx = length remaining (<64)
// %edi = ptr (aligned)
// %xmm0 = rotated pattern
LLoopBy16:
movdqa %xmm0,(%rdi) // pack in 16 more bytes
subl $16,%edx // decrement count
addq $16,%rdi // increment ptr
LNoMoreChunks:
cmpl $16,%edx // more to go?
jge LLoopBy16 // yes
jmp LLessThan16 // handle up to 15 remaining bytes