/* * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* ======================================= * BCOPY, MEMCPY, and MEMMOVE for Mac OS X * ======================================= * * Version of 2/20/2003, for a hypothetic 64-bit processor without Altivec. * This version might be used bringing up new processors, with known * Altivec bugs that need to be worked around. It is not particularly well * optimized. * * For 64-bit processors with a 128-byte cache line, running in either * 32- or 64-bit mode. This is written for 32-bit execution, the kernel * will translate to 64-bit code when it compiles the 64-bit commpage. * * Register usage. Note we use R2, so this code will not run in a PEF/CFM * environment. * r0 = "w7" or temp * r2 = "w8" * r3 = not used, as memcpy and memmove return 1st parameter as a value * r4 = source ptr ("rs") * r5 = count of bytes to move ("rc") * r6 = "w1" * r7 = "w2" * r8 = "w3" * r9 = "w4" * r10 = "w5" * r11 = "w6" * r12 = destination ptr ("rd") */ #define rs r4 #define rd r12 #define rc r5 #define rv r2 #define w1 r6 #define w2 r7 #define w3 r8 #define w4 r9 #define w5 r10 #define w6 r11 #define w7 r0 #define w8 r2 #define ASSEMBLER #include <sys/appleapiopts.h> #include <ppc/asm.h> #include <machine/cpu_capabilities.h> #include <machine/commpage.h> .text #define kLong 64 // too long for inline loopless code // Main entry points. .align 5 bcopy_64: // void bcopy(const void *src, void *dst, size_t len) cmplwi rc,kLong // short or long? sub w1,r4,r3 // must move in reverse if (rd-rs)<rc cmplw cr1,w1,rc // set cr1 blt iff we must move reverse mr rd,r4 // start to move registers to canonic spot mr rs,r3 blt LShort // handle short operands dcbt 0,r3 // touch in destination b LLong // join medium/long operand code // NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses. .align 5 Lmemcpy_g4: // void* memcpy(void *dst, void *src, size_t len) Lmemmove_g4: // void* memmove(void *dst, const void *src, size_t len) cmplwi rc,kLong // short or long? sub w1,r3,r4 // must move in reverse if (rd-rs)<rc dcbt 0,r4 // touch in the first line of source cmplw cr1,w1,rc // set cr1 blt iff we must move reverse mr rd,r3 // must leave r3 alone, it is return value for memcpy etc bge LLong // handle medium or long operands // Handle short operands. LShort: mtcrf 0x02,rc // put length bits 26-27 in cr6 (faster one cr at a time) mtcrf 0x01,rc // put length bits 28-31 in cr7 blt cr1,LShortReverse // Forward short operands. This is the most frequent case, so it is inline. LShort64: // enter to xfer last 64 bytes bf 26,0f // 64-byte chunk to xfer? ld w1,0(rs) ld w2,8(rs) ld w3,16(rs) ld w4,24(rs) addi rs,rs,32 std w1,0(rd) std w2,8(rd) std w3,16(rd) std w4,24(rd) addi rd,rd,32 0: bf 27,1f // quadword to move? ld w1,0(rs) ld w2,8(rs) addi rs,rs,16 std w1,0(rd) std w2,8(rd) addi rd,rd,16 1: bf 28,2f // doubleword? ld w1,0(rs) addi rs,rs,8 std w1,0(rd) addi rd,rd,8 2: bf 29,3f // word? lwz w1,0(rs) addi rs,rs,4 stw w1,0(rd) addi rd,rd,4 3: bf 30,4f // halfword to move? lhz w1,0(rs) addi rs,rs,2 sth w1,0(rd) addi rd,rd,2 4: bflr 31 // skip if no odd byte lbz w1,0(rs) stb w1,0(rd) blr // Handle short reverse operands. // cr6 = bits 26-27 of length // cr7 = bits 28-31 of length LShortReverse: add rs,rs,rc // adjust ptrs for reverse move add rd,rd,rc LShortReverse64: // enter to xfer last 64 bytes bf 26,0f // 64-byte chunk to xfer? ld w1,-8(rs) ld w2,-16(rs) ld w3,-24(rs) ldu w4,-32(rs) std w1,-8(rd) std w2,-16(rd) std w3,-24(rd) stdu w4,-32(rd) 0: bf 27,1f // quadword to move? ld w1,-8(rs) ldu w2,-16(rs) std w1,-8(rd) stdu w2,-16(rd) 1: bf 28,2f // doubleword? ldu w1,-8(rs) stdu w1,-8(rd) 2: bf 29,3f // word? lwzu w1,-4(rs) stwu w1,-4(rd) 3: bf 30,4f // halfword to move? lhzu w1,-2(rs) sthu w1,-2(rd) 4: bflr 31 // done if no odd byte lbz w1,-1(rs) // no update stb w1,-1(rd) blr // Long operands. // cr1 = blt iff we must move reverse .align 4 LLong: dcbtst 0,rd // touch in destination neg w3,rd // start to compute #bytes to align destination andi. w6,w3,7 // w6 <- #bytes to 8-byte align destination blt cr1,LLongReverse // handle reverse moves mtctr w6 // set up for loop to align destination sub rc,rc,w6 // adjust count beq LAligned // destination already 8-byte aligned 1: lbz w1,0(rs) addi rs,rs,1 stb w1,0(rd) addi rd,rd,1 bdnz 1b // Destination is 8-byte aligned. LAligned: srwi. w2,rc,6 // w2 <- count of 64-byte chunks mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time) mtcrf 0x01,rc // put length bits 28-31 in cr7 beq LShort64 // no 64-byte chunks mtctr w2 b 1f // Loop moving 64-byte chunks. .align 5 1: ld w1,0(rs) ld w2,8(rs) ld w3,16(rs) ld w4,24(rs) ld w5,32(rs) ld w6,40(rs) ld w7,48(rs) ld w8,56(rs) addi rs,rs,64 std w1,0(rd) std w2,8(rd) std w3,16(rd) std w4,24(rd) std w5,32(rd) std w6,40(rd) std w7,48(rd) std w8,56(rd) addi rd,rd,64 bdnz 1b b LShort64 // Handle reverse moves. LLongReverse: add rd,rd,rc // point to end of operands add rs,rs,rc andi. r0,rd,7 // is destination 8-byte aligned? sub rc,rc,r0 // adjust count mtctr r0 // set up for byte loop beq LRevAligned // already aligned 1: lbzu w1,-1(rs) stbu w1,-1(rd) bdnz 1b // Destination is 8-byte aligned. LRevAligned: srwi. w2,rc,6 // w2 <- count of 64-byte chunks mtcrf 0x02,rc // leftover byte count to cr (faster one cr at a time) mtcrf 0x01,rc // put length bits 28-31 in cr7 beq LShortReverse64 // no 64-byte chunks mtctr w2 b 1f // Loop over 64-byte chunks (reverse). .align 5 1: ld w1,-8(rs) ld w2,-16(rs) ld w3,-24(rs) ld w4,-32(rs) ld w5,-40(rs) ld w6,-48(rs) ld w7,-56(rs) ldu w8,-64(rs) std w1,-8(rd) std w2,-16(rd) std w3,-24(rd) std w4,-32(rd) std w5,-40(rd) std w6,-48(rd) std w7,-56(rd) stdu w8,-64(rd) bdnz 1b b LShortReverse64 COMMPAGE_DESCRIPTOR(bcopy_64,_COMM_PAGE_BCOPY,k64Bit,kHasAltivec,kCommPageBoth+kPort32to64)