/* lz4_encode_armv7.s */
/*
* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. The rights granted to you under the License
* may not be used to create, or enable the creation or redistribution of,
* unlawful or unlicensed copies of an Apple operating system, or to
* circumvent, violate, or enable the circumvention or violation of, any
* terms of an Apple operating system software license agreement.
*
* Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7
/* void lz4_encode_2gb(uint8_t ** dst_ptr,
size_t dst_size,
const uint8_t ** src_ptr,
const uint8_t * src_begin,
size_t src_size,
lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
int skip_final_literals) */
.globl _lz4_encode_2gb
.syntax unified
// Symbolic register names used throughout the encoder. These are plain
// preprocessor aliases (the file is run through the C preprocessor), so the
// body below reads in terms of roles rather than raw register numbers.
#define dst_ptr r0 // current write cursor in the destination buffer
#define dst_end r1 // end of destination buffer, less margin
#define src_ptr r2 // current read cursor in the source buffer
#define src_beg r3 // start of the source buffer
#define src_end r4 // end of the source buffer, less margin
#define table r5 // base address of the hash table
#define mch_ptr r6 // pointer to the current match candidate
#define mch_dis r8 // match distance ("D")
#define mch_len r9 // match length less four ("M")
#define mch_ref r10 // pointer to the match reference
#define margin 128 // safety margin (bytes) allowing 16B vector loads/stores to overrun
.macro establish_frame
// Prologue: build the stack frame, load the stacked arguments, and compute
// the margin-adjusted buffer bounds.
// On entry (AAPCS): r0 = dst_ptr (uint8_t **), r1 = dst_size,
//                   r2 = src_ptr (const uint8_t **), r3 = src_begin,
//                   [sp,#0] = src_size, [sp,#4] = hash_table,
//                   [sp,#8] = skip_final_literals.
push {r4-r7, lr} // 5 words pushed
add r7, sp, #12 // r7 = frame pointer (points at saved r7)
push {r8-r11} // 4 more words; 9 words (36 bytes) pushed so far
ldrd r4, r5, [sp, #36] // r4 = src_size, r5 = hash_table (first two stacked args)
push {r0, r2} // save &dst_ptr and &src_ptr for writeback in the epilogue
ldr dst_ptr, [r0] // dst_ptr = *dst_ptr_arg
ldr src_ptr, [r2] // src_ptr = *src_ptr_arg
subs r1, r1, margin // subtract safety margin from dst_size
bls L_done // destination too small to encode anything safely
add dst_end, dst_ptr, r1 // dst end - margin
sub r4, r4, margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
add src_end, src_ptr, r4 // src end - margin.
vmov.i8 q1, #255 // vector of all 1s used to emit
.endm
.macro clear_frame_and_return
// Epilogue: write the final dst/src cursors back through the caller's
// pointers (saved by establish_frame), restore callee-saved registers,
// and return.
pop {r1, r3} // r1 = &dst_ptr, r3 = &src_ptr
str dst_ptr, [r1] // *dst_ptr_arg = dst_ptr
str src_ptr, [r3] // *src_ptr_arg = src_ptr
pop {r8-r11}
pop {r4-r7, pc} // restore r4-r7 and return
.endm
.p2align 4
_lz4_encode_2gb:
establish_frame
L_next_match:
// Start searching for the next match, starting at the end of the last one.
// [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
// within the search loop itself]. Load the hash magic number in lr.
// mch_len is zeroed when a match is found (its length is then initially
// four, but we work with the match length minus four at all times).
ldr lr, L_hash
sub mch_ptr, src_ptr, #1
L_match_candidate_loop:
// If mch_ptr >= src_end, we are near the end of the source buffer (remember
// that we subtracted margin from src_end, so we are *not* actually past the
// end just yet).
cmp mch_ptr, src_end
bhs L_trailing_literal
// Load the four-byte word starting at mch_ptr, and get the address of the
// corresponding row of the hash table.
ldr r9, [mch_ptr, #1]!
sub r8, mch_ptr, src_beg
mul r12, r9, lr
mvn r10, #0x7
and r12, r10, r12, lsr #17 // byte offset of table entry.
// Load offset and word from hash table row, then update with the new offset
// and word that we just computed.
ldrd r10,r11, [table, r12]
strd r8, r9, [table, r12]
// At this point, we only know that the hashes of the words match; the
// words themselves may differ (hash collision). Check that they match.
cmp r9, r11
bne L_match_candidate_loop
// It's not enough for the words to match; the match must also be no more
// than 0xffff bytes behind the current position to be encodable.
sub mch_dis, r8, r10
add mch_ref, src_beg, r10
cmp mch_dis, #0x10000
bhs L_match_candidate_loop
// We have found a match.
//
// register symbolic name meaning
// r0 dst_ptr pointer into destination buffer where the
// match information will be stored.
// r1 dst_end pointer to the end of the destination buffer,
// less margin.
// r2 src_ptr pointer to the byte after the last match that
// we found, or to the point from which we
// started searching if this is the first match.
// r3 src_beg pointer to the actual start of the buffer.
// r4 src_end pointer to the end of the source buffer, less
// margin.
// r5 table address of hash table.
// r6 mch_ptr pointer to match.
// r8 mch_dis match distance ("D")
// r9 mch_len length of match less four ("M")
// r10 mch_ref pointer to match reference.
// r11 -
// r12 -
// lr -
//
// Attempt to grow the match backwards (typically we only grow backwards by
// a byte or two, if at all, so we use a byte-by-byte scan).
eor mch_len, mch_len
0:cmp mch_ref, src_beg
cmpne mch_ptr, src_ptr
beq 1f
ldrb r11, [mch_ref, #-1]
ldrb r12, [mch_ptr, #-1]
cmp r11, r12
bne 1f
sub mch_ref, #1
sub mch_ptr, #1
add mch_len, #1
b 0b
// Now that we have the start of the match, we can compute the literal
// length. Then advance the mch and ref pointers to the end of the match
// and its reference. Because mch_len is the real match length minus four,
// we actually advance to four before the end of the match, but our loop
// to grow the matches uses pre-incremented loads with writeback, so this
// works out correctly.
#define lit_len lr
1:sub lit_len, mch_ptr, src_ptr
add mch_ptr, mch_len
add mch_ref, mch_len
// Now attempt to grow the match forwards. This is much more common, and
// there is a safety margin at the end of the buffer, so we grow forwards
// in four-byte chunks.
0:ldr r11, [mch_ptr, #4]!
ldr r12, [mch_ref, #4]!
eors r11, r12
bne 1f
add mch_len, #4
cmp mch_ptr, src_end
blo 0b
b L_emit_match
// At least one of the bytes in the last comparison did not match. Identify
// which byte had the mismatch and compute the final length (less four).
1:rev r11, r11
clz r11, r11
add mch_len, r11, lsr #3
L_emit_match:
// Time to emit what we've found!
//
// register symbolic name meaning
// r0 dst_ptr pointer into destination buffer where the
// match information will be stored.
// r1 dst_end pointer to the end of the destination buffer,
// less margin.
// r2 src_ptr pointer to the byte after the last match that
// we found, or to the point from which we
// started searching if this is the first match.
// r3 src_beg pointer to the actual start of the buffer.
// r4 src_end pointer to the end of the source buffer, less
// margin.
// r5 table address of hash table.
// r6 -
// r8 mch_dis match distance ("D")
// r9 mch_len length of match ("M")
// r10 -
// r11 -
// r12 -
// lr lit_len literal length ("L")
// q1 vector of all ones
//
// Synthesize control byte under the assumption that L and M are both less
// than 15, jumping out of the fast path if one of them is not.
cmp lit_len, #15
orr r10, mch_len, lit_len, lsl #4
cmplo mch_len, #15
bhs L_emit_careful
// L and M are both less than 15, which means (a) we use the most compact
// encoding for the match and (b) we do not need to do a bounds check on
// the destination buffer before writing, only before continuing our search.
// Store the command byte.
strb r10, [dst_ptr], #1
// Copy literal.
vld1.8 q0, [src_ptr]
add src_ptr, lit_len
vst1.8 q0, [dst_ptr]
add dst_ptr, lit_len
// Restore "true" match length before updating src_ptr.
add mch_len, #4
// Store match distance (D) and update the source pointer.
strh r8, [dst_ptr], #2
add src_ptr, mch_len
// If we're not into the safety margin of the destination buffer, look for
// another match.
cmp dst_ptr, dst_end
blo L_next_match
// If we *are* into the safety margin of the destination buffer, we're done
// encoding this block.
L_done:
clear_frame_and_return
// Constant island
L_hash: .long 2654435761
L_magic: .long 0x80808081
L_emit_careful:
// Either L or M is >= 15, which means that we don't get to use the compact
// encoding, and that we need to do extra bounds checks while writing.
//
// register symbolic name meaning
// r0 dst_ptr pointer into destination buffer where the
// match information will be stored.
// r1 dst_end pointer to the end of the destination buffer,
// less margin.
// r2 src_ptr pointer to the byte after the last match that
// we found, or to the point from which we
// started searching if this is the first match.
// r3 src_beg pointer to the actual start of the buffer.
// r4 src_end pointer to the end of the source buffer, less
// margin.
// r5 table address of hash table.
// r6 -
// r8 mch_dis match distance ("D")
// r9 mch_len length of match ("M") less four
// r10 -
// r11 -
// r12 -
// lr lit_len literal length ("L")
// q1 vector of all ones
//
// Start by creating the low 4 bits of the control word: M if M is less
// than 15, 0xf otherwise. Also load the magic number used for the
// division by 255 below.
ldr r12, L_magic
cmp mch_len, #15
mov r10, mch_len
movhs r10, #0x0f
subs r6, lit_len, #15
bhs L_large_L
// M is large, but L is < 15. This means we can use the simple approach
// for copying the literal with no bounds checks.
orr r10, lit_len, lsl #4
strb r10, [dst_ptr], #1
// Copy literal.
vld1.8 q0, [src_ptr]
add src_ptr, lit_len
vst1.8 q0, [dst_ptr]
add dst_ptr, lit_len
// Store match distance (D).
strh r8, [dst_ptr], #2
sub r6, mch_len, #15
b L_large_M
L_large_L:
// L is large, M may or may not be. We need to encode extra literal length
// bytes and we need to do bounds checks while store both those byte and the
// literal itself.
orr r10, #0xf0
strb r10, [dst_ptr], #1
// How many extra literal bytes do we need to store? We need to store
// (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
// (L - 15)%255. Get these quantities via magic number multiplication:
// (L - 15)*0x80808081 >> (32 + 7)
umull r10, r11, r6, r12
mov r12, #255
lsr r10, r11, #7 // (L - 15) / 255
mls r11, r10, r12, r6 // (L - 15) % 255
ldr r12, L_magic // may need magic number again for M.
// Compute address dst_ptr will have after storing all 0xff bytes, and
// check that we won't exceed dst_end in doing so.
add r10, dst_ptr, r10
cmp r10, dst_end
bhs L_done
// There's enough space for all the 0xff bytes, so go ahead and store them.
0:vst1.8 q1, [dst_ptr]!
cmp dst_ptr, r10
blo 0b
// Store the (L - 15) % 255 byte.
strb r11, [r10], #1
// Compute the address we'll have reached after storing all literal bytes.
// If that passes dst_end, we're done.
add dst_ptr, r10, lit_len
cmp dst_ptr, dst_end
bhs L_done
// Copy the literal.
0:vld1.8 q0, [src_ptr]!
vst1.8 q0, [r10]!
subs r6, r10, dst_ptr
blo 0b
// Fixup src_ptr, store match distance (D), and check whether or not M is
// bigger than 14. If not, go find the next match.
strh r8, [dst_ptr], #2
sub src_ptr, r6
subs r6, mch_len, #15
bhs L_large_M
// M is small, so we're all done with this match; restore the "true" match
// length, update src_ptr, and look for the next match.
add mch_len, #4
add src_ptr, mch_len
b L_next_match
L_large_M:
// Just like with large L, we split (M - 15) into (M - 15) / 255 and
// (M - 15) % 255 via magic number multiply.
umull r10, r11, r6, r12
mov r12, #255
lsr r10, r11, #7 // (M - 15) / 255
mls r11, r10, r12, r6 // (M - 15) % 255
// Compute address dst_ptr will have after storing all 0xff bytes, and
// check that we won't exceed dst_end in doing so.
add r10, dst_ptr, r10
cmp r10, dst_end
bhs L_done
// There's enough space for all the 0xff bytes, so go ahead and store them.
0:vst1.8 q1, [dst_ptr]!
cmp dst_ptr, r10
blo 0b
// Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
// match.
strb r11, [r10]
add mch_len, #4
add dst_ptr, r10, #1
add src_ptr, mch_len
b L_next_match
L_trailing_literal:
// Check if skip_final_literals is set (third stacked argument; 11 words
// have been pushed since function entry, so it sits at [sp, #44+8]).
ldr r5, [sp, #52]
tst r5, r5
bne L_done
// Emit a trailing literal that covers the remainder of the source buffer,
// if we can do so without exceeding the bounds of the destination buffer.
add src_end, margin
sub lit_len, src_end, src_ptr
subs r6, lit_len, #15
bhs L_trailing_literal_long
lsl r10, lit_len, #4
strb r10, [dst_ptr], #1
vld1.8 q0, [src_ptr]
mov src_ptr, src_end
vst1.8 q0, [dst_ptr]
add dst_ptr, lit_len
b L_done
L_trailing_literal_long:
ldr r12, L_magic
mov r10, #0xf0
add dst_end, margin
strb r10, [dst_ptr], #1
umull r10, r11, r6, r12
mov r12, #255
lsr r10, r11, #7 // (L - 15) / 255
mls r11, r10, r12, r6 // (L - 15) % 255
// We want to write out lit_len + (L - 15)/255 + 1 bytes. Check if we have
// space for all of them.
add r10, dst_ptr
add r12, r10, lit_len
cmp r12, dst_end
bhs L_done
// We have enough space, so go ahead and write them all out. Because we
// know that we have enough space, and that the literal is at least 15 bytes,
// we can write the block of 0xffs using vector stores, even without a
// safety margin.
0:vst1.8 q1, [dst_ptr]!
cmp dst_ptr, r10
blo 0b
// Store the (L - 15) % 255 byte.
strb r11, [r10], #1
mov dst_ptr, r10
// Now store the literal itself; take care not to write past the end of the
// destination buffer or read past the end of the source buffer.
subs lit_len, #16
blo 1f
0:vld1.8 q0, [src_ptr]!
subs lit_len, #16
vst1.8 q0, [dst_ptr]!
bhs 0b
1:adds lit_len, #16
beq L_done
2:ldrb r6, [src_ptr], #1
subs lit_len, #1
strb r6, [dst_ptr], #1
bne 2b
b L_done
#endif