lz4_encode_armv7.s   [plain text]


/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7

/* void lz4_encode_2gb(uint8_t ** dst_ptr,
                       size_t dst_size,
                       const uint8_t ** src_ptr,
                       const uint8_t * src_begin,
                       size_t src_size,
                       lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
                       int skip_final_literals)                               */

.globl _lz4_encode_2gb
.syntax unified

#define dst_ptr r0
#define dst_end r1
#define src_ptr r2
#define src_beg r3
#define src_end r4
#define table   r5
#define mch_ptr r6
#define mch_dis r8
#define mch_len r9
#define mch_ref r10

#define margin  128

.macro establish_frame
  push   {r4-r7, lr}
  add     r7,       sp, #12
  push   {r8-r11}
  ldrd    r4, r5,  [sp, #36]
  push   {r0, r2}
  ldr     dst_ptr, [r0]
  ldr     src_ptr, [r2]
  subs    r1,       r1,  margin // subtract safety margin from dst_size
  bls     L_done
  add     dst_end,  dst_ptr, r1 // dst end - margin
  sub     r4,       r4,  margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
  add     src_end,  src_ptr, r4 // src end - margin.
  vmov.i8 q1,       #255        // vector of all 1s used to emit
.endm

.macro clear_frame_and_return
  pop    {r1, r3}
  str     dst_ptr, [r1]
  str     src_ptr, [r3]
  pop    {r8-r11}
  pop    {r4-r7, pc}
.endm

.p2align 4
_lz4_encode_2gb:
  establish_frame
L_next_match:
  //  Start searching for the next match, starting at the end of the last one.
  //  [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
  //  within the search loop itself].  Load the hash magic number in lr, and
  //  zero out mch_len (when we find a match, its length will initially be
  //  four, but we actually work with the match length minus four at all times).
  ldr     lr,       L_hash
  sub     mch_ptr,  src_ptr, #1

L_match_candidate_loop:
  //  If mch_ptr >= src_end, we are near the end of the source buffer (remember
  //  that we subtracted margin from src_end, so we are *not* actually past the
  //  end just yet).
  cmp     mch_ptr,  src_end
  bhs     L_trailing_literal

  //  Load the four-byte word starting at mch_ptr, and get the address of the
  //  corresponding row of the hash table.
  ldr     r9,      [mch_ptr, #1]!
  sub     r8,       mch_ptr, src_beg
  mul     r12,      r9, lr
  mvn     r10,      #0x7
  and     r12,      r10, r12, lsr #17 // byte offset of table entry.

  //  Load offset and word from hash table row, then update with the new offset
  //  and word that we just computed.
  ldrd    r10,r11, [table, r12]
  strd    r8, r9,  [table, r12]

  //  At this point, we only know that the hashes of the words match; check to
  //  see if the words themselves match.  If not, move on to the next candidate.
  cmp     r9,       r11
  bne     L_match_candidate_loop

  //  It's not enough for the words to match; the match distance must also be
  //  in the representable range (i.e. less than 0x10000).
  sub     mch_dis,  r8, r10
  add     mch_ref,  src_beg, r10
  cmp     mch_dis,  #0x10000
  bhs     L_match_candidate_loop

  //  We have found a match; registers at this point are as follows:
  //
  //   register   symbolic name   meaning
  //      r0         dst_ptr      pointer into destination buffer where the
  //                              match information will be stored.
  //      r1         dst_end      pointer to the end of the destination buffer,
  //                              less margin.
  //      r2         src_ptr      pointer to the byte after the last match that
  //                              we found, or to the point from which we
  //                              started searching if this is the first match.
  //      r3         src_beg      pointer to the actual start of the buffer.
  //      r4         src_end      pointer to the end of the source buffer, less
  //                              margin.
  //      r5         table        address of hash table.
  //      r6         mch_ptr      pointer to match.
  //      r8         mch_dis      match distance ("D")
  //      r9         mch_len      length of match less four ("M")
  //      r10        mch_ref      pointer to match reference.
  //      r11        -
  //      r12        -
  //      lr         -
  //
  //  Attempt to grow the match backwards (typically we only grow backwards by
  //  a byte or two, if at all, so we use a byte-by-byte scan).
  eor     mch_len,  mch_len
0:cmp     mch_ref,  src_beg
  cmpne   mch_ptr,  src_ptr
  beq     1f
  ldrb    r11,     [mch_ref, #-1]
  ldrb    r12,     [mch_ptr, #-1]
  cmp     r11,      r12
  bne     1f
  sub     mch_ref,  #1
  sub     mch_ptr,  #1
  add     mch_len,  #1
  b       0b

  //  Now that we have the start of the match, we can compute the literal
  //  length.  Then advance the mch and ref pointers to the end of the match
  //  and its reference.  Because mch_len is the real match length minus four,
  //  we actually advance to four before the end of the match, but our loop
  //  to grow the matches uses pre-incremented loads with writeback, so this
  //  works out correctly.
#define lit_len lr
1:sub     lit_len,  mch_ptr, src_ptr
  add     mch_ptr,  mch_len
  add     mch_ref,  mch_len

  //  Now attempt to grow the match forwards.  This is much more common, and
  //  there is a safety margin at the end of the buffer, so we grow forwards
  //  in four-byte chunks.
0:ldr     r11,     [mch_ptr, #4]!
  ldr     r12,     [mch_ref, #4]!
  eors    r11,      r12
  bne     1f
  add     mch_len,  #4
  cmp     mch_ptr,  src_end
  blo     0b
  b       L_emit_match
  //  At least one of the bytes in the last comparison did not match.  Identify
  //  which byte had the mismatch and compute the final length (less four).
1:rev     r11,      r11
  clz     r11,      r11
  add     mch_len,  r11, lsr #3

L_emit_match:
  //  Time to emit what we've found!
  //
  //   register   symbolic name   meaning
  //      r0         dst_ptr      pointer into destination buffer where the
  //                              match information will be stored.
  //      r1         dst_end      pointer to the end of the destination buffer,
  //                              less margin.
  //      r2         src_ptr      pointer to the byte after the last match that
  //                              we found, or to the point from which we
  //                              started searching if this is the first match.
  //      r3         src_beg      pointer to the actual start of the buffer.
  //      r4         src_end      pointer to the end of the source buffer, less
  //                              margin.
  //      r5         table        address of hash table.
  //      r6         -
  //      r8         mch_dis      match distance ("D")
  //      r9         mch_len      length of match ("M")
  //      r10        -
  //      r11        -
  //      r12        -
  //      lr         lit_len      literal length ("L")
  //      q1                      vector of all ones
  //
  //  Synthesize control byte under the assumption that L and M are both less
  //  than 15, jumping out of the fast path if one of them is not.
  cmp     lit_len,  #15
  orr     r10,      mch_len, lit_len, lsl #4
  cmplo   mch_len,  #15
  bhs     L_emit_careful
  //  L and M are both less than 15, which means (a) we use the most compact
  //  encoding for the match and (b) we do not need to do a bounds check on
  //  the destination buffer before writing, only before continuing our search.
  //  Store the command byte.
  strb    r10,     [dst_ptr], #1
  //  Copy literal.
  vld1.8  q0,      [src_ptr]
  add     src_ptr,  lit_len
  vst1.8  q0,      [dst_ptr]
  add     dst_ptr,  lit_len
  //  Restore "true" match length before updating src_ptr.
  add     mch_len,  #4
  //  Store match distance (D) and update the source pointer.
  strh    r8,      [dst_ptr], #2
  add     src_ptr,  mch_len
  //  If we're not into the safety margin of the destination buffer, look for
  //  another match.
  cmp     dst_ptr,  dst_end
  blo     L_next_match
  //  If we *are* into the safety margin of the destination buffer, we're done
  //  encoding this block; update the source and destination pointers and
  //  return.
L_done:
  clear_frame_and_return

//  Constant island
L_hash: .long 2654435761
L_magic: .long 0x80808081

L_emit_careful:
  //  Either L or M is >= 15, which means that we don't get to use the compact
  //  encoding, and that we need to do extra bounds checks while writing.
  //
  //   register   symbolic name   meaning
  //      r0         dst_ptr      pointer into destination buffer where the
  //                              match information will be stored.
  //      r1         dst_end      pointer to the end of the destination buffer,
  //                              less margin.
  //      r2         src_ptr      pointer to the byte after the last match that
  //                              we found, or to the point from which we
  //                              started searching if this is the first match.
  //      r3         src_beg      pointer to the actual start of the buffer.
  //      r4         src_end      pointer to the end of the source buffer, less
  //                              margin.
  //      r5         table        address of hash table.
  //      r6         -
  //      r8         mch_dis      match distance ("D")
  //      r9         mch_len      length of match ("M") less four
  //      r10        -
  //      r11        -
  //      r12        -
  //      lr         lit_len      literal length ("L")
  //      q1                      vector of all ones
  //
  //  Start by creating the low 4 bits of the control word; M if M < 15, 0xf
  //  otherwise.  We also load 0x80808081, which is the magic number for
  //  division by 255; this will be required later on.
  ldr     r12,      L_magic
  cmp     mch_len,  #15
  mov     r10,      mch_len
  movhs   r10,      #0x0f
  subs    r6,       lit_len, #15
  bhs     L_large_L
  //  M is large, but L is < 15.  This means we can use the simple approach
  //  for copying the literal with no bounds checks.
  orr     r10,      lit_len, lsl #4
  strb    r10,     [dst_ptr], #1
  //  Copy literal.
  vld1.8  q0,      [src_ptr]
  add     src_ptr,  lit_len
  vst1.8  q0,      [dst_ptr]
  add     dst_ptr,  lit_len
  //  Store match distance (D).
  strh    r8,      [dst_ptr], #2
  sub     r6,       mch_len, #15
  b       L_large_M

L_large_L:
  //  L is large, M may or may not be.  We need to encode extra literal length
  //  bytes and we need to do bounds checks while store both those byte and the
  //  literal itself.
  orr     r10,      #0xf0
  strb    r10,     [dst_ptr], #1
  //  How many extra literal bytes do we need to store?  We need to store
  //  (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
  //  (L - 15)%255.  Get these quantities via magic number multiplication:
  //  (L - 15)*0x80808081 >> (32 + 7)
  umull   r10, r11, r6, r12
  mov     r12,      #255
  lsr     r10,      r11, #7       // (L - 15) / 255
  mls     r11,      r10, r12, r6  // (L - 15) % 255
  ldr     r12,      L_magic       // may need magic number again for M.
  //  Compute address dst_ptr will have after storing all 0xff bytes, and
  //  check that we won't exceed dst_end in doing so.
  add     r10,      dst_ptr, r10
  cmp     r10,      dst_end
  bhs     L_done
  //  There's enough space for all the 0xff bytes, so go ahead and store them.
0:vst1.8  q1,      [dst_ptr]!
  cmp     dst_ptr,  r10
  blo     0b
  //  Store the (L - 15) % 255 byte.
  strb    r11,     [r10], #1
  //  Compute the address we'll have reached after storing all literal bytes.
  //  If that passes dst_end, we're done.
  add     dst_ptr,  r10, lit_len
  cmp     dst_ptr,  dst_end
  bhs     L_done
  //  Copy the literal.
0:vld1.8  q0,      [src_ptr]!
  vst1.8  q0,      [r10]!
  subs    r6,       r10, dst_ptr
  blo     0b
  //  Fixup src_ptr, store match distance (D), and check whether or not M is
  //  bigger than 14.  If not, go find the next match.
  strh    r8,      [dst_ptr], #2
  sub     src_ptr,  r6
  subs    r6,       mch_len, #15
  bhs     L_large_M
  //  M is small, so we're all done; we just need to update the source pointer
  //  and we can go look for the next match.
  add     mch_len,  #4
  add     src_ptr,  mch_len
  b       L_next_match

L_large_M:
  //  Just like with large L, we split (M - 15) into (M - 15) / 255 and
  //  (M - 15) % 255 via magic number multiply.
  umull   r10, r11, r6, r12
  mov     r12,      #255
  lsr     r10,      r11, #7       // (M - 15) / 255
  mls     r11,      r10, r12, r6  // (M - 15) % 255
  //  Compute address dst_ptr will have after storing all 0xff bytes, and
  //  check that we won't exceed dst_end in doing so.
  add     r10,      dst_ptr, r10
  cmp     r10,      dst_end
  bhs     L_done
  //  There's enough space for all the 0xff bytes, so go ahead and store them.
0:vst1.8  q1,      [dst_ptr]!
  cmp     dst_ptr,  r10
  blo     0b
  //  Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
  //  match.
  strb    r11,     [r10]
  add     mch_len,  #4
  add     dst_ptr,  r10, #1
  add     src_ptr,  mch_len
  b       L_next_match

L_trailing_literal:
  //  Check if skip_final_literals is set.
  ldr     r5,      [sp, #52]
  tst     r5,       r5
  bne     L_done
  //  Emit a trailing literal that covers the remainder of the source buffer,
  //  if we can do so without exceeding the bounds of the destination buffer.
  add     src_end,  margin
  sub     lit_len,  src_end, src_ptr
  subs    r6,       lit_len, #15
  bhs     L_trailing_literal_long
  lsl     r10,      lit_len, #4
  strb    r10,     [dst_ptr], #1
  vld1.8  q0,      [src_ptr]
  mov     src_ptr,  src_end
  vst1.8  q0,      [dst_ptr]
  add     dst_ptr,  lit_len
  b       L_done

L_trailing_literal_long:
  ldr     r12,      L_magic
  mov     r10,      #0xf0
  add     dst_end,  margin
  strb    r10,     [dst_ptr], #1
  umull   r10, r11, r6, r12
  mov     r12,      #255
  lsr     r10,      r11, #7       // (L - 15) / 255
  mls     r11,      r10, r12, r6  // (L - 15) % 255
  //  We want to write out lit_len + (L - 15)/255 + 1 bytes.  Check if we have
  //  space for all of them.
  add     r10,      dst_ptr
  add     r12,      r10, lit_len
  cmp     r12,      dst_end
  bhs     L_done
  //  We have enough space, so go ahead and write them all out.  Because we
  //  know that we have enough space, and that the literal is at least 15 bytes,
  //  we can write the block of 0xffs using vector stores, even without a
  //  safety margin.
0:vst1.8  q1,      [dst_ptr]!
  cmp     dst_ptr,  r10
  blo     0b
  //  Store the (L - 15) % 255 byte.
  strb    r11,     [r10], #1
  mov     dst_ptr,  r10
  //  Now store the literal itself; here we need to actually be somewhat
  //  careful to ensure that we don't write past the end of the destination
  //  buffer or read past the end of the source buffer.
  subs    lit_len,  #16
  blo     1f
0:vld1.8  q0,      [src_ptr]!
  subs    lit_len,  #16
  vst1.8  q0,      [dst_ptr]!
  bhs     0b
1:adds    lit_len,  #16
  beq     L_done
2:ldrb    r6,      [src_ptr], #1
  subs    lit_len,  #1
  strb    r6,      [dst_ptr], #1
  bne     2b
  b       L_done

#endif