prefix_string.c   [plain text]


/* prefix_string.c --- implement strings based on a prefix tree
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

#include <assert.h>
#include "private/svn_string_private.h"

/* A node in the tree represents a common prefix.  The root node is the
 * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
 * a different character (possibly '\0').
 *
 * The nodes in the tree store only up to 8 chars of the respective common
 * prefix, i.e. longer common prefixes must be drawn out over multiple
 * hierarchy levels.  This is a space <-> efficiency trade-off.
 *
 * Strings are the leaf nodes in the tree and use a specialized, smaller
 * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
 * data types can be discerned by the last char in the data buffer.  This
 * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
 * ordinary nodes have a length information so that no terminating 0 is
 * required for them.
 */

/* forward declaration */
typedef struct node_t node_t;

/* String type and tree leaf.
 */
struct svn_prefix_string__t
{
  /* mandatory prefix */
  node_t *prefix;

  /* 0 ..7 chars to add the prefix.
   *
   * NUL-terminated, if this is indeed a tree leaf.  We use the same struct
   * within node_t for inner tree nodes, too.  There, DATA[7] is not NUL,
   * meaning DATA may or may not be NUL terminated.  The actual length is
   * provided by the node_t.length field (minus parent node length). */
  char data[8];
};

/* A node inside the tree, i.e. not a string and not a leaf (unless this is
 * the root node).
 *
 * Note: keep the ordering to minimize size / alignment overhead on 64 bit
 * machines.
 */
struct node_t
{
  /* pointer to the parent prefix plus the 1 .. 8 extra chars.
   * Only the root will provide 0 extra chars. */
  svn_prefix_string__t key;

  /* Length of the prefix from the root down to and including this one.
   * 0 for the root node.  Only then will key.prefix be NULL. */
  apr_uint32_t length;

  /* Number of entries used in SUB_NODES. */
  apr_uint32_t sub_node_count;

  /* The sub-nodes, ordered by first char.  node_t and svn_prefix_string__t
   * may be mixed here.  May be NULL.
   * The number of allocated entries is always a power-of-two and only
   * given implicitly by SUB_NODE_COUNT. */
  struct node_t **sub_nodes;
};

/* The actual tree structure. */
struct svn_prefix_tree__t
{
  /* the common tree root (represents the empty prefix). */
  node_t *root;

  /* all sub-nodes & strings will be allocated from this pool */
  apr_pool_t *pool;
};

/* Return TRUE, iff NODE is a leaf node.
 */
static svn_boolean_t
is_leaf(node_t *node)
{
  /* If this NOT a leaf node and this node has ...
   *    ... 8 chars, data[7] will not be NUL because we only support
   *        NUL-*terminated* strings.
   *    ... less than 8 chars, this will be set to 0xff
   *        (any other non-NUL would do as well but this is not valid UTF8
   *         making it easy to recognize during debugging etc.) */
  return node->key.data[7] == 0;
}

/* Ensure that the sub-nodes array of NODE within TREE has at least one
 * unused entry.  Re-allocate as necessary.
 */
static void
auto_realloc_sub_nodes(svn_prefix_tree__t *tree,
                       node_t *node)
{
  if (node->sub_node_count & (node->sub_node_count - 1))
    return;

  if (node->sub_node_count == 0)
    {
      node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
    }
  else
    {
      node_t **sub_nodes
        = apr_pcalloc(tree->pool,
                      2 * node->sub_node_count * sizeof(*sub_nodes));
      memcpy(sub_nodes, node->sub_nodes,
             node->sub_node_count * sizeof(*sub_nodes));
      node->sub_nodes = sub_nodes;
    }
}

/* Given the COUNT pointers in the SUB_NODES array, return the location at
 * which KEY is either located or would be inserted.
 */
static int
search_lower_bound(node_t **sub_nodes,
                   unsigned char key,
                   int count)
{
  int lower = 0;
  int upper = count - 1;

  /* Binary search for the lowest position at which to insert KEY. */
  while (lower <= upper)
    {
      int current = lower + (upper - lower) / 2;

      if ((unsigned char)sub_nodes[current]->key.data[0] < key)
        lower = current + 1;
      else
        upper = current - 1;
    }

  return lower;
}

svn_prefix_tree__t *
svn_prefix_tree__create(apr_pool_t *pool)
{
  svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
  tree->pool = pool;

  tree->root = apr_pcalloc(pool, sizeof(*tree->root));
  tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */

  return tree;
}

svn_prefix_string__t *
svn_prefix_string__create(svn_prefix_tree__t *tree,
                          const char *s)
{
  svn_prefix_string__t *new_string;
  apr_size_t len = strlen(s);
  node_t *node = tree->root;
  node_t *new_node;
  int idx;

  /* walk the existing tree until we either find S or the node at which S
   * has to be inserted */
  while (TRUE)
    {
      node_t *sub_node;
      int match = 1;

      /* index of the matching sub-node */
      idx = node->sub_node_count
          ? search_lower_bound(node->sub_nodes,
                               (unsigned char)s[node->length],
                               node->sub_node_count)
          : 0;

      /* any (partially) matching sub-nodes? */
      if (idx == (int)node->sub_node_count
          || node->sub_nodes[idx]->key.data[0] != s[node->length])
        break;

      /* Yes, it matches - at least the first character does. */
      sub_node = node->sub_nodes[idx];

      /* fully matching sub-node? */
      if (is_leaf(sub_node))
        {
          if (strcmp(sub_node->key.data, s + node->length) == 0)
            return &sub_node->key;
        }
      else
        {
          /* The string formed by the path from the root down to
           * SUB_NODE differs from S.
           *
           * Is it a prefix?  In that case, the chars added by SUB_NODE
           * must fully match the respective chars in S. */
          apr_size_t sub_node_len = sub_node->length - node->length;
          if (strncmp(sub_node->key.data, s + node->length,
                      sub_node_len) == 0)
            {
              node = sub_node;
              continue;
            }
        }

      /* partial match -> split
       *
       * At this point, S may either be a prefix to the string represented
       * by SUB_NODE, or they may diverge at some offset with
       * SUB_NODE->KEY.DATA .
       *
       * MATCH starts with 1 here b/c we already know that at least one
       * char matches.  Also, the loop will terminate because the strings
       * differ before SUB_NODE->KEY.DATA - either at the NUL terminator
       * of S or some char before that.
       */
      while (sub_node->key.data[match] == s[node->length + match])
        ++match;

      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
      new_node->key = sub_node->key;
      new_node->length = node->length + match;
      new_node->key.data[7] = '\xff';  /* This is not a leaf. See is_leaf(). */
      new_node->sub_node_count = 1;
      new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
      new_node->sub_nodes[0] = sub_node;

      memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);

      /* replace old sub-node with new one and continue lookup */
      sub_node->key.prefix = new_node;
      node->sub_nodes[idx] = new_node;
      node = new_node;
    }

  /* add sub-node(s) and final string */
  while (len - node->length > 7)
    {
      new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
      new_node->key.prefix = node;
      new_node->length = node->length + 8;
      memcpy(new_node->key.data, s + node->length, 8);

      auto_realloc_sub_nodes(tree, node);
      memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
              (node->sub_node_count - idx) * sizeof(node_t *));

      /* replace old sub-node with new one and continue lookup */
      node->sub_nodes[idx] = new_node;
      node->sub_node_count++;
      node = new_node;
      idx = 0;
    }

  new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
  new_string->prefix = node;
  memcpy(new_string->data, s + node->length, len - node->length);

  auto_realloc_sub_nodes(tree, node);
  memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
          (node->sub_node_count - idx) * sizeof(node_t *));

  node->sub_nodes[idx] = (node_t *)new_string;
  node->sub_node_count++;
  return new_string;
}

svn_string_t *
svn_prefix_string__expand(const svn_prefix_string__t *s,
                          apr_pool_t *pool)
{
  apr_size_t s_len = strlen(s->data);
  apr_size_t len = s->prefix->length + s_len;
  char *buffer = apr_palloc(pool, len + 1);

  svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
  result->data = buffer;
  result->len = len;
  buffer[len] = '\0';

  while (s->prefix)
    {
      memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
      len = s->prefix->length;
      s = &s->prefix->key;
    }

  return result;
}

int
svn_prefix_string__compare(const svn_prefix_string__t *lhs,
                           const svn_prefix_string__t *rhs)
{
  const node_t *lhs_parent = lhs->prefix;
  const node_t *rhs_parent = rhs->prefix;

  if (lhs == rhs)
    return 0;

  /* find the common root */
  while (lhs_parent != rhs_parent)
    {
      if (lhs_parent->length <= rhs_parent->length)
        {
          rhs = &rhs_parent->key;
          rhs_parent = rhs_parent->key.prefix;
        }
      else if (rhs_parent->length <= lhs_parent->length)
        {
          lhs = &lhs_parent->key;
          lhs_parent = lhs_parent->key.prefix;
        }

      /* same tree? */
      assert(lhs_parent && rhs_parent);
    }

  /* at the common root, strings will differ in the first follow-up char */
  return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
}