create_hash_table [plain text]

#! /usr/bin/perl -w
#
# Static Hashtable Generator
#
# (c) 2000-2002 by Harri Porten <porten@kde.org> and
#                  David Faure <faure@kde.org>
# Modified (c) 2004 by Nikolas Zimmermann <wildfox@kde.org>
#
# Part of the KJS library.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
#

use strict;

my $file = $ARGV[0];
shift;
my $findSize = 0;
my $includelookup = 0;

# Use -s as second argument to make it try many hash sizes
$findSize = 1 if (defined($ARGV[0]) && $ARGV[0] eq "-s");

# Use -i as second argument to make it include "lookup.h"
$includelookup = 1 if (defined($ARGV[0]) && $ARGV[0] eq "-i");

# Use -n as second argument to make it use the third argument as namespace parameter ie. -n KDOM
my $useNameSpace = $ARGV[1] if (defined($ARGV[0]) && $ARGV[0] eq "-n");

print STDERR "Creating hashtable for $file\n";
open(IN, $file) or die "No such file $file";

my @keys = ();
my @values = ();
my @attrs = ();
my @params = ();
my @hashes = ();
my @table = ();
my @links = ();

my $inside = 0;
my $name;
my $size;
my $hashsize;
my $banner = 0;
sub calcTable();
sub output();
sub hashValue($);

while (<IN>) {
  chop;
  s/^\s*//g;
  if (/^\#|^$/) {
      # comment. do nothing
    } elsif (/^\@begin/ && !$inside) {
      if (/^\@begin\s*([:_\w]+)\s*(\d+)\s*$/) {
        $inside = 1;
        $name = $1;
        $hashsize = $2;
      } else {
         printf STDERR "WARNING: \@begin without table name and hashsize, skipping $_\n";
      }
    } elsif (/^\@end\s*$/ && $inside) {

      if($findSize) {
	my $entriesnum=@keys;
        print STDERR "Table: $name   $entriesnum entries\n";
	for( my $i=3 ; $i<79 ; ++$i) { $hashsize=$i ; calcTable(); }
      } else {
        calcTable();
      }

      output();
      @keys = ();
      @values = ();
      @attrs = ();
      @params = ();
      @table = ();
      @links = ();
      @hashes = ();
      $inside = 0;
    } elsif (/^(\S+)\s*(\S+)\s*([\w\|]*)\s*(\w*)\s*$/ && $inside) {
      my $key = $1;
      my $val = $2;
      my $att = $3;
      my $param = $4;
      push(@keys, $key);
      push(@values, $val);
      push(@hashes, hashValue($key));
      printf STDERR "WARNING: Number of arguments missing for $key/$val\n"
        if ( $att =~ m/Function/ && length($param) == 0);
      push(@attrs, length($att) > 0 ? $att : "0");
      push(@params, length($param) > 0 ? $param : "0");
    } elsif ($inside) {
      die "invalid data {" . $_ . "}";
    }
}

die "missing closing \@end" if ($inside);

sub calcTable() {
  $size = $hashsize;
  my $collisions = 0;
  my $maxdepth = 0;
  my $i = 0;
  foreach my $key (@keys) {
    my $depth = 0;
    my $h = hashValue($key) % $hashsize;
    while (defined($table[$h])) {
      if (defined($links[$h])) {
	$h = $links[$h];
	$depth++;
      } else {
	$collisions++;
	$links[$h] = $size;
	$h = $size;
	$size++;
      }
    }
    #print STDERR "table[$h] = $i\n";
    $table[$h] = $i;
    $i++;
    $maxdepth = $depth if ( $depth > $maxdepth);
  }

  # Ensure table is big enough (in case of undef entries at the end)
  if ( $#table+1 < $size ) {
    $#table = $size-1;
  }
  #print STDERR "After loop: size=$size table=".($#table+1)."\n";

  if ($findSize) {
    my $emptycount = 0;
    foreach my $entry (@table) {
      $emptycount++ if (!defined($entry));
    }
    print STDERR "Hashsize: $hashsize  Total Size: $size Empty: $emptycount MaxDepth: $maxdepth Collisions: $collisions\n";
  }
#  my $debugtable = 0;
#  foreach my $entry (@table) {
#    print STDERR "$debugtable " . (defined $entry ? $entry : '<undefined>');
#    print STDERR " -> " . $links[$debugtable] if (defined($links[$debugtable]));
#    print STDERR "\n";
#    $debugtable++;
#  }
}

sub leftShift($$) {
    my ($value, $distance) = @_;
    return (($value << $distance) & 0xFFFFFFFF);
}

# Paul Hsieh's SuperFastHash
# http://www.azillionmonkeys.com/qed/hash.html
# Ported from UString..
sub hashValue($) {
  my @chars = split(/ */, $_[0]);

  # This hash is designed to work on 16-bit chunks at a time. But since the normal case
  # (above) is to hash UTF-16 characters, we just treat the 8-bit chars as if they
  # were 16-bit chunks, which should give matching results

  my $EXP2_32 = 4294967296;

  my $hash = 0x9e3779b9;
  my $l    = scalar @chars; #I wish this was in Ruby --- Maks
  my $rem  = $l & 1;
  $l = $l >> 1;

  my $s = 0;

  # Main loop
  for (; $l > 0; $l--) {
    $hash   += ord($chars[$s]);
    my $tmp = leftShift(ord($chars[$s+1]), 11) ^ $hash;
    $hash   = (leftShift($hash, 16)% $EXP2_32) ^ $tmp;
    $s += 2;
    $hash += $hash >> 11;
  }

  # Handle end case
  if ($rem !=0) {
    $hash += ord($chars[$s]);
    $hash ^= (leftShift($hash, 11)% $EXP2_32);
    $hash += $hash >> 17;
  }

  # Force "avalanching" of final 127 bits
  $hash ^= leftShift($hash, 3);
  $hash += ($hash >> 5);
  $hash = ($hash% $EXP2_32);
  $hash ^= (leftShift($hash, 2)% $EXP2_32);
  $hash += ($hash >> 15);
  $hash = $hash% $EXP2_32;
  $hash ^= (leftShift($hash, 10)% $EXP2_32);
  
  # this avoids ever returning a hash code of 0, since that is used to
  # signal "hash not computed yet", using a value that is likely to be
  # effectively the same as 0 when the low bits are masked
  $hash = 0x80000000  if ($hash == 0);

  return $hash;
}

sub output() {
  if (!$banner) {
    $banner = 1;
    print "/* Automatically generated from $file using $0. DO NOT EDIT ! */\n";
  }

  my $nameEntries = "${name}Entries";
  $nameEntries =~ s/:/_/g;

  print "\n#include \"lookup.h\"\n" if ($includelookup);
  if ($useNameSpace) {
    print "\nnamespace ${useNameSpace}\n{\n";
    print "\nusing namespace KJS;";
  } else {
    print "\nnamespace KJS {\n";
  }
  print "\nstatic const struct HashEntry ".$nameEntries."[] = {\n";
  my $i = 0;
  #print STDERR "writing out table with ".($#table+1)." entries\n";
  
  if ($hashsize eq 0) {
    # To make the hash table lookup code fast, we don't allow tables of size 0.
    # That way it can do a modulo by the size without a special case to avoid division by 0.
    print "   \{ 0, 0, 0, 0, 0 \}\n";
    $hashsize = 1;
    $size = 1;
  } else {
    foreach my $entry (@table) {
      if (defined($entry)) {
        my $key = $keys[$entry];
        print "   \{ \"" . $key . "\"";
        print ", " . $values[$entry];
        print ", " . $attrs[$entry];
        print ", " . $params[$entry];
        print ", ";
        if (defined($links[$i])) {
          print "&" . $nameEntries . "[" . $links[$i] . "]" . " \}";
        } else {
          print "0 \}"
        }
        print "/* " . $hashes[$entry] . " */ ";
      } else {
        print "   \{ 0, 0, 0, 0, 0 \}";
      }    
      print "," unless ($i == $size - 1);
      print "\n";
      $i++;
    }
  }
  print "};\n\n";
  print "const struct HashTable $name = ";
  print "\{ 2, $size, ".$nameEntries.", $hashsize \};\n\n";
  print "} // namespace\n";
}