svnpredumpfilter.py   [plain text]


#!/usr/bin/env python

# ====================================================================
#    Licensed to the Apache Software Foundation (ASF) under one
#    or more contributor license agreements.  See the NOTICE file
#    distributed with this work for additional information
#    regarding copyright ownership.  The ASF licenses this file
#    to you under the Apache License, Version 2.0 (the
#    "License"); you may not use this file except in compliance
#    with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing,
#    software distributed under the License is distributed on an
#    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#    KIND, either express or implied.  See the License for the
#    specific language governing permissions and limitations
#    under the License.
# ====================================================================
"""\
Usage:  1. {PROGRAM} [OPTIONS] include INCLUDE-PATH ...
        2. {PROGRAM} [OPTIONS] exclude EXCLUDE-PATH ...

Read a Subversion revision log output stream from stdin, analyzing its
revision log history to see what paths would need to be additionally
provided as part of the list of included/excluded paths if trying to
use Subversion's 'svndumpfilter' program to include/exclude paths from
a full dump of a repository's history.

The revision log stream should be the result of 'svn log -v' or 'svn
log -vq' when run against the root of the repository whose history
will be filtered by a user with universal read access to the
repository's data.  Do not use the --use-merge-history (-g) or
--stop-on-copy when generating this revision log stream.
Use the default ordering of revisions (that is, '-r HEAD:0').

Return errorcode 0 if there are no additional dependencies found, 1 if
there were; any other errorcode indicates a fatal error.

Paths in mergeinfo are not considered as additional dependencies so the
--skip-missing-merge-sources option of 'svndumpfilter' may be required
for successful filtering with the resulting path list.

Options:

   --help (-h)           Show this usage message and exit.

   --targets FILE        Read INCLUDE-PATHs and EXCLUDE-PATHs from FILE,
                         one path per line.

   --verbose (-v)        Provide more information.  May be used multiple
                         times for additional levels of information (-vv).
"""
import sys
import os
import getopt
import string

# Current verbosity level.  main() raises it by one for every --verbose
# (-v) option, and log() compares it with each message's minimum level.
verbosity = 0


class LogStreamError(Exception):
  """Raised when the revision log stream is not structured as expected."""


class EOFError(Exception):
  """Raised by readline() when the input stream has no more lines.

  NOTE: this class shadows the builtin EOFError within this module.
  """


# Process exit codes (see the usage message at the top of this file).
EXIT_SUCCESS = 0    # no additional dependencies were found
EXIT_MOREDEPS = 1   # additional dependent paths were found
EXIT_FAILURE = 2    # fatal error

def sanitize_path(path):
  """Return PATH with leading, trailing and repeated '/' separators removed."""
  segments = [segment for segment in path.split('/') if segment]
  return '/'.join(segments)

def subsumes(path, maybe_child):
  """Return True if MAYBE_CHILD is PATH itself or is located beneath PATH."""
  return maybe_child == path or maybe_child.startswith(path + '/')

def compare_paths(path1, path2):
  """Compare two '/'-separated paths, cmp()-style.

  Return 0 if PATH1 and PATH2 are equal, -1 if PATH1 sorts before PATH2
  and 1 if it sorts after.  A child path sorts after its parent, but
  before any greater sibling of that parent (so 'a/b' sorts before
  'a-b', unlike a plain string comparison).

  The result is computed without the cmp() builtin, which no longer
  exists in Python 3.
  """
  # Are the paths exactly the same?
  if path1 == path2:
    return 0

  # Skip past common prefix
  path1_len = len(path1)
  path2_len = len(path2)
  min_len = min(path1_len, path2_len)
  i = 0
  while i < min_len and path1[i] == path2[i]:
    i += 1

  # First differing character of each path; '\0' stands in for a path
  # that ended at the common prefix.
  char1 = path1[i] if i < path1_len else '\0'
  char2 = path2[i] if i < path2_len else '\0'

  # Children of paths are greater than their parents, but less than
  # greater siblings of their parents
  if char1 == '/' and i == path2_len:
    return 1
  if char2 == '/' and i == path1_len:
    return -1
  if char1 == '/':
    return -1
  if char2 == '/':
    return 1

  # Common prefix was skipped above, next character is compared to
  # determine order.  This yields -1, 0 or 1 just like cmp() did.
  return (char1 > char2) - (char1 < char2)

def log(msg, min_verbosity):
  """Write MSG plus a newline to stderr if verbosity permits.

  Nothing is written unless the global verbosity is at least
  MIN_VERBOSITY.  Messages at level 1 are prefixed with "[* ] " and
  messages at level 2 with "[**] "; other levels get no prefix.
  """
  if verbosity < min_verbosity:
    return
  prefix = {1: "[* ] ", 2: "[**] "}.get(min_verbosity)
  if prefix is not None:
    sys.stderr.write(prefix)
  sys.stderr.write(msg + "\n")

class DependencyTracker:
  """Track included paths and the copy sources they depend on.

  include_paths holds the paths given at construction time;
  dependent_paths collects the copy-source paths discovered by
  include_missing_copies().
  """

  def __init__(self, include_paths):
    """Start tracking INCLUDE_PATHS (any iterable of paths)."""
    self.dependent_paths = set()
    self.include_paths = set(include_paths)

  def path_included(self, path):
    """Return True if PATH is, or lies beneath, an included or dependent path."""
    candidates = self.include_paths | self.dependent_paths
    return any(subsumes(candidate, path) for candidate in candidates)

  def include_missing_copies(self, path_copies):
    """Add the sources of copies whose targets are included as dependencies.

    PATH_COPIES is a collection of (path, copyfrom_path) pairs.  Passes
    over it are repeated until a pass finds nothing new, because every
    newly added copy source can in turn make further copies relevant.
    Pairs that have been processed are removed from PATH_COPIES.
    """
    while True:
      tracked_count = len(self.include_paths) + len(self.dependent_paths)
      log("Cross-checking %d included paths with %d copies "
          "for missing path dependencies..." % (tracked_count,
                                                len(path_copies)),
          1)
      found_this_pass = []
      for target, source in path_copies:
        if not self.path_included(target):
          continue
        log("Adding copy '%s' -> '%s'" % (source, target), 1)
        # Added right away, so copies examined later in this same pass
        # already see the new dependency.
        self.dependent_paths.add(source)
        found_this_pass.append((target, source))
      if not found_this_pass:
        log("Found all missing path dependencies", 1)
        return
      # Removal is deferred until the iteration over PATH_COPIES is done.
      for processed_copy in found_this_pass:
        path_copies.remove(processed_copy)
      log("Found %d new copy dependencies, need to re-check for more"
          % len(found_this_pass), 1)

def readline(stream):
  """Return the next line of STREAM without its trailing line terminator.

  The stripped line is also logged at verbosity level 2.  Raise EOFError
  if STREAM has no more data.
  """
  raw_line = stream.readline()
  if raw_line:
    stripped = raw_line.rstrip('\n\r')
    log(stripped, 2)
    return stripped
  raise EOFError("Unexpected end of stream")

def svn_log_stream_get_dependencies(stream, included_paths):
  """Parse a revision log from STREAM and find copy dependencies.

  STREAM is read one line at a time via readline() and is expected to
  consist of log entries: a divider line of 72 dashes, a header line
  starting with 'rN |', an optional 'Changed paths:' section and the
  log message lines announced by the header.  INCLUDED_PATHS is an
  iterable of the paths the user intends to include.

  Return a DependencyTracker whose dependent_paths holds the copy
  sources that the included paths (directly or transitively) were
  copied from.

  Raise LogStreamError if a divider or header line is missing, if the
  revisions are not in strictly descending order, or if no changed path
  was seen in the entire stream.  The EOFError raised by readline()
  propagates if the stream ends right after a header line or inside a
  log message.
  """
  import re

  dt = DependencyTracker(included_paths)

  header_re = re.compile(r'^r([0-9]+) \|.*$')
  action_re = re.compile(r'^   [ADMR] /(.*)$')
  copy_action_re = re.compile(r'^   [AR] /(.*) \(from /(.*):[0-9]+\)$')
  line_buf = None
  last_revision = 0
  eof = False
  path_copies = set()
  found_changed_path = False

  while not eof:
    try:
      # Use the line remembered from the previous entry, if any.  A
      # remembered *empty* line is falsy and is therefore discarded in
      # favor of a fresh read (presumably the blank line that precedes
      # a zero-line log message -- TODO confirm against 'svn log').
      line = line_buf or readline(stream)
    except EOFError:
      break

    # We should be sitting at a log divider line.
    if line != '-' * 72:
      raise LogStreamError("Expected log divider line; not found.")

    # Next up is a log header line.
    try:
      line = readline(stream)
    except EOFError:
      # The stream ended with a divider, so there are no more entries.
      break
    match = header_re.search(line)
    if not match:
      raise LogStreamError("Expected log header line; not found.")
    # A real list is built here (rather than using map(), which returns
    # a non-indexable iterator on Python 3) because it is indexed below.
    pieces = [piece.strip() for piece in line.split('|')]
    revision = int(match.group(1))
    if last_revision and revision >= last_revision:
      raise LogStreamError("Revisions are misordered.  Make sure log stream "
                           "is from 'svn log' with the youngest revisions "
                           "before the oldest ones (the default ordering).")
    log("Parsing revision %d" % (revision), 1)
    last_revision = revision
    # The last header field reads 'N line(s)' when a log message
    # follows; if it does not, no message lines are expected.
    idx = pieces[-1].find(' line')
    if idx != -1:
      log_lines = int(pieces[-1][:idx])
    else:
      log_lines = 0

    # Now see if there are any changed paths.  If so, parse and process them.
    line = readline(stream)
    if line == 'Changed paths:':
      while True:
        try:
          line = readline(stream)
        except EOFError:
          eof = True
          break
        # The copy pattern is tried first because a copy line also
        # matches the plain action pattern.
        match = copy_action_re.search(line)
        if match:
          found_changed_path = True
          path_copies.add((sanitize_path(match.group(1)),
                           sanitize_path(match.group(2))))
        elif action_re.search(line):
          found_changed_path = True
        else:
          # First line that is not a changed path ends the section.
          break

    # Finally, skip any log message lines.  (If there are none,
    # remember the last line we read, because it probably has
    # something important in it.)
    if log_lines:
      for _ in range(log_lines):
        readline(stream)
      line_buf = None
    else:
      line_buf = line

  if not found_changed_path:
    raise LogStreamError("No changed paths found; did you remember to run "
                         "'svn log' with the --verbose (-v) option when "
                         "generating the input to this script?")

  dt.include_missing_copies(path_copies)
  return dt

def analyze_logs(included_paths):
  """Analyze the log stream on stdin and report missing include paths.

  INCLUDED_PATHS is an iterable of sanitized paths that the user plans
  to include.  The initial paths, any dependent paths found and, if
  there are none, the parent directories that may have to be created
  by hand are printed to stdout.

  Return EXIT_MOREDEPS if dependent paths were found, otherwise
  EXIT_SUCCESS.  Exceptions raised by
  svn_log_stream_get_dependencies() propagate to the caller.
  """
  import functools

  # Materialize the paths: they are iterated here and again by the
  # dependency tracker, which a one-shot iterator (such as the result
  # of map() on Python 3) would not survive.
  included_paths = list(included_paths)
  # sort() no longer accepts a comparison function on Python 3, so the
  # comparison is wrapped into a sort key.
  path_sort_key = functools.cmp_to_key(compare_paths)

  print("Initial include paths:")
  for path in included_paths:
    print(" + /%s" % (path))

  dt = svn_log_stream_get_dependencies(sys.stdin, included_paths)

  if dt.dependent_paths:
    print("Dependent include paths found:")
    # Sorted so that the report does not depend on set iteration order.
    for path in sorted(dt.dependent_paths, key=path_sort_key):
      print(" + /%s" % (path))
    print("You need to also include them (or one of their parents).")
    return EXIT_MOREDEPS

  print("No new dependencies found!")
  # Collect every ancestor directory of every included path.
  parents = set()
  for path in dt.include_paths:
    parent = os.path.dirname(path)
    while parent:
      parents.add(parent)
      parent = os.path.dirname(parent)
  if parents:
    print("You might still need to manually create parent directories "
          "for the included paths before loading a filtered dump:")
    for parent in sorted(parents, key=path_sort_key):
      print("   /%s" % (parent))

  return EXIT_SUCCESS

def usage_and_exit(errmsg=None):
  """Print the usage message and terminate the program.

  With a non-empty ERRMSG, the usage message and the error are written
  to stderr and the exit code is EXIT_FAILURE.  Otherwise the usage
  message is written to stdout and the exit code is EXIT_SUCCESS.
  """
  if errmsg:
    out, exit_code = sys.stderr, EXIT_FAILURE
  else:
    out, exit_code = sys.stdout, EXIT_SUCCESS
  script_name = os.path.basename(sys.argv[0])
  out.write(__doc__.replace("{PROGRAM}", script_name))
  if errmsg:
    out.write("\nERROR: %s\n" % (errmsg))
  sys.exit(exit_code)

def main():
  """Parse the command line, run the requested subcommand and exit.

  Always terminates through sys.exit(): with the result of
  analyze_logs() for the 'include' subcommand, or with EXIT_FAILURE
  after reporting a usage error, a log stream error, an unreadable
  targets file or an unexpected exception.
  """
  global verbosity
  targets_file = None

  try:
    opts, args = getopt.getopt(sys.argv[1:], "hv",
                               ["help", "verbose", "targets="])
  except getopt.GetoptError as e:
    usage_and_exit(str(e))

  for option, value in opts:
    if option in ('-h', '--help'):
      usage_and_exit()
    elif option in ('-v', '--verbose'):
      verbosity = verbosity + 1
    elif option == '--targets':
      targets_file = value

  if not args:
    usage_and_exit("Not enough arguments")

  if targets_file is None:
    targets = args[1:]
  else:
    # Build a real list (so the emptiness check below works on Python 3
    # as well) and close the file once it has been read.
    try:
      with open(targets_file, 'r') as targets_stream:
        targets = [line.rstrip('\n\r') for line in targets_stream]
    except EnvironmentError as e:
      # Report the failure ourselves: an unhandled exception would make
      # the interpreter exit with status 1, which means EXIT_MOREDEPS.
      log("ERROR: Unable to read targets file '%s': %s" % (targets_file, e),
          0)
      sys.exit(EXIT_FAILURE)
  if not targets:
    usage_and_exit("No target paths specified")

  try:
    if args[0] == 'include':
      sys.exit(analyze_logs([sanitize_path(target) for target in targets]))
    elif args[0] == 'exclude':
      usage_and_exit("Feature not implemented")
    else:
      usage_and_exit("Valid subcommands are 'include' and 'exclude'")
  except SystemExit:
    # Raised by sys.exit() above; let the chosen exit code through.
    raise
  except (LogStreamError, EOFError) as e:
    log("ERROR: " + str(e), 0)
    sys.exit(EXIT_FAILURE)
  except BaseException:
    # Deliberately catches everything else (as the bare 'except' did),
    # so that any unexpected failure prints its traceback and exits
    # with EXIT_FAILURE rather than the interpreter's default status.
    import traceback
    traceback.print_exc()
    sys.exit(EXIT_FAILURE)


# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()