analyze-svnlogs.py   [plain text]


#!/usr/bin/env python
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#
#
# Generate a report of each area each committer has touched over all time.
#
# $ svn log -v ^/ > svnlogdata
# $ ./analyze-svnlogs.py < svnlogdata > report.txt
#
# NOTE: ./logdata.py is written with a cached version of the data extracted
#       from 'svnlogdata'. That data can be analyzed in many ways, beyond
#       what this script is reporting.
#

import sys
import re


# Header line of one log entry, e.g.
#   r1234 | author | <date> | 3 lines
# Groups: 1 = revision ("rNNN"), 2 = author, 3 = third '|'-separated field
# (the date in svn log output), 4 = number of log message lines.
# Raw strings are used so that '\|' reaches the regex engine as written
# instead of being an invalid string escape.
RE_LOG_HEADER = re.compile(r'^(r[0-9]+) '
                           r'\| ([^|]+) '
                           r'\| ([^|]+) '
                           r'\| ([0-9]+) line')
# One entry below "Changed paths:". Group 1 is the path; an optional
# trailing " (from ...)" annotation is captured separately in group 2.
RE_PATH = re.compile(r'   [MARD] (.*?)( \(from .*\))?$')
# Line of 72 dashes expected in front of every log entry.
SEPARATOR = '-' * 72


def parse_one_commit(logfile):
  """Consume one log entry from LOGFILE and return (author, paths).

  LOGFILE is an object with a readline() method positioned at a
  separator line. PATHS is the set of paths listed under "Changed paths:"
  for the entry (empty when the entry lists none).

  Returns (None, None) when nothing follows the separator, i.e. the end
  of the log has been reached.

  Raises ParseError when the separator, the header line, the
  "Changed paths:" marker or a path line does not look as expected.
  """
  separator = logfile.readline().strip()
  if separator != SEPARATOR:
    raise ParseError('missing separator: %s' % separator)

  header = logfile.readline()
  if not header:
    # end of file!
    return None, None

  header_match = RE_LOG_HEADER.match(header)
  if header_match is None:
    raise ParseError('could not match log header')
  revision, author, _date, line_count = header_match.groups()
  message_lines = int(line_count)
  changed = set()

  # Either the "Changed paths:" marker or, when the entry has no paths,
  # the blank line in front of the log message.
  marker = logfile.readline().strip()
  if marker:
    if not marker.startswith('Changed'):
      raise ParseError('log not run with -v. paths missing in %s' % revision)

    # Collect path lines up to the blank line (or EOF) that ends the list.
    for raw in iter(logfile.readline, ''):
      entry = raw.rstrip()
      if not entry:
        break
      path_match = RE_PATH.match(entry)
      if path_match is None:
        raise ParseError('bad path in %s: %s' % (revision, entry))
      changed.add(path_match.group(1))
  else:
    # No paths at all; the blank line has already been consumed.
    sys.stderr.write('Funny revision: %s\n' % revision)

  # Skip over the log message; its length comes from the header.
  remaining = message_lines
  while remaining > 0:
    logfile.readline()
    remaining -= 1

  return author, changed


def parse_file(logfile):
  """Parse every log entry in LOGFILE into a per-author path map.

  Calls parse_one_commit() until it reports the end of the log (an
  author of None).

  Returns a dict mapping each author to the set of all paths touched by
  that author's commits. Errors raised by parse_one_commit() propagate.
  """
  authors = { }

  while True:
    author, paths = parse_one_commit(logfile)
    if author is None:
      return authors

    # Merge in place. Building a fresh union for every commit would copy
    # the author's whole accumulated set each time.
    authors.setdefault(author, set()).update(paths)


def write_logdata(authors):
  """Write AUTHORS to 'logdata.py' in the current directory.

  AUTHORS maps an author name to an iterable of paths. The file written
  is Python source that defines a single name, 'authors', as a dict of
  author -> set of paths, so the data can be loaded back with
  'import logdata'.

  Author names and paths are both emitted with repr() so that quotes or
  backslashes in them cannot produce invalid Python.
  """
  # The 'with' block guarantees the file is flushed and closed, even if
  # a write fails part way through.
  with open('logdata.py', 'w') as out:
    out.write('authors = {\n')
    for author, paths in authors.items():
      out.write('  %r: set([\n' % (author,))
      for path in paths:
        out.write('    %r,\n' % (path,))
      out.write('  ]),\n')
    out.write('}\n')


def get_key(sectionroots, path):
  """Return the tuple of leading path components that PATH is filed under.

  SECTIONROOTS is an iterable of directory paths (e.g. '/trunk/doc').
  The first root that equals PATH or is an ancestor directory of PATH
  wins, and the key is that root's components plus one more component of
  PATH (when PATH has one). List more specific roots before their
  parents.

  When no root applies, the key is the first two components of PATH (or
  fewer if PATH is shorter).

  A root only matches on whole path components: '/trunk/doc' applies to
  '/trunk/doc/book' but not to '/trunk/docs/book'.
  """
  for section in sectionroots:
    # Compare against the root followed by '/', so a sibling directory
    # that merely shares a name prefix is not mistaken for a child.
    prefix = section if section.endswith('/') else section + '/'
    if path == section or path.startswith(prefix):
      # add one path element below top section to the key.
      elmts = len(section.split('/')) + 1
      # strip first element (always empty because path starts with '/')
      return tuple(path.split('/', elmts)[1:elmts])

  # strip first element (always empty because path starts with '/')
  return tuple(path.split('/', 3)[1:3])


def print_report(authors, sectionroots=[ ]):
  """Print, per author, the areas of the tree that author has touched.

  AUTHORS maps an author name to a collection of paths. Each path is
  reduced to a key by get_key(SECTIONROOTS, path) and the paths per key
  are counted.

  Authors are printed in sorted order, each followed by:
    - one "(ROOT)" line per single-component key,
    - one "<key> (N items)" line per other key, except that
    - keys starting with 'tags' or 'branches' are not listed one by one;
      their second components are gathered into a single TAGS / BRANCHES
      line at the end.
  A blank line ends each author's section.
  """
  for author in sorted(authors):
    # Number of this author's paths that fall under each key.
    topdirs = { }
    for path in authors[author]:
      key = get_key(sectionroots, path)
      topdirs[key] = topdirs.get(key, 0) + 1

    print(author)
    tags = [ ]
    branches = [ ]
    for topdir in sorted(topdirs):
      count = topdirs[topdir]

      if len(topdir) == 1:
        assert count == 1
        print('  %s  (ROOT)' % topdir[0])
        continue

      # Tags and branches are collected by name; everything else gets
      # its own line right away.
      root = topdir[0]
      if root == 'tags':
        bucket = tags
      elif root == 'branches':
        bucket = branches
      else:
        print('  %s (%d items)' % ('/'.join(topdir), count))
        continue

      name = topdir[1]
      if name not in bucket:
        bucket.append(name)

    if tags:
      print('  TAGS: %s' % ', '.join(tags))
    if branches:
      print('  BRANCHES: %s' % ', '.join(branches))

    print('')


def run(logfile):
  """Load the per-author data and print the report.

  If a 'logdata' module can be imported, its 'authors' dict is used and
  LOGFILE is not read. Otherwise LOGFILE is parsed with parse_file() and
  the result is saved with write_logdata() before reporting.
  """
  try:
    import logdata
  except ImportError:
    # No cached data available: parse the log and save the result.
    authors = parse_file(logfile)
    write_logdata(authors)
  else:
    authors = logdata.authors

  # Directories that get reported one level deeper than the default.
  # More specific roots come first because get_key() uses the first one
  # that matches.
  print_report(authors, [
      '/trunk/subversion/include/private',
      '/trunk/subversion/include',
      '/trunk/subversion/tests',
      '/trunk/subversion',
      '/trunk/tools',
      '/trunk/contrib',
      '/trunk/doc',
      ])


class ParseError(Exception):
  """Raised when the log input does not have the expected layout."""


if __name__ == '__main__':
  # Read the log from the file named by the first command-line argument,
  # or from stdin when no argument is given.
  logfile = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
  run(logfile)