# normalize-dump.py

#!/usr/bin/env python

import sys
import re

# Matches a single "Name: value" header line.  Group 1 is everything before
# the first colon; group 2 is whatever follows the colon, minus one optional
# leading space.
header_re = re.compile(r'^([^:]*): ?(.*)$')

class NodePath:
    """A single node record: its path plus the headers that came with it.

    path    -- the node's path string.
    headers -- dictionary mapping header names to their string values.
    """

    def __init__(self, path, headers):
        self.path = path
        self.headers = headers

    def dump(self):
        """Print the path (indented 3 spaces) followed by every header as
        'name: value' (indented 6 spaces), ordered by header name."""
        path_indent = ' ' * 3
        header_indent = ' ' * 6
        print(path_indent + self.path)
        for name, value in sorted(self.headers.items()):
            print(header_indent + name + ': ' + value)


def dump_revision(rev, nodepaths):
    """Print the normalized listing for one revision.

    rev       -- revision identifier, as a string.
    nodepaths -- dictionary mapping a path to an object with a dump()
                 method; entries are dumped in sorted path order.

    The listing goes to stdout; a progress message goes to stderr.
    """
    sys.stderr.write('* Normalizing revision ' + rev + '...')
    print('Revision ' + rev)
    for key in sorted(nodepaths):
        nodepaths[key].dump()
    sys.stderr.write('done\n')



def parse_header_block(fp):
    """Read one block of 'Name: value' header lines from fp.

    Reading stops at the first blank line or at end of input.  fp may be
    either a text stream or a binary stream; lines read as bytes are
    decoded as UTF-8 (undecodable bytes are replaced) so that the returned
    names and values are always text.

    Returns a (headers, eof) pair: headers is a dictionary of the header
    names and values read, and eof is 1 if end of input was reached, else 0.

    Raises Exception('Malformed header block') for a non-blank line that
    contains no colon.
    """
    headers = {}
    while True:
        raw = fp.readline()
        # readline() yields '' or b'' at end of input depending on how the
        # stream was opened; truthiness covers both, where comparing
        # against '' would never match a bytes result.
        if not raw:
            return headers, 1
        line = raw.strip()
        if not line:
            return headers, 0
        if not isinstance(line, str):
            line = line.decode('utf-8', 'replace')
        # Split on the first colon; a single space after it is a separator
        # and not part of the value.
        name, colon, value = line.partition(':')
        if not colon:
            raise Exception('Malformed header block')
        if value.startswith(' '):
            value = value[1:]
        headers[name] = value


def parse_file(fp):
    """Walk the dump stream fp block by block, dumping each revision.

    Header blocks are read with parse_header_block().  A block with a
    'Revision-number' header starts a new revision; a block with a
    'Node-path' header adds a NodePath to the revision in progress.  The
    content following each block is read and discarded.  A revision is
    dumped when the next revision starts or when end of input is reached.

    Raises Exception('Header block from outta nowhere') if, after a
    revision has been seen, a non-empty block is neither of those kinds.
    """
    rev = None
    nodes = {}

    while True:
        block, at_eof = parse_header_block(fp)

        if 'Revision-number' in block:
            # Starting a new revision: flush the one collected so far.
            if rev:
                dump_revision(rev, nodes)

            rev = block['Revision-number']
            nodes = {}

            # Step over the revision's property content.
            fp.read(int(block.get('Prop-content-length', 0)))

        elif 'Node-path' in block:
            # Record the node under its path for the current revision.
            node_path = block['Node-path']
            nodes[node_path] = NodePath(node_path, block)

            # Step over the node's text and property content.
            text_size = block.get('Text-content-length', 0)
            prop_size = block.get('Prop-content-length', 0)
            fp.read(int(text_size) + int(prop_size))

        elif rev and block:
            # Before the first revision unknown blocks are tolerated;
            # afterwards they are an error.
            raise Exception('Header block from outta nowhere')

        if not at_eof:
            continue

        # End of input: flush the last revision, if any, and stop.
        if rev:
            dump_revision(rev, nodes)
        return

def usage():
    """Print the command-line help text to stdout, then exit with status 0."""
    help_lines = (
        'Usage: ' + sys.argv[0] + ' [DUMPFILE]',
        '',
        'Reads a Subversion dumpfile from DUMPFILE (or, if not provided,',
        'from stdin) and normalizes the metadata contained therein,',
        'printing summarized and sorted information.  This is useful for',
        'generating data about dumpfiles in a diffable fashion.',
    )
    for text in help_lines:
        print(text)
    sys.exit(0)

def main():
    """Script entry point.

    With a first argument of '--help', print usage and exit.  With any
    other first argument, treat it as the dumpfile path, open it in binary
    mode and parse it.  With no arguments, parse standard input.
    """
    if len(sys.argv) > 1:
        if sys.argv[1] == '--help':
            usage()
        # Close the dumpfile deterministically, even if parsing raises.
        with open(sys.argv[1], 'rb') as fp:
            parse_file(fp)
    else:
        # NOTE(review): on Python 3 sys.stdin is a text stream, so the
        # fp.read(n) calls in parse_file count characters, not bytes.
        # Presumably the dumpfile length headers are byte counts -- confirm
        # whether the binary stdin buffer should be used here instead.
        parse_file(sys.stdin)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()