svn-vendor.py   [plain text]


#!/usr/bin/python3
# vim: set sw=4 expandtab :
# ====================================================================
#    Licensed to the Apache Software Foundation (ASF) under one
#    or more contributor license agreements.  See the NOTICE file
#    distributed with this work for additional information
#    regarding copyright ownership.  The ASF licenses this file
#    to you under the Apache License, Version 2.0 (the
#    "License"); you may not use this file except in compliance
#    with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing,
#    software distributed under the License is distributed on an
#    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#    KIND, either express or implied.  See the License for the
#    specific language governing permissions and limitations
#    under the License.
# ====================================================================
#
##############################################################################
# svn-vendor.py
#
# Overview
# --------
#   Replacement for svn_load_dirs.pl (included as a 'contributed utility' in
#   Subversion sources). Main difference is some heuristics in detection of
#   the renames. Note that this script does not attempt to automate remote
#   SVN operations (check-out, check-in and tagging), so it is possible to
#   review the state of sources that are about to be checked in. Another
#   difference is an ability to save the detected renames, review/re-apply
#   them.
#
#   This script requires Python 3.3.x or higher. Sorry, I was too lazy
#   to write shell quoting routines that are already available in recent
#   Python versions.
#
# Using this script
# -----------------
#   First, it is necessary to check out the working copy from the URL that
#   will host the imported sources. E.g., if the versions of FOO are being
#   imported into svn://example.com/vendor/FOO/current:
#
#     svn co svn://example.com/vendor/FOO/current wc
#
#   Then, unpack the sources of the version to be imported:
#
#     tar xzf foo-1.1.tar.gz
#
#   Examples below assume the command above created a `foo-1.1' directory.
#   After that, there are three different modes of operation:
#
#   1. Fully automatic
#
#     svn-vendor.py --auto wc foo-1.1
#     svn st wc
#     svn ci wc
#
#   In this mode, the script fully relies on its heuristics in detection of
#   renames. In many cases, it "just works". There can be spurious moves
#   detected in this mode, though. For example, consider a deleted header
#   that consists of 50 lines of GPL text,  1 line of copyright, and
#   3 lines of declarations, and a similar unrelated header in the imported
#   sources. From the script's point of view, the files are nearly identical
#   (4 lines removed, 4 lines added, 50 lines unchanged).
#
#   After the script completes, examine the working copy by doing 'svn diff'
#   and/or 'svn status', paying particular attention to renames. If all the
#   moves are detected correctly, check in the changes in the working copy.
#
#   2. Semi-automatic
#
#     svn-vendor.py --detect moves-foo-1.1.txt wc foo-1.1
#     vi moves-foo-1.1.txt
#     svn-vendor.py --apply moves-foo-1.1.txt wc foo-1.1
#     svn ci wc
#
#   If the fully automatic mode mis-detected some spurious moves, or did not
#   detect some renames you want to be performed, it is still possible to
#   leverage what the script has detected automatically. First command above
#   does the automatic detection, just as it does in fully automatic mode,
#   but stops short of performing any modification of the working copy.
#   The list of detected copies and renames is saved into a text file,
#   `moves-foo-1.1.txt'.
#
#   That file can be inspected after the script finishes. Spurious moves can
#   be deleted from the file, and new copies/renames can be added. Then the
#   changes can be applied to the working copy.
#
#   3. Manual
#
#     svn-vendor.py wc foo-1.1
#     (svn-vendor) detect
#     (svn-vendor) move x.c y.c
#     (svn-vendor) move include/1.h include/2.h
#     (svn-vendor) copy include/3.h include/3-copy.h
#     (svn-vendor) lsprep
#     (svn-vendor) save /tmp/renames-to-be-applied.txt
#     (svn-vendor) apply
#
#   If the automatic detection does not help, it is possible to do the renames
#   manually (similarly to svn_load_dirs.pl). Use the 'help' command to get
#   the list of supported commands and their description. Feel free to play
#   around - since the script does not perform any remote SVN operation,
#   there is no chance to commit the changes accidentally.
#
# Notes
# -----
#   I. The time for rename detection O(Fs*Fd) + O(Ds*Dd), where Fs is
#   the number of files removed from current directory, Fd is number of files
#   added in imported sources, and Ds/Dd is the same for directories. That is,
#   the running time may become an issue if the numbers of added/removed files
#   go into a few thousands (e.g. if updating Linux kernel 2.6.35 to 3.10).
#   As a workaround, import interim releases first so that the number of
#   renames remains sane at each step. That makes reviewing the renames
#   performed by the script much easier.
#
#   Enjoy!
#
##############################################################################

import argparse
import cmd
import difflib
import filecmp
import os
import readline
import shlex
import shutil
import subprocess
import sys

def name_similarity(n1, n2):
    '''
    Function to be used as a key for sorting dirs/files by name matching
    '''
    sm = difflib.SequenceMatcher(a=n1, b=n2)
    return 1.0 - sm.ratio()


def filename_sort_key(s):
    '''
    Function to sort filenames so that parent directory is always followed
    by its children. Without it, [ "/a", "/a-b", "/a/b", "/a-b/c" ] would
    not be sorted correctly.
    '''
    return s.replace('/', '\001')


def descendant_or_self(path, ancestor):
    '''
    Check if path is somewhere in hierarchy under ancestor.
    '''
    return path == ancestor or path.startswith(ancestor + os.sep)

def path_rebase(path, old_base, new_base):
    '''
    Return a path name that has the same relative path to new_base as path
    had to old_base. Assumes path is a descendant of old_base.
    '''
    if path == old_base:
        return new_base
    return os.path.normpath(os.path.join(new_base,
        os.path.relpath(path, old_base)))


def for_all_parents(path, func):
    '''
    Invoke func for each parent path.
    '''
    d = os.path.dirname(path)
    while d != "":
        func(d)
        d = os.path.dirname(d)

class InvalidUsageException(Exception):
    '''
    Raised if command line arguments are invalid
    '''
    def __init__(self, cmd, msg):
        Exception.__init__(self, msg)
        self.cmd = cmd


class NotImplementedException(Exception):
    '''
    Raised if some code path is not implemented
    '''
    pass


# Indexes into FSO.state
S_WC = 0
S_IM = 1

class FSO(object):
    '''
    File system object (file/dir either in imported dir or in WC)
    '''
    def __init__(self):
        self.wc_path = None
        self.state = [ "-", "-" ] # '-': absent, 'F': file, 'D': dir

    def status(self):
        return "[%s%s]" % (self.state[S_WC], self.state[S_IM])

    def orig_reference(self, curpath):
        if self.wc_path and self.wc_path != curpath:
            return " (original: %s)" % shlex.quote(self.wc_path)
        return ""


class FSOCollection(dict):
    '''
    Collection of FSOs
    '''
    def print(self):
        print(" / Status in working copy (-:absent, F:file, D:dir)")
        print(" |/ Status in imported sources (-:absent, F:file, D:dir)")
        for k in sorted(self.keys(), key=filename_sort_key):
            e = self[k]
            print("%s %s%s" % (e.status(), shlex.quote(k),
                e.orig_reference(k)))

    def get(self, path):
        'Get existing FSO or create a new one'
        if path in self:
            return self[path]
        e = FSO()
        self[path] = e
        return e

    def add(self, path, where, kind):
        'Adding entries during initial scan'
        path = os.path.normpath(path)
        e = self.get(path)
        e.state[where] = kind
        if where == S_WC:
            e.wc_path = path

    def wc_copy(self, src, dst):
        'Handle move in a working copy'
        keys = list(self.keys())
        for k in keys:
            if descendant_or_self(k, src):
                esrc = self[k]
                if esrc.state[S_WC] == "-":
                    continue
                kn = path_rebase(k, src, dst)
                edst = self.get(kn)
                if edst.state[S_WC] != "-":
                    # Copying into existing destination.
                    # Caller should've checked this.
                    raise NotImplementedException
                edst.wc_path = esrc.wc_path
                edst.state[S_WC] = esrc.state[S_WC]

    def wc_remove(self, path):
        'Handle removal in a working copy'
        keys = list(self.keys())
        for k in keys:
            if descendant_or_self(k, path):
                self[k].state[S_WC] = "-"


class ConfigOpt(object):
    'Helper class - single option (string)'
    def __init__(self, value, helpmsg):
        self.value = value
        self.helpmsg = helpmsg

    def set(self, new_value):
        self.value = new_value

    def __str__(self):
        return "<none>" if self.value is None else "`%s'" % self.value


class ConfigOptInt(ConfigOpt):
    'Helper class - single option (integer)'
    def set(self, new_value):
        try:
            self.value = int(new_value)
        except ValueError:
            raise InvalidUsageException(None, "Value must be integer")

    def __str__(self):
        return "%d" % self.value


class Config(dict):
    '''
    Store configuration options.
    '''
    def add_option(self, name, cfgopt):
        self[name] = cfgopt

    def set(self, name, value):
        if name not in self:
            raise InvalidUsageException(None,
                    "Unknown config variable '%s'" % name)
        self[name].set(value)

    def get(self, name):
        if name not in self:
            raise NotImplementedException()
        return self[name].value

    def print(self):
        for k in sorted(self):
            o = self[k]
            print("# %s" % o.helpmsg)
            print("%-20s: %s" % (k, str(o)))
            print()


class SvnVndImport(cmd.Cmd):
    '''
    Main driving class.
    '''
    intro = "Welcome to SVN vendor import helper. " + \
            "Type help or ? to list commands.\n"
    prompt = "(svn-vendor) "
    prepare_ops = []

    def __init__(self, wcdir, importdir, svninfo):
        cmd.Cmd.__init__(self)
        self.wcdir = wcdir
        self.importdir = importdir
        self.svninfo = svninfo
        self.config = Config()
        self.config.add_option('save-diff-copied',
                ConfigOpt(None, "Save 'svn diff' output on the " +
                    "moved/copied files and directories to this " +
                    "file as part of 'apply'"))
        self.config.add_option('dir-similarity',
                ConfigOptInt(600, "Similarity between dirs to assume " +
                    "a copy/move [0..1000]"))
        self.config.add_option('file-similarity',
                ConfigOptInt(600, "Similarity between files to assume a " +
                    "copy/move [0..1000]"))
        self.config.add_option('file-min-lines',
                ConfigOptInt(10, "Minimal number of lines in a file for " +
                    "meaningful comparison"))
        self.config.add_option('verbose',
                ConfigOptInt(3, "Verbosity of the output [0..5]"))
        try:
            self.termwidth = os.get_terminal_size()[0]
        except OSError:
            # Not running in a terminal - probably redirected to file
            self.termwidth = 150 # arbitrary number

    def info(self, level, msg):
        'Print message with specified verbosity'
        if level <= self.config.get('verbose'):
            print(msg, flush=True)

    def scan(self):
        self.items = FSOCollection()
        self.info(1, "Scanning working copy directory...")
        self.get_lists(self.wcdir, S_WC)
        self.info(1, "Scanning imported directory...")
        self.get_lists(self.importdir, S_IM)

    def get_lists(self, top, where):
        for d, dn, fn in os.walk(top, followlinks=True):
            dr = os.path.relpath(d, top)
            # If under .svn directory at the top (SVN 1.7+) or has .svn
            # in the path (older SVN), ignore
            if descendant_or_self(dr, '.svn') or \
                    os.path.basename(dr) == '.svn' or \
                    (os.sep + '.svn' + os.sep) in dr:
                continue
            if dr != '.':
                self.items.add(dr, where, "D")
            for f in fn:
                fr = os.path.normpath(os.path.join(dr, f))
                self.items.add(fr, where, "F")

    def onecmd(self, str):
        'Override for checking number of arguments'
        try:
            return cmd.Cmd.onecmd(self, str)
        except InvalidUsageException as e:
            if e.cmd is not None:
                print("!!! Invalid usage of `%s' command: %s" % (e.cmd, e))
                print()
                self.onecmd("help " + e.cmd)
            else:
                print("!!! %s" % e)

    def parse_args(self, line, nargs, cmd):
        'Parse arguments for a command'
        args = shlex.split(line)
        if len(args) != nargs:
            raise InvalidUsageException(cmd, "expect %d arguments" % nargs)
        return args

    def run_svn(self, args_fixed, args_split=[]):
        'Run SVN command(s), potentially splitting long argument lists'
        rv = True
        pos = 0
        atatime = 100
        output = ""
        while pos < len(args_split) or (pos == 0 and len(args_split) == 0):
            svnargs = ['svn'] + args_fixed + args_split[pos : pos + atatime]
            pos += atatime
            self.info(5, "Running: " + " ".join(map(shlex.quote, svnargs)))
            p = subprocess.Popen(args=svnargs, stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE, cwd=self.wcdir)
            so, se = p.communicate()
            if p.returncode != 0:
                print("`%s' exited with %d status:" %
                        (" ".join(map(shlex.quote, svnargs)), p.returncode))
                print(se.decode())
                rv = False
            else:
                output += so.decode()
        return rv, output

    def copy_or_move(self, op, src, dst):
        'Handle copy or move operation'
        if src not in self.items or self.items[src].state[S_WC] == "-":
            raise InvalidUsageException(None,
                    "Nothing known about `%s'" % src)
        if dst in self.items and self.items[dst].state[S_WC] != "-":
            raise InvalidUsageException(None,
                    "Destination path `%s' already exists" % dst)
        # Check that we're not creating dst under a file (not a dir)
        new_dirs = []
        def check_parent(d):
            if d not in self.items or self.items[d].state[S_WC] == "-":
                new_dirs.append(d)
            elif self.items[d].state[S_WC] == "F":
                raise InvalidUsageException(None,
                        "Destination path `%s' created under `%s' " +
                        "which is a file" % (dst, d))
        for_all_parents(dst, check_parent)
        # All ok, record new directories that may be created
        for d in new_dirs:
            self.items.get(d).state[S_WC] = "D"
        # Record the operation and update the FSO collection
        self.prepare_ops.append((op, src, dst))
        self.items.wc_copy(src, dst)
        if op == "mv":
            self.items.wc_remove(src)

    def remove(self, path):
        if path not in self.items or self.items[path].state[S_WC] == "-":
            raise InvalidUsageException(None,
                    "Nothing known about `%s'" % path)
        self.prepare_ops.append(("rm", path))
        self.items.wc_remove(path)

    def similarity_file(self, src, dst, threshold, lst_removal):
        'Compare two files, return similarity ratio on 0..1000 scale'
        if self.items[src].state[S_WC] != "F":
            return 0
        # Source is in working copy
        fn1 = os.path.join(self.wcdir, self.items[src].wc_path)
        # Destination is in imported dir
        fn2 = os.path.join(self.importdir, dst)
        minlines = self.config.get('file-min-lines')
        try:
            f1 = open(fn1, 'r')
            l1 = f1.readlines()
            f1.close()
            if len(l1) < minlines:
                return 0
            f2 = open(fn2, 'r')
            l2 = f2.readlines()
            f2.close()
            if len(l2) < minlines:
                return 0
            sm = difflib.SequenceMatcher(a=l1, b=l2)
            return int(1000 * sm.quick_ratio())
        except UnicodeDecodeError:
            # Oops, file seems to be binary. Fall back to comparing whole
            # file contents.
            if filecmp.cmp(fn1, fn2, shallow=False):
                return 1000
            return 0

    def _similarity_dir(self, src, dst, get_file_similarity, lst_removal):
        'Iterate over FSOs, using callback to compare file entries'
        common = 0
        total = 0
        for xsrc in self.items:
            if xsrc.startswith(src + os.sep):
                esrc = self.items[xsrc]
                if esrc.state[S_WC] == "-":
                    # Source not in WC - ignore for similarity calculation
                    continue
                skip = False
                if lst_removal is not None:
                    for i in lst_removal:
                        if descendant_or_self(xsrc, i):
                            skip = True
                if skip:
                    # Moved to another place, do not consider in score
                    continue
                total += 1000
                xdst = path_rebase(xsrc, src, dst)
                if xdst not in self.items:
                    # Destination not in imported sources - non-similar item
                    continue
                edst = self.items[xdst]
                if edst.state[S_IM] == esrc.state[S_WC]:
                    if esrc.state[S_WC] == "D":
                        common += 1000
                    else:
                        common += get_file_similarity(xsrc, xdst)
        if total == 0:
            # No files/subdirs in source directory - avoid copying empty dirs
            return 0
        return 1000 * common / total

    def similarity_dir(self, src, dst, threshold, lst_removal):
        '''
        Compare two dirs recursively, return similarity ratio on
        0..1000 scale.
        '''
        common = 0
        total = 0
        # Quickly estimate upper boundary by comparing file names. Only
        # concern ourselves with files in source directory. I.e., if
        # files were added after the move in the destination directory,
        # it's ok. If most of the files from the source directory were
        # removed, the directory is not considered similar - instead,
        # file move detection would move files one by one.
        upper = self._similarity_dir(src, dst, lambda s, d: 1000, lst_removal)
        if upper <= threshold:
            # Even the best estimate is worse than current cut-off
            return 0
        # Okay, looks roughly similar. Now redo the above procedure, but also
        # compare the file content.
        return self._similarity_dir(src, dst,
                lambda s, d: self.similarity_file(s, d, 0, lst_removal),
                lst_removal)

    def similar(self, src, dst, threshold=0, lst_removal=None):
        'Compare two FSOs, source in WC and destination in imported dir'
        if src not in self.items:
            print("Source `%s' not in the working copy" % src)
            return
        xsrc = self.items[src]
        if xsrc.state[S_WC] == "-":
            print("Source `%s' not in the working copy" % src)
            return
        if dst not in self.items:
            print("Destination `%s' not in imported sources" % dst)
            return
        xdst = self.items[dst]
        if xdst.state[S_IM] == "-":
            print("Destination `%s' not in imported sources" % dst)
            return
        if xsrc.state[S_WC] != xdst.state[S_IM]:
            # Different kinds - definitely not the same object
            return 0
        if xsrc.state[S_WC] == "D":
            return self.similarity_dir(src, dst, threshold, lst_removal)
        else:
            return self.similarity_file(src, dst, threshold, lst_removal)

    def handle_op(self, op_tuple):
        'Handle one SVN operation, recorded as a tuple'
        def x_mv(src, dst):
            self.info(2, "  Move `%s' to `%s'" % (src, dst))
            self.copy_or_move("mv", src, dst)
        def x_cp(src, dst):
            self.info(2, "  Copy `%s' to `%s'" % (src, dst))
            self.copy_or_move("cp", src, dst)
        def x_rm(path):
            self.info(2, "  Remove `%s'" % path)
            self.remove(path)
        known_ops = {
                # key: (nargs, handler)
                'cp' : (3, x_cp),
                'mv' : (3, x_mv),
                'rm' : (2, x_rm),
                }
        if len(op_tuple) == 0:
            raise InvalidUsageException
        op = op_tuple[0]
        if op not in known_ops:
            return False
        nargs, func = known_ops[op]
        if nargs != len(op_tuple):
            return False
        func(*op_tuple[1:])
        return True

    def detect(self, thresholds):
        'Helper for finding copy/move destinations'
        ilst = []
        wlst = {}
        ilst_map = {}
        for p in self.items:
            e = self.items[p]
            if e.state[S_WC] != "-" and e.state[S_IM] == "-":
                wlst[p] = [] # wlst hash stores copy destinations
            elif e.state[S_WC] == "-" and e.state[S_IM] != "-":
                # ilst just lists destination paths as tuples with node kind
                ilst.append((e.state[S_IM], p))
        iteration = 0
        # Do not apply operations immediately - we'll need to post-process
        # them to account for files/dirs moved inside a moved parent dir.
        ops = []
        to_be_removed = []
        def get_renamed_name(path, rename_ops):
            '''
            Check if path was renamed/removed in the recorded operations,
            return new name.
            '''
            for op_tuple in rename_ops:
                # Since copies do not remove the source file, ignore them.
                # We push no 'rm' ops in this function
                if op_tuple[0] == "mv":
                    src = op_tuple[1]
                    dst = op_tuple[2]
                    if descendant_or_self(path, src):
                        path = path_rebase(path, src, dst)
            return path

        while len(wlst):
            iteration += 1
            self.info(2, ("Iteration %d: Possible sources: %d, " +
                    "possible destinations: %d") %
                    (iteration, len(wlst), len(ilst)))
            ndst = len(ilst)
            for idx, (nk, dst) in enumerate(sorted(ilst,
                    key=lambda s: filename_sort_key(s[1]))):
                class SkipDestFile(Exception):
                    pass
                # Check if moved as a part of a parent directory.
                def check_moved_parent(xdst):
                    if xdst in ilst_map:
                        src = path_rebase(dst, xdst, ilst_map[xdst])
                        # Did it exist in copied directory?
                        if src in self.items and \
                                self.items[src].state[S_WC] == nk:
                            sim = self.similar(src, dst, thresholds[nk],
                                    to_be_removed)
                            if sim > thresholds[nk]:
                                self.info(2, ("  [%04d/%04d] Skipping `%s' " +
                                        "(copied as part of `%s')") %
                                        (idx, ndst, dst, xdst))
                                raise SkipDestFile
                            # Copied, not similar - search for other sources
                            raise StopIteration
                try:
                    for_all_parents(dst, check_moved_parent)
                except SkipDestFile:
                    continue
                except StopIteration:
                    pass
                self.info(2, ("  [%04d/%04d] Looking for possible source " +
                        "for `%s'") % (idx, ndst, dst))
                bestsrc = None
                # Won't even consider those lower than threshold
                bestsim = thresholds[nk]
                for src in sorted(wlst.keys(),
                        key=lambda x: name_similarity(x, dst)):
                    sim = self.similar(src, dst, bestsim, to_be_removed)
                    if sim > bestsim:
                        self.info(3, "    [similarity %4d] %s" % (sim, src))
                        bestsim = sim
                        bestsrc = src
                    if bestsim == 1000:
                        # No chance we're finding anything better
                        break
                if bestsrc is not None:
                    wlst[bestsrc].append(dst)
                    ilst_map[dst] = bestsrc

            # Discovered all copies/moves, now record them.
            new_wlst = {}
            for src in sorted(wlst.keys(), key=filename_sort_key):
                dlist = wlst[src]
                if len(dlist) == 0:
                    continue
                if len(dlist) == 1:
                    ops.append(("mv", src, dlist[0]))
                    to_be_removed.append(src)
                else:
                    # We don't remove the source here, it will be done when
                    # the changes are applied (it will remove all the WC files
                    # not found in imported sources). Avoiding removal here
                    # simplifies operation sorting below, since we would not
                    # be concerned with source file/dir disappearing before
                    # it is copied to its destination.
                    to_be_removed.append(src)
                    for d in dlist:
                        ops.append(("cp", src, d))
                # If we copied something - recheck parent source directories.
                # Since some source file/dir was scheduled to be removed,
                # this may have increased the similarity to some destination.
                def recheck_parent(x):
                    if x in wlst and len(wlst) == 0:
                        new_wlst[x] = []
                for_all_parents(src, recheck_parent)

            # At this point, if we're going to have the next iteration, we
            # are only concerned about directories (by the way new_wlst is
            # created above). So, filter out all files from ilst as well.
            wlst = new_wlst
            ilst = list(filter(lambda t: t[0] == 'D', ilst))

        # Finished collecting the operations - now can post-process and
        # apply them. First, sort copies/moves by destination (so that
        # parent directories are created before files/subdirs are
        # copied/renamed inside)
        ops = sorted(ops, key=lambda op: filename_sort_key(op[2]))
        for i, op_tuple in enumerate(ops):
            # For each operation, go over its precedents to see if the source
            # has been renamed. If it is, find out new name.
            op = op_tuple[0]
            src = get_renamed_name(op_tuple[1], reversed(ops[:i]))
            if src != op_tuple[2]:
                # Unless it became the same file after renames
                try:
                    # Try to remove the destination, if it existed
                    self.remove(op_tuple[2])
                except InvalidUsageException:
                    # Okay, it didn't exist
                    pass
                self.handle_op((op, src, op_tuple[2]))

    def do_detect(self, arg):
        '''
        detect : auto-detect possible moves (where source/destination name
                 is unique). If not all moves are applicable, save move list,
                 edit and load.
        '''
        self.parse_args(arg, 0, "detect")
        self.detect({ "D": self.config.get('dir-similarity'),
            "F": self.config.get('file-similarity')})

    def do_apply(self, arg):
        '''
        apply : Perform copies/renames; then copy imported sources into
                the working copy. Modifies working copy. Exits after
                completion.
        '''
        self.info(1, "Copying imported sources into working copy...")
        # Perform the recorded copies/moves/removals
        self.info(2, "  Preparatory operations (copies/renames/removals)")
        to_be_diffed = []
        for o in self.prepare_ops:
            op = o[0]
            if op == "mv":
                self.run_svn(["mv", "--parents", o[1], o[2]])
                to_be_diffed.append(o[2])
            elif op == "cp":
                self.run_svn(["cp", "--parents", o[1], o[2]])
                to_be_diffed.append(o[2])
            elif op == "rm":
                # --force, as the removed path is likely created as a result
                # of previous copy/rename
                self.run_svn(["rm", "--force", o[1]])
        dirs_added = []
        dirs_removed = []
        files_added = []
        files_removed = []
        self.info(2, "  Creating dirs and copying files...")
        for i in sorted(self.items.keys()):
            e = self.items[i]
            nk_wc = e.state[S_WC]
            nk_im = e.state[S_IM]
            flg = None
            if nk_wc == "-":
                # Absent in working copy
                if nk_im == "D":
                    # Directory added
                    os.mkdir(os.path.join(self.wcdir, i))
                    dirs_added.append(i)
                    flg = "(added dir)"
                elif nk_im == "F":
                    # New file added
                    shutil.copyfile(os.path.join(self.importdir, i),
                            os.path.join(self.wcdir, i))
                    files_added.append(i)
                    flg = "(added file)"
                else:
                    # Not in imported sources, not in WC (moved
                    # away/removed) - nothing to do
                    pass
            elif nk_wc == "F":
                # File in a working copy
                if nk_im == "D":
                    # File replaced with a directory. See comment above.
                    self.run_svn(["rm", "--force", i])
                    os.mkdir(os.path.join(self.wcdir, i))
                    dirs_added.append(i)
                    flg = "(replaced file with dir)"
                elif nk_im == "F":
                    # Was a file, is a file - just copy contents
                    shutil.copyfile(os.path.join(self.importdir, i),
                            os.path.join(self.wcdir, i))
                    flg = "(copied)"
                else:
                    # Was a file, removed
                    files_removed.append(i)
                    flg = "(removed file)"
            elif nk_wc == "D":
                # Directory in a working copy
                if nk_im == "D":
                    # Was a directory, is a directory - nothing to do
                    pass
                elif nk_im == "F":
                    # Directory replaced with file. Need to remove dir
                    # immediately, as bulk removals/additions assume new files
                    # and dirs already in place.
                    self.run_svn(["rm", "--force", i])
                    shutil.copyfile(os.path.join(self.importdir, i),
                            os.path.join(self.wcdir, i))
                    files_added.append(i)
                    flg = "(replaced dir with file)"
                else:
                    # Directory removed
                    dirs_removed.append(i)
                    flg = "(removed dir)"
            if flg is not None:
                self.info(4, "    %s %s %s" % (e.status(), i, flg))
        # Filter files/directories removed as a part of parent directory
        files_removed = list(filter(lambda x: os.path.dirname(x) not in
                dirs_removed, files_removed))
        dirs_removed = list(filter(lambda x: os.path.dirname(x) not in
            dirs_removed, dirs_removed))
        files_added = list(filter(lambda x: os.path.dirname(x) not in
            dirs_added, files_added))
        dirs_added = list(filter(lambda x: os.path.dirname(x) not in
            dirs_added, dirs_added))
        self.info(2, "  Running SVN add/rm commands");
        if len(dirs_added):
            self.run_svn(["add"], dirs_added)
        if len(files_added):
            self.run_svn(["add"], files_added)
        if len(dirs_removed):
            self.run_svn(["rm"], dirs_removed)
        if len(files_removed):
            self.run_svn(["rm"], files_removed)
        # Save the diff for the copied/moved items
        diff_save = self.config.get('save-diff-copied')
        if diff_save is not None:
            self.info(2, "  Saving 'svn diff' on copied files/dirs to `%s'" %
                    diff_save)
            to_be_diffed = list(filter(lambda x: os.path.dirname(x) not in
                to_be_diffed, to_be_diffed))
            if len(to_be_diffed):
                try:
                    rv, out = self.run_svn(["diff"], to_be_diffed)
                except UnicodeDecodeError:
                    # Some binary files not marked with appropriate MIME type,
                    # or broken text files
                    rv, out = (True, "WARNING: diff contained binary files\n")
            else:
                rv, out = (True, "")
            if rv:
                f = open(diff_save, "w")
                f.write(out)
                f.close()
        # Exiting, as the resulting working copy can no longer be used
        # for move analysis
        self.info(1, "Done. Exiting; please examine the working copy " +
                "and commit.")
        return True

    def do_similarity(self, arg):
        '''
        similarity SRD DST : estimate whether SRC could be potential source
                             for DST (0=no match, 1000=perfect match)
        '''
        src, dst = self.parse_args(arg, 2, "similarity")
        sim = self.similar(src, dst)
        if sim is not None:
            print("Similarity between source `%s' and destination `%s': %4d" %
                    (src, dst, sim))

    def do_set(self, arg):
        '''
        set         : display current settings
        set CFG VAL : set a config variable
        '''
        if arg.strip() == '':
            self.config.print()
        else:
            cfg, val = self.parse_args(arg, 2, "set")
            self.config.set(cfg, val)

    def do_move(self, arg):
        '''
        move SRC DST : Perform a move from source to destination
        '''
        src, dst = self.parse_args(arg, 2, "move")
        self.copy_or_move("mv", src, dst)

    def do_copy(self, arg):
        '''
        copy SRC DST : Perform a copy from source to destination
        '''
        src, dst = self.parse_args(arg, 2, "copy")
        self.copy_or_move("cp", src, dst)

    def do_remove(self, arg):
        '''
        remove PATH : Remove a path
        '''
        path = self.parse_args(arg, 1, "remove")[0]
        self.copy_or_move("rm", path)

    def do_lsprep(self, arg):
        '''
        lsprep : List the currently recorded moves/copies/removals
        '''
        self.parse_args(arg, 0, "lsprep")
        colsz = int((self.termwidth - 14) / 2)
        if len(self.prepare_ops):
            print("Currently recorded preparatory operations:")
            print()
            print("%5s  %s  %-*s  %-*s" %
                    ("#", "Op", colsz, "Source", colsz, "Destination"))
            for id, o in enumerate(self.prepare_ops):
                if id % 10 == 0:
                    print("%5s  %s  %*s  %*s" %
                            ("-"*5, "--", colsz, "-"*colsz, colsz, "-"*colsz))
                if len(o) == 3:
                    print("%5d  %s  %-*s  %-*s" %
                            (id, o[0], colsz, o[1], colsz, o[2]))
                else:
                    print("%5d  %s  %-*s" % (id, o[0], colsz, o[1]))
            print()
        else:
            print("No copies/moves/removals recorded")
            print()

    def do_save(self, arg):
        '''
        save FILENAME : Save current preparation operations to a file
        '''
        fn = self.parse_args(arg, 1, "save")[0]
        f = open(fn, 'w')
        longestname = 0
        for o in self.prepare_ops:
            if len(o[1]) > longestname:
                longestname = len(o[1])
            if len(o) == 3 and len(o[2]) > longestname:
                longestname = len(o[2])
        for o in self.prepare_ops:
            if len(o) == 2:
                f.write("svn %s %-*s\n" %
                        (o[0], longestname, shlex.quote(o[1])))
            else:
                f.write("svn %s %-*s %-*s\n" %
                        (o[0], longestname, shlex.quote(o[1]),
                            longestname, shlex.quote(o[2])))
            pass
        f.close()

    def do_load(self, arg):
        '''
        load FILENAME : Load/append preparation operations from a file
        '''
        fn = self.parse_args(arg, 1, "load")[0]
        self.info(1, "Performing operations from `%s'" % fn)
        f = open(fn, 'r')
        for l in f.readlines():
            if l[0] == '#':
                continue
            args = shlex.split(l)
            try:
                if len(args) < 2 or args[0] != 'svn':
                    raise InvalidUsageException(None, "")
                self.handle_op(args[1:])
            except InvalidUsageException as e:
                # Rethrow
                raise InvalidUsageException(None,
                        "Invalid line in file: %s(%s)" % (l, e))
        f.close()

    def do_svninfo(self, arg):
        '''
        svninfo : Display SVN info on the working copy (debug)
        '''
        self.parse_args(arg, 0, "svninfo")
        print(str(self.svninfo))

    def do_printlst(self, arg):
        '''
        printlst WHAT : Print list of files; WHAT is one of {dir,file} (debug)
        '''
        self.parse_args(arg, 0, "printlst")
        self.items.print()

    def do_help(self, arg):
        '''
        help [COMMAND] : Print the help message
        '''
        cmd.Cmd.do_help(self, arg)

    def do_EOF(self, arg):
        '''
        Quit the script
        '''
        return True

    def do_quit(self, arg):
        '''
        quit : Quit the script
        '''
        return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
            description="Prepare a working copy for SVN vendor import.")
    parser.add_argument('wcdir',
            help="Path to working copy (destination of import)")
    parser.add_argument('importdir',
            help="Path to imported sources (source of import)")
    grp = parser.add_mutually_exclusive_group()
    grp.add_argument('--auto', action='store_true',
            help="Automatic mode: detect moves, apply them and copy sources")
    grp.add_argument('--detect', metavar='FILE',
            help="Semi-automatic mode: detect moves and save them to FILE")
    grp.add_argument('--apply', metavar='FILE',
            help="Semi-automatic mode: apply the moves from FILE " +
            "and copy the sources")
    parser.add_argument('--save', metavar='FILE',
            help="Automatic mode: save moves to FILE after detection, " +
            "then proceed to apply the changes")
    parser.add_argument('--config', metavar=('OPT','VALUE'), action='append',
            nargs=2, help="Set configuration option OPT to VALUE")
    args = parser.parse_args()
    p = subprocess.Popen(args=['svn', 'info', args.wcdir],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    so, se = p.communicate()
    if p.returncode != 0:
        print("%s: does not appear to be SVN working copy." % args.wcdir)
        print("`svn info' exited with status %d and returned:" % p.returncode)
        print()
        print(se.decode())
        sys.exit(1)
    imp = SvnVndImport(args.wcdir, args.importdir, so.decode())
    if args.config:
        try:
            for o, v in args.config:
                imp.config.set(o, v)
        except InvalidUsageException as e:
            parser.error(e)
    imp.scan()
    if args.auto:
        imp.onecmd("detect")
        if args.save:
            imp.onecmd("save " + shlex.quote(args.save))
        imp.onecmd("apply")
    elif args.detect:
        imp.onecmd("detect")
        imp.onecmd("save " + shlex.quote(args.detect))
    elif args.apply:
        imp.onecmd("load " + shlex.quote(args.apply))
        imp.onecmd("apply")
    else:
        imp.cmdloop()