cleanarch   [plain text]


#! @PYTHON@

# Copyright (C) 2001-2006 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Clean up an .mbox archive file.

The archiver looks for Unix-From lines separating messages in an mbox archive
file.  For compatibility, it specifically looks for lines that start with
"From " -- i.e. the letters capital-F, lowercase-r, o, m, space, ignoring
everything else on the line.

Normally, any lines that start "From " in the body of a message should be
escaped such that a > character is actually the first on a line.  It is
possible though that body lines are not actually escaped.  This script
attempts to fix these by doing a stricter test of the Unix-From lines.  Any
lines that start "From " but do not pass this stricter test are escaped with a
> character.

Usage: cleanarch [options] < inputfile > outputfile
Options:
    -s n
    --status=n
        Print a # character every n lines processed

    -q / --quiet
        Don't print changed line information to standard error.

    -n / --dry-run
        Don't actually output anything.

    -h / --help
        Print this message and exit
"""

import re
import sys
import getopt
import mailbox

import paths
from Mailman.i18n import _

cre = re.compile(mailbox.UnixMailbox._fromlinepattern)

# From RFC 2822, a header field name must contain only characters from 33-126
# inclusive, excluding colon.  I.e. from oct 41 to oct 176 less oct 072.  Must
# use re.match() so that it's anchored at the beginning of the line.
fre = re.compile(r'[\041-\071\073-\176]+')



def usage(code, msg=''):
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    print >> fd, _(__doc__)
    if msg:
        print >> fd, msg
    sys.exit(code)



def escape_line(line, lineno, quiet, output):
    if output:
        sys.stdout.write('>' + line)
    if not quiet:
        print >> sys.stderr, _('Unix-From line changed: %(lineno)d')
        print >> sys.stderr, line[:-1]



def main():
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hqns:',
            ['help', 'quiet', 'dry-run', 'status='])
    except getopt.error, msg:
        usage(1, msg)

    quiet = False
    output = True
    status = -1

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-q', '--quiet'):
            quiet = True
        elif opt in ('-n', '--dry-run'):
            output = False
        elif opt in ('-s', '--status'):
            try:
                status = int(arg)
            except ValueError:
                usage(1, _('Bad status number: %(arg)s'))

    if args:
        usage(1)

    lineno = 0
    statuscnt = 0
    messages = 0
    prevline = None
    while True:
        lineno += 1
        line = sys.stdin.readline()
        if not line:
            break
        if line.startswith('From '):
            if cre.match(line):
                # This is a real Unix-From line.  But it could be a message
                # /about/ Unix-From lines, so as a second order test, make
                # sure there's at least one RFC 2822 header following
                nextline = sys.stdin.readline()
                lineno += 1
                if not nextline:
                    # It was the last line of the mbox, so it couldn't have
                    # been a Unix-From
                    escape_line(line, lineno, quiet, output)
                    break
                fieldname = nextline.split(':', 1)
                if len(fieldname) < 2 or not fre.match(nextline):
                    # The following line was not a header, so this wasn't a
                    # valid Unix-From
                    escape_line(line, lineno, quiet, output)
                    if output:
                        sys.stdout.write(nextline)
                else:
                    # It's a valid Unix-From line
                    messages += 1
                    if output:
                        # Before we spit out the From_ line, make sure the
                        # previous line was blank.
                        if prevline is not None and prevline <> '\n':
                            sys.stdout.write('\n')
                        sys.stdout.write(line)
                        sys.stdout.write(nextline)
            else:
                # This is a bogus Unix-From line
                escape_line(line, lineno, quiet, output)
        elif output:
            # Any old line
            sys.stdout.write(line)
        if status > 0 and (lineno % status) == 0:
            sys.stderr.write('#')
            statuscnt += 1
            if statuscnt > 50:
                print >> sys.stderr
                statuscnt = 0
        prevline = line
    print >> sys.stderr, _('%(messages)d messages found')



if __name__ == '__main__':
    main()