# MimeDel.py


# Copyright (C) 2002-2009 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""MIME-stripping filter for Mailman.

This module scans a message for MIME content, removing those sections whose
MIME types match one of a list of matches.  multipart/alternative sections are
replaced by the first non-empty component, and multipart/mixed sections
wrapping only single sections after other processing are replaced by their
contents.
"""

import os
import errno
import tempfile
from os.path import splitext

from email.Iterators import typed_subpart_iterator

from Mailman import mm_cfg
from Mailman import Errors
from Mailman.Message import UserNotification
from Mailman.Queue.sbcache import get_switchboard
from Mailman.Logging.Syslog import syslog
from Mailman.Version import VERSION
from Mailman.i18n import _
from Mailman.Utils import oneline



def process(mlist, msg, msgdata):
    """Apply the list's content-filtering rules to msg, modifying it in place.

    mlist is the mailing list object; its filter_content, filter_mime_types,
    pass_mime_types, filter_filename_extensions, pass_filename_extensions,
    collapse_alternatives and convert_html_to_plaintext attributes drive the
    filtering.  msg is the message being processed and msgdata its metadata
    mapping.

    Nothing is returned.  When the whole message is rejected by the rules,
    dispose() is called, which always raises (Errors.RejectMessage or
    Errors.DiscardMessage).  If the message was altered, an
    X-Content-Filtered-By: header is added.
    """
    # Short-circuits: filtering is switched off for this list, or the
    # metadata flags this message as a digest.
    if not mlist.filter_content:
        return
    if msgdata.get('isdigest'):
        return
    ctype = msg.get_content_type()
    mtype = msg.get_content_maintype()
    # Check to see if the outer type matches one of the filter types.  Note
    # that dispose() never returns; it always raises.
    filtertypes = mlist.filter_mime_types
    passtypes = mlist.pass_mime_types
    if ctype in filtertypes or mtype in filtertypes:
        dispose(mlist, msg, msgdata,
                _("The message's content type was explicitly disallowed"))
    # Check to see if there is a pass types and the outer type doesn't match
    # one of these types
    if passtypes and not (ctype in passtypes or mtype in passtypes):
        dispose(mlist, msg, msgdata,
                _("The message's content type was not explicitly allowed"))
    # Filter by file extensions
    filterexts = mlist.filter_filename_extensions
    passexts = mlist.pass_filename_extensions
    fext = get_file_ext(msg)
    if fext:
        if fext in filterexts:
            dispose(mlist, msg, msgdata,
                 _("The message's file extension was explicitly disallowed"))
        if passexts and not (fext in passexts):
            dispose(mlist, msg, msgdata,
                 _("The message's file extension was not explicitly allowed"))
    # Remember how many parts the message has now, so that we can tell later
    # whether any of them were removed.
    numparts = len(list(msg.walk()))
    # If the message is a multipart, filter out matching subparts
    if msg.is_multipart():
        # Recursively filter out any subparts that match the filter list
        prelen = len(msg.get_payload())
        filter_parts(msg, filtertypes, passtypes, filterexts, passexts)
        # If the outer message is now an empty multipart (and it wasn't
        # before!) then, again it gets discarded.
        postlen = len(msg.get_payload())
        if postlen == 0 and prelen > 0:
            dispose(mlist, msg, msgdata,
                    _("After content filtering, the message was empty"))
    # Now replace multipart/alternative parts with just their first
    # alternative.  BAW: We have to special case when the outer part is a
    # multipart/alternative because we need to retain most of the outer part's
    # headers.  For now we'll move the subpart's payload into the outer part,
    # and then copy over its Content-Type: and Content-Transfer-Encoding:
    # headers (any others?).
    if mlist.collapse_alternatives:
        collapse_multipart_alternatives(msg)
        if ctype == 'multipart/alternative':
            # An outer part that merely claims to be multipart/alternative
            # may have no subparts at all (IndexError) or a non-list payload
            # (TypeError).  There is nothing to promote then, so leave the
            # message as it is instead of crashing.
            try:
                firstalt = msg.get_payload(0)
            except (IndexError, TypeError):
                firstalt = None
            if firstalt is not None:
                reset_payload(msg, firstalt)
    # If we removed some parts, make note of this
    changedp = 0
    if numparts != len(list(msg.walk())):
        changedp = 1
    # Now perhaps convert all text/html to text/plain
    if mlist.convert_html_to_plaintext and mm_cfg.HTML_TO_PLAIN_TEXT_COMMAND:
        changedp += to_plaintext(msg)
    # If we're left with only two parts, an empty body and one attachment,
    # recast the message to one of just that part
    if msg.is_multipart() and len(msg.get_payload()) == 2:
        if msg.get_payload(0).get_payload() == '':
            useful = msg.get_payload(1)
            reset_payload(msg, useful)
            changedp = 1
    if changedp:
        msg['X-Content-Filtered-By'] = 'Mailman/MimeDel %s' % VERSION



def reset_payload(msg, subpart):
    """Make msg carry subpart's payload and content headers.

    msg keeps all of its other headers.  Its Content-Type,
    Content-Transfer-Encoding, Content-Disposition and Content-Description
    headers are dropped and then re-created from subpart; Content-Type falls
    back to text/plain when subpart has none, the other three are only set
    when subpart supplies a non-empty value.
    """
    msg.set_payload(subpart.get_payload())
    # Drop every content header first so no stale value survives.
    for stale in ('content-type', 'content-transfer-encoding',
                  'content-disposition', 'content-description'):
        del msg[stale]
    msg['Content-Type'] = subpart.get('content-type', 'text/plain')
    # Optional headers are copied over only when the subpart has them.
    for header in ('Content-Transfer-Encoding', 'Content-Disposition',
                   'Content-Description'):
        value = subpart.get(header)
        if value:
            msg[header] = value



def filter_parts(msg, filtertypes, passtypes, filterexts, passexts):
    """Recursively strip disallowed subparts from msg, in place.

    A subpart is dropped when filtering emptied it out, when its content
    type or main type is in filtertypes, when passtypes is non-empty and
    lists neither its content type nor its main type, or when it has a
    filename extension that is in filterexts or is missing from a non-empty
    passexts.

    Returns 0 if msg had subparts and every one of them was removed,
    otherwise 1 (which includes the case where msg is not a multipart).
    """
    if not msg.is_multipart():
        return 1
    original = msg.get_payload()
    survivors = []
    for part in original:
        # Filter nested containers first; one that ended up empty goes away.
        if not filter_parts(part, filtertypes, passtypes,
                            filterexts, passexts):
            continue
        full_type = part.get_content_type()
        main_type = part.get_content_maintype()
        # Explicitly disallowed type
        if full_type in filtertypes or main_type in filtertypes:
            continue
        # A pass list exists and this type is not on it
        if (passtypes and full_type not in passtypes
                and main_type not in passtypes):
            continue
        # Same two rules again, this time for the filename extension
        ext = get_file_ext(part)
        if ext:
            if ext in filterexts:
                continue
            if passexts and ext not in passexts:
                continue
        survivors.append(part)
    msg.set_payload(survivors)
    # Tell the caller when everything that used to be here was thrown away
    if original and not survivors:
        return 0
    return 1



def collapse_multipart_alternatives(msg):
    """Replace multipart/alternative subparts of msg by their first alternative.

    msg is modified in place; nothing is returned.  A message that is not a
    multipart is left untouched.  Each multipart/alternative subpart is
    replaced by its first alternative; one that has no usable first
    alternative (no subparts, or a payload that is not a list) is dropped.

    Other multipart/* subparts are processed recursively, so alternatives
    buried inside e.g. a multipart/related or multipart/mixed container are
    collapsed as well.  Attached messages (message/rfc822) are deliberately
    not descended into: replacing the attached message by one of its
    alternatives would throw away that message's own headers.
    """
    if not msg.is_multipart():
        return
    newpayload = []
    for subpart in msg.get_payload():
        if subpart.get_content_type() == 'multipart/alternative':
            try:
                firstalt = subpart.get_payload(0)
            except (IndexError, TypeError):
                # Nothing usable inside; the part simply disappears.
                continue
            newpayload.append(firstalt)
        else:
            if subpart.get_content_maintype() == 'multipart':
                # Collapse alternatives nested deeper in this container.
                collapse_multipart_alternatives(subpart)
            newpayload.append(subpart)
    msg.set_payload(newpayload)



def to_plaintext(msg):
    changedp = 0
    for subpart in typed_subpart_iterator(msg, 'text', 'html'):
        filename = tempfile.mktemp('.html')
        fp = open(filename, 'w')
        try:
            fp.write(subpart.get_payload(decode=1))
            fp.close()
            cmd = os.popen(mm_cfg.HTML_TO_PLAIN_TEXT_COMMAND %
                           {'filename': filename})
            plaintext = cmd.read()
            rtn = cmd.close()
            if rtn:
                syslog('error', 'HTML->text/plain error: %s', rtn)
        finally:
            try:
                os.unlink(filename)
            except OSError, e:
                if e.errno <> errno.ENOENT: raise
        # Now replace the payload of the subpart and twiddle the Content-Type:
        del subpart['content-transfer-encoding']
        subpart.set_payload(plaintext)
        subpart.set_type('text/plain')
        changedp = 1
    return changedp



def dispose(mlist, msg, msgdata, why):
    """Get rid of a message that failed content filtering.  Never returns.

    What happens depends on mlist.filter_action:

    1 -- raise Errors.RejectMessage with why as its argument
    2 -- forward the message through mlist.ForwardMessage(), then discard
    3 -- enqueue the message in the mm_cfg.BADQUEUE_DIR switchboard (only
         if mm_cfg.OWNERS_CAN_PRESERVE_FILTERED_MESSAGES is true), then
         discard

    Any other value, 0 included, just discards.  Discarding means raising
    Errors.DiscardMessage.
    """
    action = mlist.filter_action
    if action == 1:
        # Bounce the message to the original author
        raise Errors.RejectMessage(why)
    elif action == 2:
        # Forward it on to the list owner.  NOTE(review): the text below
        # contains %(listname)s and nothing else reads this local, so _()
        # presumably interpolates from this frame -- keep the name.
        listname = mlist.internal_name()
        notice = _("""\
The attached message matched the %(listname)s mailing list's content filtering
rules and was prevented from being forwarded on to the list membership.  You
are receiving the only remaining copy of the discarded message.

""")
        mlist.ForwardMessage(
            msg,
            text=notice,
            subject=_('Content filtered message notification'))
    elif action == 3 and mm_cfg.OWNERS_CAN_PRESERVE_FILTERED_MESSAGES:
        # Keep a copy of the message in the bad queue
        get_switchboard(mm_cfg.BADQUEUE_DIR).enqueue(msg, msgdata)
    # Every path that gets here ends with the message being discarded
    raise Errors.DiscardMessage

def get_file_ext(m):
    """Return the lower-cased filename extension of message part m.

    The filename comes from m.get_filename(), falling back to the 'name'
    parameter of the part.  The leading dot is not included in the result.
    An empty string is returned when there is no filename or the filename
    has no extension.

    Caution: some viruses don't put a filename in the Content-Disposition:
    header.
    """
    filename = m.get_filename('') or m.get_param('name', '')
    if not filename:
        return ''
    extension = splitext(oneline(filename, 'utf-8'))[1]
    # A lone '.' (or nothing at all) does not count as an extension
    if len(extension) <= 1:
        return ''
    return extension[1:].lower()