from __future__ import nested_scopes
import mailbox
import os
import re
import sys
import time
from email.Utils import parseaddr, parsedate_tz, mktime_tz, formatdate
import cPickle as pickle
from cStringIO import StringIO
from string import lowercase
__version__ = '0.09 (Mailman edition)'
VERSION = __version__
# Number of unpickled Article objects kept in BSDDBdatabase's LRU cache.
CACHESIZE = 100

from Mailman import Errors
from Mailman.Mailbox import ArchiverMailbox
from Mailman.Logging.Syslog import syslog
from Mailman.i18n import _

# Python 2.2 and earlier don't have the True/False builtins; emulate
# them with plain integers so the rest of the module can use them.
try:
    True, False
except NameError:
    True = 1
    False = 0

SPACE = ' '

# Captures an angle-bracketed Message-ID (e.g. "<id@host>") from a
# raw header value.
msgid_pat = re.compile(r'(<.*>)')
def strip_separators(s):
    """Remove quotes or parenthesization from a Message-ID string.

    An empty or None value yields the empty string.  The surrounding
    pair is stripped when the first character is one of ``"<([`` and
    the last is one of ``">)]`` (the two need not actually match).
    """
    if not s:
        return ""
    opener, closer = s[0], s[-1]
    if opener in '"<([' and closer in '">)]':
        return s[1:-1]
    return s
# Lowercase surname particles that belong with the last name.
smallNameParts = ['van', 'von', 'der', 'de']

def fixAuthor(author):
    """Canonicalize a name into "Last, First" format.

    A name already containing a comma is assumed to be canonical and is
    returned unchanged, as is a single-word name.  Surname particles
    (van, von, der, de) are kept attached to the last name.
    """
    if ',' in author:
        return author
    L = author.split()
    i = len(L) - 1
    if i == 0:
        # One-word name -- nothing to rearrange.
        return author
    # Fix for the mangled original, which fused the return above with
    # this condition on one line (a syntax error).
    if author.upper() == author or author.lower() == author:
        # The name is all upper- or lower-case, so case can't guide us;
        # only pull in known surname particles.
        while i > 0 and L[i-1].lower() in smallNameParts:
            i = i - 1
    else:
        # Mixed case; assume lowercase-initial words before the surname
        # are small parts of the last name, and also check the list.
        while i > 0 and (L[i-1][0] in lowercase or
                         L[i-1].lower() in smallNameParts):
            i = i - 1
    author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i])
    return author
class DatabaseInterface:
    """Abstract interface for a pipermail article database.

    Concrete implementations (see Database / BSDDBdatabase below)
    maintain per-archive indices of articles sorted by date, author,
    subject and thread key.  All methods here are no-op stubs.
    """
    def __init__(self): pass
    def close(self): pass
    # Return the stored Article object for msgid in the given archive.
    def getArticle(self, archive, msgid): pass
    # True if msgid is already filed in the archive.
    def hasArticle(self, archive, msgid): pass
    # File an article; subject/author/date override the article's own
    # values for index-sorting purposes when provided.
    def addArticle(self, archive, article, subject=None, author=None,
                   date=None): pass
    # Date of the oldest article in the archive (display string).
    def firstdate(self, archive): pass
    # Date of the newest article in the archive (display string).
    def lastdate(self, archive): pass
    # Return the msgid of the first entry of the named index.
    def first(self, archive, index): pass
    # Return the msgid of the next entry of the named index.
    def next(self, archive, index): pass
    def numArticles(self, archive): pass
    def newArchive(self, archive): pass
    # Record msgid under its (threadKey, msgid) sort key.
    def setThreadKey(self, archive, key, msgid): pass
    # Oldest article with the given subject, for thread guessing.
    def getOldestArticle(self, subject): pass
class Database(DatabaseInterface):
    """Define the basic sorting logic for a database

    Assumes that the database internally uses dateIndex, authorIndex,
    etc.
    """

    def __init__(self):
        # Plain dicts serve as the four sort indices plus the set of
        # archives whose entries changed (keyed (archive, msgid)).
        self.dateIndex = {}
        self.authorIndex = {}
        self.subjectIndex = {}
        self.articleIndex = {}
        self.changed = {}

    def addArticle(self, archive, article, subject=None, author=None,
                   date=None):
        """File an article under its author, subject and date keys and
        compute its thread key from its parent (if already present)."""
        msgid = article.msgid
        self.authorIndex[(author or article.author,
                          article.date, msgid)] = msgid
        self.subjectIndex[(subject or article.subject,
                           article.date, msgid)] = msgid
        self.dateIndex[(date or article.date, msgid)] = msgid
        self.store_article(article)
        self.changed[archive, msgid] = None

        # A child's thread key extends its parent's, so siblings sort
        # together and depth equals the number of '-' separators.
        parentID = article.parentID
        if parentID is not None and parentID in self.articleIndex:
            parent = self.getArticle(archive, parentID)
            threadKey = '%s%s.%s-' % (parent.threadKey, article.date,
                                      article.sequence)
        else:
            threadKey = '%s.%s-' % (article.date, article.sequence)
        article.threadKey = threadKey
        self.setThreadKey(archive, (threadKey, msgid), msgid)

    def store_article(self, article):
        """Store article without message body to save space"""
        # Temporarily detach the (possibly large) bodies, pickle the
        # stripped object, then put everything back.
        saved_body = article.body
        saved_html = article.html_body
        article.body = []
        del article.html_body
        self.articleIndex[article.msgid] = pickle.dumps(article)
        article.body = saved_body
        article.html_body = saved_html
class Article:
    # Timestamp of the most recently processed article; used to
    # fabricate a strictly increasing date for undated messages.
    # NOTE(review): updated via self, so it becomes an instance
    # attribute after the first assignment.
    _last_article_time = time.time()

    def __init__(self, message = None, sequence = 0, keepHeaders = []):
        """Distill the archiver-relevant parts of an email message.

        message -- an email Message object (None builds an empty shell,
                   e.g. when restoring from a pickle).
        sequence -- serial number of this article in the archive.
        keepHeaders -- names of extra headers to copy into self.headers.
        """
        if message is None:
            return
        self.sequence = sequence
        self.parentID = None
        self.threadKey = None
        # Use the Message-ID as the article key, falling back to the
        # sequence number when the header is missing or empty.
        id = strip_separators(message['Message-Id'])
        if id == "":
            self.msgid = str(self.sequence)
        else: self.msgid = id

        if message.has_key('Subject'):
            self.subject = str(message['Subject'])
        else:
            self.subject = _('No subject')
        if self.subject == "": self.subject = _('No subject')

        self._set_date(message)

        # Figure out the e-mail address and poster's name; fall back to
        # Reply-To when From carries no usable address.
        self.author, self.email = parseaddr(message.get('From', ''))
        e = message['Reply-To']
        if not self.email and e is not None:
            ignoreauthor, self.email = parseaddr(e)
        self.email = strip_separators(self.email)
        self.author = strip_separators(self.author)

        if self.author == "":
            self.author = self.email

        # Save the raw threading headers verbatim for later use.
        self._in_reply_to = message['in-reply-to']
        self._references = message['references']
        self._message_id = message['message-id']

        # Parsed parent pointer: first angle-bracketed id in In-Reply-To.
        i_r_t = message['In-Reply-To']
        if i_r_t is None:
            self.in_reply_to = ''
        else:
            match = msgid_pat.search(i_r_t)
            if match is None: self.in_reply_to = ''
            else: self.in_reply_to = strip_separators(match.group(1))

        # References becomes a list of bare message-ids.
        references = message['References']
        if references is None:
            self.references = []
        else:
            self.references = map(strip_separators, references.split())

        # Save any other interesting headers the caller asked for.
        self.headers = {}
        for i in keepHeaders:
            if message.has_key(i):
                self.headers[i] = message[i]

        # Read the message body; when the payload can't be decoded,
        # fall back to the raw text after the first blank line.
        s = StringIO(message.get_payload(decode=True)\
                     or message.as_string().split('\n\n',1)[1])
        self.body = s.readlines()

    def _set_date(self, message):
        # Derive a sortable numeric date from the Date header, then
        # X-List-Received-Date, then a fabricated monotonic fallback.
        def floatdate(header):
            missing = []
            datestr = message.get(header, missing)
            if datestr is missing:
                return None
            date = parsedate_tz(datestr)
            try:
                return mktime_tz(date)
            except (TypeError, ValueError, OverflowError):
                # Unparseable or out-of-range date header.
                return None
        date = floatdate('date')
        if date is None:
            date = floatdate('x-list-received-date')
        if date is None:
            # No usable date at all; one second after the previous
            # article keeps archive ordering stable.
            date = self._last_article_time + 1
        self._last_article_time = date
        # Zero-padded string so lexical order == chronological order.
        self.date = '%011i' % date
        self.datestr = message.get('date') \
                       or message.get('x-list-received-date') \
                       or formatdate(date)

    def __repr__(self):
        return '<Article ID = '+repr(self.msgid)+'>'

    def finished_update_article(self):
        # Hook called when the sliding thread window drops this article;
        # subclasses may override to release resources.
        pass
class T:
DIRMODE = 0755 FILEMODE = 0644 INDEX_EXT = ".html"
def __init__(self, basedir = None, reload = 1, database = None):
if basedir is None:
self.basedir = os.getcwd()
else:
basedir = os.path.expanduser(basedir)
self.basedir = basedir
self.database = database
try:
os.stat(self.basedir)
except os.error, errdata:
errno, errmsg = errdata
if errno != 2:
raise os.error, errdata
else:
self.message(_('Creating archive directory ') + self.basedir)
omask = os.umask(0)
try:
os.mkdir(self.basedir, self.DIRMODE)
finally:
os.umask(omask)
try:
if not reload:
raise IOError
f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r')
self.message(_('Reloading pickled archive state'))
d = pickle.load(f)
f.close()
for key, value in d.items():
setattr(self, key, value)
except (IOError, EOFError):
self.archives = [] self._dirty_archives = [] self.sequence = 0 self.update_TOC = 0 if basedir != self.basedir:
self.basedir = basedir
def close(self):
"Close an archive, save its state, and update any changed archives."
self.update_dirty_archives()
self.update_TOC = 0
self.write_TOC()
self.message(_('Pickling archive state into ')
+ os.path.join(self.basedir, 'pipermail.pck'))
self.database.close()
del self.database
omask = os.umask(007)
try:
f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
finally:
os.umask(omask)
pickle.dump(self.getstate(), f)
f.close()
def getstate(self):
return self.__dict__
def __set_parameters(self, archive):
firstdate = self.database.firstdate(archive)
lastdate = self.database.lastdate(archive)
now = time.asctime(time.localtime(time.time()))
self.firstdate = firstdate
self.lastdate = lastdate
self.archivedate = now
self.size = self.database.numArticles(archive)
self.archive = archive
self.version = __version__
def __findParent(self, article, children = []):
parentID = None
if article.in_reply_to:
parentID = article.in_reply_to
elif article.references:
refs = filter(self.articleIndex.has_key, article.references)
if not refs:
return None
maxdate = self.database.getArticle(self.archive,
refs[0])
for ref in refs[1:]:
a = self.database.getArticle(self.archive, ref)
if a.date > maxdate.date:
maxdate = a
parentID = maxdate.msgid
else:
try:
key, tempid = \
self.subjectIndex.set_location(article.subject)
print key, tempid
self.subjectIndex.next()
[subject, date] = key.split('\0')
print article.subject, subject, date
if subject == article.subject and tempid not in children:
parentID = tempid
except KeyError:
pass
return parentID
def updateThreadedIndex(self):
self.database.clearIndex(self.archive, 'thread')
msgid = self.database.first(self.archive, 'date')
while msgid is not None:
try:
article = self.database.getArticle(self.archive, msgid)
except KeyError:
pass
else:
if article.parentID is None or \
not self.database.hasArticle(self.archive,
article.parentID):
pass
else:
parent = self.database.getArticle(self.archive,
article.parentID)
article.threadKey = (parent.threadKey + article.date + '.'
+ str(article.sequence) + '-')
self.database.setThreadKey(self.archive,
(article.threadKey, article.msgid),
msgid)
msgid = self.database.next(self.archive, 'date')
def update_archive(self, archive):
self.archive = archive
self.message(_("Updating index files for archive [%(archive)s]"))
arcdir = os.path.join(self.basedir, archive)
self.__set_parameters(archive)
for hdr in ('Date', 'Subject', 'Author'):
self._update_simple_index(hdr, archive, arcdir)
self._update_thread_index(archive, arcdir)
def _update_simple_index(self, hdr, archive, arcdir):
self.message(" " + hdr)
self.type = hdr
hdr = hdr.lower()
self._open_index_file_as_stdout(arcdir, hdr)
self.write_index_header()
count = 0
msgid = self.database.first(archive, hdr)
while msgid is not None:
try:
article = self.database.getArticle(self.archive, msgid)
except KeyError:
pass
else:
count = count + 1
self.write_index_entry(article)
msgid = self.database.next(archive, hdr)
self.write_index_footer()
self._restore_stdout()
def _update_thread_index(self, archive, arcdir):
self.message(_(" Thread"))
self._open_index_file_as_stdout(arcdir, "thread")
self.type = 'Thread'
self.write_index_header()
L = [None] * 5
i = 2
msgid = self.database.first(self.archive, 'thread')
while msgid is not None and i < 5:
L[i] = self.database.getArticle(self.archive, msgid)
i = i + 1
msgid = self.database.next(self.archive, 'thread')
while L[2] is not None:
article = L[2]
artkey = None
if article is not None:
artkey = article.threadKey
if artkey is not None:
self.write_threadindex_entry(article, artkey.count('-') - 1)
if self.database.changed.has_key((archive,article.msgid)):
a1 = L[1]
a3 = L[3]
self.update_article(arcdir, article, a1, a3)
if a3 is not None:
self.database.changed[(archive, a3.msgid)] = None
if a1 is not None:
key = archive, a1.msgid
if not self.database.changed.has_key(key):
self.update_article(arcdir, a1, L[0], L[2])
else:
del self.database.changed[key]
if L[0]:
L[0].finished_update_article()
L = L[1:] if msgid is None:
L.append(msgid)
else:
L.append(self.database.getArticle(self.archive, msgid))
msgid = self.database.next(self.archive, 'thread')
self.write_index_footer()
self._restore_stdout()
def _open_index_file_as_stdout(self, arcdir, index_name):
path = os.path.join(arcdir, index_name + self.INDEX_EXT)
omask = os.umask(002)
try:
self.__f = open(path, 'w')
finally:
os.umask(omask)
self.__stdout = sys.stdout
sys.stdout = self.__f
def _restore_stdout(self):
sys.stdout = self.__stdout
self.__f.close()
del self.__f
del self.__stdout
def update_dirty_archives(self):
for i in self._dirty_archives:
self.update_archive(i)
self._dirty_archives = []
def _makeArticle(self, msg, sequence):
return Article(msg, sequence)
def processUnixMailbox(self, input, start=None, end=None):
mbox = ArchiverMailbox(input, self.maillist)
if start is None:
start = 0
counter = 0
while counter < start:
try:
m = mbox.next()
except Errors.DiscardMessage:
continue
if m is None:
return
counter += 1
while 1:
try:
pos = input.tell()
m = mbox.next()
except Errors.DiscardMessage:
continue
except Exception:
syslog('error', 'uncaught archiver exception at filepos: %s',
pos)
raise
if m is None:
break
if m == '':
continue
msgid = m.get('message-id', 'n/a')
self.message(_('#%(counter)05d %(msgid)s'))
a = self._makeArticle(m, self.sequence)
self.sequence += 1
self.add_article(a)
if end is not None and counter >= end:
break
counter += 1
def new_archive(self, archive, archivedir):
self.archives.append(archive)
self.update_TOC = 1
self.database.newArchive(archive)
try:
os.stat(archivedir)
except os.error, errdata:
errno, errmsg = errdata
if errno == 2:
omask = os.umask(0)
try:
os.mkdir(archivedir, self.DIRMODE)
finally:
os.umask(omask)
else:
raise os.error, errdata
self.open_new_archive(archive, archivedir)
def add_article(self, article):
archives = self.get_archives(article)
if not archives:
return
if type(archives) == type(''):
archives = [archives]
article.filename = filename = self.get_filename(article)
temp = self.format_article(article)
for arch in archives:
self.archive = arch archivedir = os.path.join(self.basedir, arch)
if arch not in self.archives:
self.new_archive(arch, archivedir)
self.write_article(arch, temp, os.path.join(archivedir,
filename))
if article.decoded.has_key('author'):
author = fixAuthor(article.decoded['author'])
else:
author = fixAuthor(article.author)
if article.decoded.has_key('stripped'):
subject = article.decoded['stripped'].lower()
else:
subject = article.subject.lower()
article.parentID = parentID = self.get_parent_info(arch, article)
if parentID:
parent = self.database.getArticle(arch, parentID)
article.threadKey = (parent.threadKey + article.date + '.'
+ str(article.sequence) + '-')
else:
article.threadKey = (article.date + '.'
+ str(article.sequence) + '-')
key = article.threadKey, article.msgid
self.database.setThreadKey(arch, key, article.msgid)
self.database.addArticle(arch, temp, author=author,
subject=subject)
if arch not in self._dirty_archives:
self._dirty_archives.append(arch)
def get_parent_info(self, archive, article):
parentID = None
if article.in_reply_to:
parentID = article.in_reply_to
elif article.references:
refs = self._remove_external_references(article.references)
if refs:
maxdate = self.database.getArticle(archive, refs[0])
for ref in refs[1:]:
a = self.database.getArticle(archive, ref)
if a.date > maxdate.date:
maxdate = a
parentID = maxdate.msgid
else:
parentID = self.database.getOldestArticle(archive,
article.subject)
if parentID and not self.database.hasArticle(archive, parentID):
parentID = None
return parentID
def write_article(self, index, article, path):
omask = os.umask(002)
try:
f = open(path, 'w')
finally:
os.umask(omask)
temp_stdout, sys.stdout = sys.stdout, f
self.write_article_header(article)
sys.stdout.writelines(article.body)
self.write_article_footer(article)
sys.stdout = temp_stdout
f.close()
def _remove_external_references(self, refs):
keep = []
for ref in refs:
if self.database.hasArticle(self.archive, ref):
keep.append(ref)
return keep
def get_filename(self, article):
pass
def get_archives(self, article):
"""Return a list of indexes where the article should be filed.
A string can be returned if the list only contains one entry,
and the empty list is legal."""
pass
def format_article(self, article):
pass
def write_index_header(self):
pass
def write_index_footer(self):
pass
def write_index_entry(self, article):
pass
def write_threadindex_entry(self, article, depth):
pass
def write_article_header(self, article):
pass
def write_article_footer(self, article):
pass
def write_article_entry(self, article):
pass
def update_article(self, archivedir, article, prev, next):
pass
def write_TOC(self):
pass
def open_new_archive(self, archive, dir):
pass
def message(self, msg):
pass
class BSDDBdatabase(Database):
__super_addArticle = Database.addArticle
def __init__(self, basedir):
self.__cachekeys = []
self.__cachedict = {}
self.__currentOpenArchive = None self.basedir = os.path.expanduser(basedir)
self.changed = {}
def firstdate(self, archive):
self.__openIndices(archive)
date = 'None'
try:
date, msgid = self.dateIndex.first()
date = time.asctime(time.localtime(float(date)))
except KeyError:
pass
return date
def lastdate(self, archive):
self.__openIndices(archive)
date = 'None'
try:
date, msgid = self.dateIndex.last()
date = time.asctime(time.localtime(float(date)))
except KeyError:
pass
return date
def numArticles(self, archive):
self.__openIndices(archive)
return len(self.dateIndex)
def addArticle(self, archive, article, subject=None, author=None,
date=None):
self.__openIndices(archive)
self.__super_addArticle(archive, article, subject, author, date)
def __openIndices(self, archive):
if self.__currentOpenArchive == archive:
return
import bsddb
self.__closeIndices()
arcdir = os.path.join(self.basedir, 'database')
omask = os.umask(0)
try:
try:
os.mkdir(arcdir, 02775)
except OSError:
pass
finally:
os.umask(omask)
for hdr in ('date', 'author', 'subject', 'article', 'thread'):
path = os.path.join(arcdir, archive + '-' + hdr)
t = bsddb.btopen(path, 'c')
setattr(self, hdr + 'Index', t)
self.__currentOpenArchive = archive
def __closeIndices(self):
if self.__currentOpenArchive is not None:
pass
for hdr in ('date', 'author', 'subject', 'thread', 'article'):
attr = hdr + 'Index'
if hasattr(self, attr):
index = getattr(self, attr)
if hdr == 'article':
if not hasattr(self, 'archive_length'):
self.archive_length = {}
self.archive_length[self.__currentOpenArchive] = len(index)
index.close()
delattr(self,attr)
self.__currentOpenArchive = None
def close(self):
self.__closeIndices()
def hasArticle(self, archive, msgid):
self.__openIndices(archive)
return self.articleIndex.has_key(msgid)
def setThreadKey(self, archive, key, msgid):
self.__openIndices(archive)
self.threadIndex[key] = msgid
def getArticle(self, archive, msgid):
self.__openIndices(archive)
if self.__cachedict.has_key(msgid):
self.__cachekeys.remove(msgid)
self.__cachekeys.append(msgid)
return self.__cachedict[msgid]
if len(self.__cachekeys) == CACHESIZE:
delkey, self.__cachekeys = (self.__cachekeys[0],
self.__cachekeys[1:])
del self.__cachedict[delkey]
s = self.articleIndex[msgid]
article = pickle.loads(s)
self.__cachekeys.append(msgid)
self.__cachedict[msgid] = article
return article
def first(self, archive, index):
self.__openIndices(archive)
index = getattr(self, index+'Index')
try:
key, msgid = index.first()
return msgid
except KeyError:
return None
def next(self, archive, index):
self.__openIndices(archive)
index = getattr(self, index+'Index')
try:
key, msgid = index.next()
except KeyError:
return None
else:
return msgid
def getOldestArticle(self, archive, subject):
self.__openIndices(archive)
subject = subject.lower()
try:
key, tempid = self.subjectIndex.set_location(subject)
self.subjectIndex.next()
[subject2, date] = key.split('\0')
if subject != subject2:
return None
return tempid
except KeyError: return None
def newArchive(self, archive):
pass
def clearIndex(self, archive, index):
self.__openIndices(archive)
index = getattr(self, index+'Index')
finished = 0
try:
key, msgid = self.threadIndex.first()
except KeyError:
finished = 1
while not finished:
del self.threadIndex[key]
try:
key, msgid = self.threadIndex.next()
except KeyError:
finished = 1