Script to automatically process the NL bounces
authorAymeric Augustin <aymeric.augustin@m4x.org>
Sun, 2 Nov 2008 21:27:45 +0000 (22:27 +0100)
committerAymeric Augustin <aymeric.augustin@m4x.org>
Sun, 2 Nov 2008 21:27:45 +0000 (22:27 +0100)
bin/newsletter.bounces.processor.py [new file with mode: 0755]

diff --git a/bin/newsletter.bounces.processor.py b/bin/newsletter.bounces.processor.py
new file mode 100755 (executable)
index 0000000..e3972d6
--- /dev/null
@@ -0,0 +1,374 @@
+#!/usr/bin/env python2.5
+# -*- coding: utf-8 -*-
+#***************************************************************************
+#*  Copyright (C) 2004-2008 polytechnique.org                              *
+#*  http://opensource.polytechnique.org/                                   *
+#*                                                                         *
+#*  This program is free software; you can redistribute it and/or modify   *
+#*  it under the terms of the GNU General Public License as published by   *
+#*  the Free Software Foundation; either version 2 of the License, or      *
+#*  (at your option) any later version.                                    *
+#*                                                                         *
+#*  This program is distributed in the hope that it will be useful,        *
+#*  but WITHOUT ANY WARRANTY; without even the implied warranty of         *
+#*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
+#*  GNU General Public License for more details.                           *
+#*                                                                         *
+#*  You should have received a copy of the GNU General Public License      *
+#*  along with this program; if not, write to the Free Software            *
+#*  Foundation, Inc.,                                                      *
+#*  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA                *
+#***************************************************************************
+
+# Copyright (c) 2008 Aymeric Augustin
+
+"""
+Process as automatically as possible bounces from the newsletter
+
+The goal is to extract the email adresses that actually bounced.
+Bounces conforming to RFC 1894 will be automatically processed.
+
+This script uses the X-Spam-Flag header to remove spam and heuristics
+to detect out-of-office auto-replies and delivery status notifications.
+
+All emails are saved in different mailboxes to make human post-processing easier.
+"""
+
+import email, mailbox, os, re, sys, time
+
+#----------------------------------------------------------------------------#
+
+class MboxProcessor:
+    """Applies a series of filters to each message in a mbox."""
+
+    def __init__(self, mbox):
+        self.mbox_file = mbox
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.filters = [
+            DirectBouncesFilter(),
+            SpamFilter(),
+            UnsureFilter(),
+            CheckNonSpamFilter(),
+            OutOfOfficeFilter(),
+            DeliveryStatusNotificationFilter(),
+            CatchAllFilter()
+        ]
+
+    def initialize_filters(self):
+        for f in self.filters: f.initialize(self.mbox_file)
+        self.start_time = time.clock()
+
+    def apply_filters(self, message):
+        return any(f.process(message) for f in self.filters)
+
+    def finalize_filters(self):
+        duration = time.clock() - self.start_time
+        separator = '-' * 80
+        print separator
+        print 'Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration)
+        print separator
+        for f in self.filters:
+            f.finalize();
+            print separator
+
+    def run(self):
+        self.mbox.lock()
+        try:
+            self.initialize_filters()
+            for message in self.mbox: self.apply_filters(message)
+            self.finalize_filters()
+        finally:
+            self.mbox.unlock()
+            self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class MboxFilter:
+    """Defines an interface for filters."""
+
+    def initialize(self, mbox_file):
+        """Called by the processor before processing starts.
+        
+        This is the place to open descriptors required during processing."""
+        pass
+
+    def process(self, message):
+        """Called by the processor for each message that reaches this step.
+        
+        Return true to stop processing, and false to go to the next filter."""
+        pass
+
+    def finalize(self):
+        """Called by the processor after processing ends.
+        
+        This is the place to display the results and close all descriptors."""
+        pass
+
+#----------------------------------------------------------------------------#
+
+def findSubject(message):
+    """Returns the subject of an email.Message as an unicode string."""
+    if message['Subject'] is not None:
+        try:
+            return unicode(email.header.make_header(email.header.decode_header(message['Subject'])))
+        except:
+            pass
+    return None
+
+_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
+
+def findAddressInBounce(bounce):
+    """Finds the faulty email address in a bounced email.
+    
+    See RFC 1894 for more information.
+    Returns None or the email address."""
+    # Check that it is a bounce - a few MTA fail to set this correctly :(
+    if bounce.get_content_type() != 'multipart/report':
+        print '! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type()
+        return None
+    # Extract the second component of the multipart/report
+    if len(bounce.get_payload()) < 2:
+        print '! Not a valid bounce (expected at least 2 parts, found %d).' % len(bounce)
+        return None
+    status = bounce.get_payload(1)
+    if status.get_content_type() != 'message/delivery-status':
+        print '! Not a valid bounce (expected message/delivery-status, found %s).' % bounce.get_content_type()
+        return None
+    # The per-message-fields don't matter here, get only the per-recipient-fields
+    if len(status.get_payload()) < 2:
+        print '! Not a valid bounce (expected at least 2 parts, found %d).' % len(status)
+        return None
+    content = status.get_payload(1)
+    if content.get_content_type() != 'text/plain':
+        print '! Not a valid bounce (expected text/plain, found %s).' % bounce.get_content_type
+        return None
+    # Extract the faulty email address
+    recipient_match = _recipient_re.search(content['Final-Recipient'])
+    if recipient_match is None:
+        print '! Missing final recipient.'
+        return None
+    email = recipient_match.group(1)
+    # Check the action field
+    if content['Action'] != 'failed':
+        print '! Not a failed action (%s).' % content['Action']
+        return None
+    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
+    # Otherwise, the first sub-field should indicate a permanent failure
+    postfix_error = content['Diagnostic-Code'] is not None \
+                and content['Diagnostic-Code'].startswith('X-Postfix')
+    if not postfix_error and int(content['Status'][:1]) != 5:
+        print '! Not a permanent failure status (%s).' % content['Status']
+        return None
+    return email
+
+#----------------------------------------------------------------------------#
+
+class DirectBouncesFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.seen = 0
+        self.emails = []
+        self.mbox_file = '%s.bounced' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+
+    def process(self, message):
+        if message['X-Spam-Flag'] is None:
+            # During finalization, we will verifiy that all messages were processed
+            self.seen += 1
+            # Additionnal checks, just to be sure
+            if message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
+            or message['Subject'] != 'Undelivered Mail Returned to Sender':
+                return False
+            email = findAddressInBounce(message)
+            if email is not None:
+                self.emails.append(email)
+                self.mbox.add(message)
+                return True
+        return False
+
+    def finalize(self):
+        print 'Found %d messages with no X-Spam-Flag header.' % self.seen
+        print 'Found %d of them that are confirmed bounces.' % len(self.mbox)
+        if self.seen != len(self.mbox):
+            print '  /!\ These numbers shoud be equal! We have a problem! /!\\'
+        print 'They were saved in %s.' % self.mbox_file
+        print ''
+        print 'Here is the list of email adresses for these bounces:'
+        print ''
+        for email in self.emails:
+            print email
+        print ''
+        self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class SpamFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.mbox_file = '%s.spam' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+
+    def process(self, message):
+        if message['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
+            self.mbox.add(message)
+            return True
+        return False
+
+    def finalize(self):
+        print 'Found %d spams. This is reliable.' % len(self.mbox)
+        print 'They were saved in %s.' % self.mbox_file
+        print 'You might check the contents of this mbox.'
+        self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class UnsureFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.mbox_file = '%s.unsure' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+
+    def process(self, message):
+        if message['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
+            self.mbox.add(message)
+            return True
+        return False
+
+    def finalize(self):
+        print 'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox)
+        print 'They were saved in %s.' % self.mbox_file
+        print 'You must check the contents of this mbox and feed the antispam.'
+        self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class CheckNonSpamFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.seen = 0
+
+    def process(self, message):
+        if not message['X-Spam-Flag'].startswith('No, tests=bogofilter'):
+            self.seen += 1
+        return False
+
+    def finalize(self):
+        if self.seen > 0:
+            print 'Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.counter
+            print 'Please investigate.'
+        else:
+            print 'All messages were either spam, or unsure, or non-spams. Good.'
+
+#----------------------------------------------------------------------------#
+
+class OutOfOfficeFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.mbox_file = '%s.ooo' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+        subject_re = [
+            r'^Absen(t|ce)',
+            r'^Out of office',
+            r'est absent',
+            r'is out of (the )?office',
+            u'^RĂ©ponse automatique d\'absence du bureau', # unicode!
+        ]
+        self.subject_regexes = map(re.compile, subject_re, [re.I | re.U] * len(subject_re))
+
+    def process(self, message):
+        subject = findSubject(message)
+        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
+            self.mbox.add(message)
+            return True
+        return False
+
+    def finalize(self):
+        print 'Found %d "out of office". This is generally reliable.' % len(self.mbox)
+        print 'They were saved in %s.' % self.mbox_file
+        print 'You may check the contents of this mbox.'
+        self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class DeliveryStatusNotificationFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.emails = []
+        self.mbox_file = '%s.dsn' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+        subject_re = [
+            r'^DELIVERY FAILURE: ',
+            r'^Delivery Notification: Delivery has failed$',
+            r'^Delivery Status Notification ?\(Failure\)$',
+            r'^Mail delivery failed',
+            r'^(Mail revenu en erreur / )?Undelivered Mail Returned to Sender$',
+            r'^Returned mail: see transcript for details$',
+            r'^Undeliverable( mail)?:',
+            r'^Undelivered Mail Returned to Sender$',
+        ]
+        self.subject_regexes = map(re.compile, subject_re, [re.I | re.U] * len(subject_re))
+
+    def process(self, message):
+        subject = findSubject(message)
+        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
+            email = findAddressInBounce(message)
+            if email is not None:
+                self.emails.append(email)
+                self.mbox.add(message)
+                return True
+        return False
+
+    def finalize(self):
+        print 'Found %d delivery status notifications. This is generally reliable.' % len(self.mbox)
+        print 'They were saved in %s.' % self.mbox_file
+        print ''
+        print 'Here is the list of email adresses for these bounces:'
+        print ''
+        for email in self.emails:
+            print email
+        print ''
+        self.mbox.close()
+
+#----------------------------------------------------------------------------#
+
+class CatchAllFilter(MboxFilter):
+
+    def initialize(self, mbox_file):
+        self.mbox_file = '%s.catchall' % mbox_file
+        self.mbox = mailbox.mbox(self.mbox_file)
+        self.mbox.clear()
+
+    def process(self, message):
+        self.mbox.add(message)
+        return True
+
+    def finalize(self):
+        if len(self.mbox) > 0:
+            print '%d messages reached the catchall.' % len(self.mbox)
+            print 'They were saved in %s.' % self.mbox_file
+            print 'You must process the contents of this mbox manually.'
+            self.mbox.close()
+        else:
+            print 'No messages reached the catchall. Nice.'
+            self.mbox.close()
+            os.unlink(self.mbox_file)
+
+#----------------------------------------------------------------------------#
+
+if __name__ == '__main__':
+
+    if len(sys.argv) != 2:
+        print 'Usage: %s mbox' % sys.argv[0]
+        sys.exit(1)
+
+    if not os.path.exists(sys.argv[1]):
+        print 'No such file: %s' % sys.argv[1]
+        sys.exit(1)
+
+    processor = MboxProcessor(sys.argv[1])
+    processor.run()