From 6208fd2648a31a35cc026fd1c7be0bbecf070c78 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sat, 18 May 2013 22:23:33 +0200 Subject: [PATCH] Improve newsletter.bounces.processor.py: * Make it Python 3.3 complient * Fix and add some filters for Out Of Office messages detection * Fix findSubject charset encoding exceptions Signed-off-by: Nicolas Iooss --- bin/newsletter.bounces.processor.py | 169 +++++++++++++++++++++--------------- 1 file changed, 98 insertions(+), 71 deletions(-) diff --git a/bin/newsletter.bounces.processor.py b/bin/newsletter.bounces.processor.py index f76ac4b..46d8a18 100755 --- a/bin/newsletter.bounces.processor.py +++ b/bin/newsletter.bounces.processor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2.5 +#!/usr/bin/env python # -*- coding: utf-8 -*- #*************************************************************************** #* Copyright (C) 2003-2013 Polytechnique.org * @@ -32,7 +32,12 @@ to detect out-of-office auto-replies and delivery status notifications. All emails are saved in different mailboxes to make human post-processing easier. """ -import email, mailbox, os, re, sys, time +import email +import mailbox +import os +import re +import sys +import time #----------------------------------------------------------------------------# @@ -62,12 +67,12 @@ class MboxProcessor: def finalize_filters(self): duration = time.clock() - self.start_time separator = '-' * 80 - print separator - print 'Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration) - print separator + print(separator) + print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration)) + print(separator) for f in self.filters: - f.finalize(); - print separator + f.finalize() + print(separator) def run(self): self.mbox.lock() @@ -86,19 +91,19 @@ class MboxFilter: def initialize(self, mbox_file): """Called by the processor before processing starts. - + This is the place to open descriptors required during processing.""" pass def process(self, message): """Called by the processor for each message that reaches this step. - + Return true to stop processing, and false to go to the next filter.""" pass def finalize(self): """Called by the processor after processing ends. - + This is the place to display the results and close all descriptors.""" pass @@ -106,58 +111,64 @@ class MboxFilter: def findSubject(message): """Returns the subject of an email.Message as an unicode string.""" - if message['Subject'] is not None: - try: - return unicode(email.header.make_header(email.header.decode_header(message['Subject']))) - except: - pass - return None + if message['Subject'] is None: + return None + + # decode_header returns a list of (decoded_string, charset) pairs + decoded_seq = email.header.decode_header(message['Subject']) + decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq] + header = email.header.make_header(decoded_seq) + # Be Python 2 & 3 compatible + return unicode(header) if sys.version_info < (3,) else str(header) + _recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U) + def findAddressInBounce(bounce): """Finds the faulty email address in a bounced email. - + See RFC 1894 for more information. Returns None or the email address.""" + # Check that it is a bounce - a few MTA fail to set this correctly :( if bounce.get_content_type() != 'multipart/report': - print '! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type() + print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type()) return None # Extract the second component of the multipart/report num_payloads = len(bounce.get_payload()) if num_payloads < 2: - print '! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads + print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads) return None status = bounce.get_payload(1) if status.get_content_type() != 'message/delivery-status': - print '! Not a valid bounce (expected message/delivery-status, found %s).' % bounce.get_content_type() + print('! Not a valid bounce (expected message/delivery-status, found %s).' % bounce.get_content_type()) return None # The per-message-fields don't matter here, get only the per-recipient-fields num_payloads = len(status.get_payload()) if num_payloads < 2: - print '! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads + print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads) return None content = status.get_payload(1) if content.get_content_type() != 'text/plain': - print '! Not a valid bounce (expected text/plain, found %s).' % bounce.get_content_type + print('! Not a valid bounce (expected text/plain, found %s).' % bounce.get_content_type) return None # Extract the faulty email address recipient_match = _recipient_re.search(content['Final-Recipient']) if recipient_match is None: - print '! Missing final recipient.' + print('! Missing final recipient.') return None email = recipient_match.group(1) # Check the action field if content['Action'] != 'failed': - print '! Not a failed action (%s).' % content['Action'] + print('! Not a failed action (%s).' % content['Action']) return None # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors # Otherwise, the first sub-field should indicate a permanent failure postfix_error = content['Diagnostic-Code'] is not None \ and content['Diagnostic-Code'].startswith('X-Postfix') if not postfix_error and int(content['Status'][:1]) != 5: - print '! Not a permanent failure status (%s).' % content['Status'] + print('! Not a permanent failure status (%s).' % content['Status']) return None return email @@ -167,6 +178,7 @@ class DirectBouncesFilter(MboxFilter): def initialize(self, mbox_file): self.seen = 0 + self.bad_problems = 0 self.emails = [] self.mbox_file = '%s.bounced' % mbox_file self.mbox = mailbox.mbox(self.mbox_file) @@ -178,14 +190,14 @@ class DirectBouncesFilter(MboxFilter): self.seen += 1 # Special case: ignore mailman notifications for the mailing-list # on which the NL is forwarded - if message['From'] == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org': - print '! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.' + if message['From'] == 'newsletter-externes-bounces@polytechnique.org': + print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.') self.seen -= 1 return True # Additionnal checks, just to be sure elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \ or message['Subject'] != 'Undelivered Mail Returned to Sender': - print '! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']) + print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject'])) else: email = findAddressInBounce(message) if email is not None: @@ -193,21 +205,24 @@ class DirectBouncesFilter(MboxFilter): self.mbox.add(message) return True else: - print '! No email found in direct bounce, this is really bad.' + print('! => No email found in direct bounce, this is really bad.') + self.bad_problems += 1 return False def finalize(self): - print 'Found %d messages with no X-Spam-Flag header.' % self.seen - print 'Found %d of them that are confirmed bounces.' % len(self.mbox) - if self.seen != len(self.mbox): - print ' /!\ These numbers shoud be equal! We have a problem! /!\\' - print 'They were saved in %s.' % self.mbox_file - print '' - print 'Here is the list of email adresses for these bounces:' - print '' + print('Found %d messages with no X-Spam-Flag header.' % self.seen) + print('Found %d of them that are confirmed bounces.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + if self.bad_problems: + print('Found %d of them that are invalid.' % self.bad_problems) + if self.seen != len(self.mbox) + self.bad_problems: + print(' /!\ These numbers shoud be equal! We have a problem! /!\\') + print('') + print('Here is the list of email adresses for these bounces:') + print('') for email in self.emails: - print email - print '' + print(email) + print('') self.mbox.close() #----------------------------------------------------------------------------# @@ -227,9 +242,9 @@ class SpamFilter(MboxFilter): return False def finalize(self): - print 'Found %d spams. This is reliable.' % len(self.mbox) - print 'They were saved in %s.' % self.mbox_file - print 'You might check the contents of this mbox.' + print('Found %d spams. This is reliable.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + print('You might check the contents of this mbox.') self.mbox.close() #----------------------------------------------------------------------------# @@ -249,9 +264,9 @@ class UnsureFilter(MboxFilter): return False def finalize(self): - print 'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox) - print 'They were saved in %s.' % self.mbox_file - print 'You must check the contents of this mbox and feed the antispam.' + print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + print('You must check the contents of this mbox and feed the antispam.') self.mbox.close() #----------------------------------------------------------------------------# @@ -269,10 +284,10 @@ class CheckNonSpamFilter(MboxFilter): def finalize(self): if self.seen > 0: - print 'Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen - print 'Please investigate.' + print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen) + print('Please investigate.') else: - print 'All messages were either spam, or unsure, or non-spams. Good.' + print('All messages were either spam, or unsure, or non-spams. Good.') #----------------------------------------------------------------------------# @@ -284,16 +299,18 @@ class OutOfOfficeFilter(MboxFilter): self.mbox.clear() subject_re = [ r'^Absen(t|ce)', - r'(est|is) absent', - r'^Out of (the )?office', - r'is out of (the )?office', - r'I am out of town', + r'^(AUTO: )?Out of (the )?office', + r'^Automatic reply: ', r'automatique d\'absence', - r'Notification d\'absence' - u'Réponse automatique :', #unicode! r'AutoReply', + r'(est|is) absent', + r'I am out of town', + r'I am currently away', + r'is out of (the )?office', + r'Notification d\'absence', + r'R.{1,2}ponse automatique( :)?', # There may be encoding error of e acute ] - self.subject_regexes = map(re.compile, subject_re, [re.I | re.U] * len(subject_re)) + self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re] def process(self, message): subject = findSubject(message) @@ -303,9 +320,9 @@ class OutOfOfficeFilter(MboxFilter): return False def finalize(self): - print 'Found %d "out of office". This is generally reliable.' % len(self.mbox) - print 'They were saved in %s.' % self.mbox_file - print 'You may check the contents of this mbox.' + print('Found %d "out of office". This is generally reliable.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + print('You may check the contents of this mbox.') self.mbox.close() #----------------------------------------------------------------------------# @@ -317,6 +334,9 @@ class DeliveryStatusNotificationFilter(MboxFilter): self.mbox_file = '%s.dsn' % mbox_file self.mbox = mailbox.mbox(self.mbox_file) self.mbox.clear() + self.mbox_temp_file = '%s.dsn-temp' % mbox_file + self.mbox_temp = mailbox.mbox(self.mbox_temp_file) + self.mbox_temp.clear() def process(self, message): if message.get_content_type() == 'multipart/report': @@ -325,18 +345,25 @@ class DeliveryStatusNotificationFilter(MboxFilter): self.emails.append(email) self.mbox.add(message) return True + else: + print("! => Moved to temporary DSN mailbox") + self.mbox_temp.add(message) + return True return False def finalize(self): - print 'Found %d delivery status notifications. This is generally reliable.' % len(self.mbox) - print 'They were saved in %s.' % self.mbox_file - print '' - print 'Here is the list of email adresses for these bounces:' - print '' + print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + print('') + print('Here is the list of email adresses for these bounces:') + print('') for email in self.emails: - print email - print '' + print(email) + print('') self.mbox.close() + print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp)) + print('They were saved in %s.' % self.mbox_temp_file) + self.mbox_temp.close() #----------------------------------------------------------------------------# @@ -353,12 +380,12 @@ class CatchAllFilter(MboxFilter): def finalize(self): if len(self.mbox) > 0: - print '%d messages reached the catchall.' % len(self.mbox) - print 'They were saved in %s.' % self.mbox_file - print 'You must process the contents of this mbox manually.' + print('%d messages reached the catchall.' % len(self.mbox)) + print('They were saved in %s.' % self.mbox_file) + print('You must process the contents of this mbox manually.') self.mbox.close() else: - print 'No messages reached the catchall. Nice.' + print('No messages reached the catchall. Nice.') self.mbox.close() os.unlink(self.mbox_file) @@ -367,11 +394,11 @@ class CatchAllFilter(MboxFilter): if __name__ == '__main__': if len(sys.argv) != 2: - print 'Usage: %s mbox' % sys.argv[0] + print('Usage: %s mbox' % sys.argv[0]) sys.exit(1) if not os.path.exists(sys.argv[1]): - print 'No such file: %s' % sys.argv[1] + print('No such file: %s' % sys.argv[1]) sys.exit(1) processor = MboxProcessor(sys.argv[1]) -- 2.1.4