| 1 | #!/usr/bin/env python2.5 |
| 2 | # -*- coding: utf-8 -*- |
| 3 | #*************************************************************************** |
| 4 | #* Copyright (C) 2003-2011 Polytechnique.org * |
| 5 | #* http://opensource.polytechnique.org/ * |
| 6 | #* * |
| 7 | #* This program is free software; you can redistribute it and/or modify * |
| 8 | #* it under the terms of the GNU General Public License as published by * |
| 9 | #* the Free Software Foundation; either version 2 of the License, or * |
| 10 | #* (at your option) any later version. * |
| 11 | #* * |
| 12 | #* This program is distributed in the hope that it will be useful, * |
| 13 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * |
| 14 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
| 15 | #* GNU General Public License for more details. * |
| 16 | #* * |
| 17 | #* You should have received a copy of the GNU General Public License * |
| 18 | #* along with this program; if not, write to the Free Software * |
| 19 | #* Foundation, Inc., * |
| 20 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * |
| 21 | #*************************************************************************** |
| 22 | |
| 23 | """ |
| 24 | Process as automatically as possible bounces from the newsletter |
| 25 | |
| 26 | The goal is to extract the email addresses that actually bounced. |
| 27 | Bounces conforming to RFC 1894 will be automatically processed. |
| 28 | |
| 29 | This script uses the X-Spam-Flag header to remove spam and heuristics |
| 30 | to detect out-of-office auto-replies and delivery status notifications. |
| 31 | |
| 32 | All emails are saved in different mailboxes to make human post-processing easier. |
| 33 | """ |
| 34 | |
| 35 | import email, mailbox, os, re, sys, time |
| 36 | |
| 37 | #----------------------------------------------------------------------------# |
| 38 | |
class MboxProcessor:
    """Runs every message of an mbox through an ordered chain of filters.

    For each message the filters are tried in list order; the first one
    whose process() returns a true value claims the message and the
    remaining filters are skipped.
    """

    def __init__(self, mbox):
        # mbox is the path of the mailbox to read.
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts everything, so it goes last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Let every filter set itself up, then start the timer."""
        for step in self.filters:
            step.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Return True if some filter claimed the message, False otherwise."""
        for step in self.filters:
            if step.process(message):
                return True
        return False

    def finalize_filters(self):
        """Print the timing summary, then each filter's own report."""
        elapsed = time.clock() - self.start_time
        rule = 80 * '-'
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for step in self.filters:
            step.finalize()
            print(rule)

    def run(self):
        """Process the whole mailbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release and close the source mailbox, even on error.
            self.mbox.unlock()
            self.mbox.close()
| 81 | |
| 82 | #----------------------------------------------------------------------------# |
| 83 | |
class MboxFilter:
    """Interface implemented by every filter of the processing chain.

    The default implementations do nothing and return None.
    """

    def initialize(self, mbox_file):
        """Hook run by the processor before any message is handled.

        Filters open here whatever descriptors they need while processing."""
        return None

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        A true return value stops the chain for that message; a false one
        hands the message over to the next filter."""
        return None

    def finalize(self):
        """Hook run by the processor once all messages have been handled.

        Filters display their results and close their descriptors here."""
        return None
| 104 | |
| 105 | #----------------------------------------------------------------------------# |
| 106 | |
def findSubject(message):
    """Return the decoded Subject of an email message as a unicode string.

    message is an email.Message-like object supporting message['Subject'].
    Returns None when the header is absent or cannot be decoded; decoding
    is deliberately best-effort so that one malformed subject does not
    abort the processing of a whole mailbox.
    """
    # Load the submodule explicitly: a bare "import email" does not
    # guarantee that email.header is available as an attribute.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # "unicode" is the Python 2 text type; fall back to str where it is
    # not defined.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    try:
        decoded_parts = email.header.decode_header(raw_subject)
        return to_text(email.header.make_header(decoded_parts))
    except Exception:
        # Best effort only. Unlike a bare "except:", this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
| 115 | |
# Matches the value of a Final-Recipient field such as
# "rfc822; user@example.org" and captures the address part.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Find the faulty email address in a bounced email.

    See RFC 1894 for more information.

    bounce is an email.Message-like object. It is expected to be a
    multipart/report whose second part is a message/delivery-status.
    Only the first per-recipient block of that part is examined.

    Returns the address when the block reports a failed delivery that is
    either permanent (Status starting with 5) or an X-Postfix diagnostic.
    Returns None otherwise, after printing the reason. Missing or
    malformed fields are reported the same way instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None

    # Extract the second component of the multipart/report. A malformed
    # multipart has a string payload, which counts as having no parts.
    parts = bounce.get_payload()
    if not isinstance(parts, list):
        parts = []
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status = parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None

    # The per-message-fields don't matter here, get only the
    # per-recipient-fields (second header block of the delivery status).
    blocks = status.get_payload()
    if not isinstance(blocks, list):
        blocks = []
    if len(blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(blocks))
        return None
    content = blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address; the field may be absent.
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()

    # Check the action field; its value is case-insensitive.
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # Mail forwarding loops, DNS errors and connection timeouts cause
    # X-Postfix errors. Otherwise, the first sub-field of the status should
    # indicate a permanent failure (class 5).
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    permanent = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
| 163 | |
| 164 | #----------------------------------------------------------------------------# |
| 165 | |
class DirectBouncesFilter(MboxFilter):
    """Handles messages that carry no X-Spam-Flag header (direct bounces).

    Confirmed bounces are stored in mbox_file + '.bounced' and their faulty
    addresses are listed when the filter is finalized.
    """

    def initialize(self, mbox_file):
        self.seen = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        # Messages tagged with X-Spam-Flag are left to the later filters.
        if message['X-Spam-Flag'] is not None:
            return False

        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded. They are dropped and not counted.
        if sender == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True

        # Counted so that finalize() can verify all of them were processed.
        self.seen += 1

        # Additional checks, just to be sure.
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! No email found in direct bounce, this is really bad.')
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        if self.seen != len(self.mbox):
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
| 212 | |
| 213 | #----------------------------------------------------------------------------# |
| 214 | |
class SpamFilter(MboxFilter):
    """Stores in mbox_file + '.spam' the messages whose X-Spam-Flag header
    starts with 'Yes, tests=bogofilter'."""

    def initialize(self, mbox_file):
        target = '%s.spam' % mbox_file
        self.mbox_file = target
        self.mbox = mailbox.mbox(target)
        self.mbox.clear()

    def process(self, message):
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        for line in (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        ):
            print(line)
        self.mbox.close()
| 234 | |
| 235 | #----------------------------------------------------------------------------# |
| 236 | |
class UnsureFilter(MboxFilter):
    """Stores in mbox_file + '.unsure' the messages whose X-Spam-Flag header
    starts with 'Unsure, tests=bogofilter'."""

    def initialize(self, mbox_file):
        target = '%s.unsure' % mbox_file
        self.mbox_file = target
        self.mbox = mailbox.mbox(target)
        self.mbox.clear()

    def process(self, message):
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        for line in (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        ):
            print(line)
        self.mbox.close()
| 256 | |
| 257 | #----------------------------------------------------------------------------# |
| 258 | |
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag header is absent or does not
    start with 'No, tests=bogofilter'. It never claims a message."""

    def initialize(self, mbox_file):
        self.seen = 0

    def process(self, message):
        flag = message['X-Spam-Flag']
        looks_like_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not looks_like_ham:
            self.seen += 1
        # Always hand the message over to the next filter.
        return False

    def finalize(self):
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
| 276 | |
| 277 | #----------------------------------------------------------------------------# |
| 278 | |
class OutOfOfficeFilter(MboxFilter):
    """Stores in mbox_file + '.ooo' the messages whose decoded subject
    matches one of the out-of-office patterns (case-insensitive)."""

    def initialize(self, mbox_file):
        """Open and empty the output mailbox, and compile the patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # One pattern per element: every entry needs its trailing comma,
        # otherwise adjacent literals are silently concatenated into a
        # single pattern that matches neither subject.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r'automatique d\'absence',
            r'Notification d\'absence',
            # "Reponse automatique :" with an e-acute; kept as an escaped
            # unicode literal so it does not depend on the file encoding.
            u'R\xe9ponse automatique :',
            r'AutoReply',
        ]
        # A real list (not a lazy map object) because it is scanned again
        # for every message.
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Claim the message when its subject looks like an auto-reply."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Report how many auto-replies were stored and close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
| 310 | |
| 311 | #----------------------------------------------------------------------------# |
| 312 | |
class DeliveryStatusNotificationFilter(MboxFilter):
    """Stores in mbox_file + '.dsn' the multipart/report messages from which
    a bounced address could be extracted, and lists those addresses."""

    def initialize(self, mbox_file):
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            # A report without a usable address goes on to the next filter.
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
| 340 | |
| 341 | #----------------------------------------------------------------------------# |
| 342 | |
class CatchAllFilter(MboxFilter):
    """Claims every message it receives and stores it in
    mbox_file + '.catchall'."""

    def initialize(self, mbox_file):
        target = '%s.catchall' % mbox_file
        self.mbox_file = target
        self.mbox = mailbox.mbox(target)
        self.mbox.clear()

    def process(self, message):
        self.mbox.add(message)
        return True

    def finalize(self):
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught <= 0:
            # Nothing was caught: do not leave an empty mailbox file behind.
            os.unlink(self.mbox_file)
| 364 | |
| 365 | #----------------------------------------------------------------------------# |
| 366 | |
if __name__ == '__main__':
    # Exactly one argument is expected: the path of an existing mbox file.
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()