| 1 | #!/usr/bin/env python2.5 |
| 2 | # -*- coding: utf-8 -*- |
| 3 | #*************************************************************************** |
| 4 | #* Copyright (C) 2004-2008 polytechnique.org * |
| 5 | #* http://opensource.polytechnique.org/ * |
| 6 | #* * |
| 7 | #* This program is free software; you can redistribute it and/or modify * |
| 8 | #* it under the terms of the GNU General Public License as published by * |
| 9 | #* the Free Software Foundation; either version 2 of the License, or * |
| 10 | #* (at your option) any later version. * |
| 11 | #* * |
| 12 | #* This program is distributed in the hope that it will be useful, * |
| 13 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * |
| 14 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
| 15 | #* GNU General Public License for more details. * |
| 16 | #* * |
| 17 | #* You should have received a copy of the GNU General Public License * |
| 18 | #* along with this program; if not, write to the Free Software * |
| 19 | #* Foundation, Inc., * |
| 20 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * |
| 21 | #*************************************************************************** |
| 22 | |
| 23 | # Copyright (c) 2008 Aymeric Augustin |
| 24 | |
| 25 | """ |
| 26 | Process as automatically as possible bounces from the newsletter |
| 27 | |
| 28 | The goal is to extract the email addresses that actually bounced. |
| 29 | Bounces conforming to RFC 1894 will be automatically processed. |
| 30 | |
| 31 | This script uses the X-Spam-Flag header to remove spam and heuristics |
| 32 | to detect out-of-office auto-replies and delivery status notifications. |
| 33 | |
| 34 | All emails are saved in different mailboxes to make human post-processing easier. |
| 35 | """ |
| 36 | |
| 37 | import email, mailbox, os, re, sys, time |
| 38 | |
| 39 | #----------------------------------------------------------------------------# |
| 40 | |
class MboxProcessor:
    """Runs every message of a mbox through an ordered chain of filters.

    The chain is built in the constructor. Each message is offered to the
    filters in order until one of them reports that it handled the message.
    """

    def __init__(self, mbox):
        # Path of the mbox to read, and the mailbox object opened on it.
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: a message stops at the first filter that accepts it.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Prepares every filter, then records the start time of the run."""
        for mbox_filter in self.filters:
            mbox_filter.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Offers `message` to each filter in turn.

        Returns True as soon as one filter accepts the message (the
        following filters are not called), False if none accepted it.
        """
        for mbox_filter in self.filters:
            if mbox_filter.process(message):
                return True
        return False

    def finalize_filters(self):
        """Prints the run summary, then lets every filter report its results."""
        elapsed = time.clock() - self.start_time
        rule = 80 * '-'
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for mbox_filter in self.filters:
            mbox_filter.finalize()
            print(rule)

    def run(self):
        """Locks the mbox, processes all of its messages, then releases it."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Release the lock and close the mbox even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
| 83 | |
| 84 | #----------------------------------------------------------------------------# |
| 85 | |
class MboxFilter:
    """Interface implemented by every filter; all hooks are no-ops here."""

    def initialize(self, mbox_file):
        """Hook run by the processor once, before any message is handled.

        `mbox_file` is the path of the mbox being processed. Subclasses open
        here the descriptors they need during processing.
        """

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        A true return value stops the processing of the message; a false one
        hands the message over to the next filter.
        """

    def finalize(self):
        """Hook run by the processor once, after the last message.

        Subclasses display their results and close their descriptors here.
        """
| 106 | |
| 107 | #----------------------------------------------------------------------------# |
| 108 | |
def findSubject(message):
    """Returns the subject of an email.Message as a unicode string.

    The raw Subject header is decoded with email.header.decode_header and
    reassembled with email.header.make_header.

    Returns None when the message has no Subject header or when the header
    cannot be decoded.
    """
    # Imported explicitly: 'import email' alone does not guarantee that the
    # email.header submodule has been loaded.
    from email.header import decode_header, make_header
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already the unicode type
    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    try:
        return text_type(make_header(decode_header(raw_subject)))
    except Exception:
        # Best effort: an undecodable subject is treated as a missing one.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return None
| 117 | |
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _subparts(message):
    """Returns the list of sub-parts of a message, or [] if it has none."""
    payload = message.get_payload()
    if isinstance(payload, list):
        return payload
    # A non-multipart (or malformed multipart) message carries a string.
    return []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information. Only the first per-recipient block
    of the delivery status is examined.

    Returns the email address, or None when the message is not a well-formed
    delivery status notification reporting a failure. A failure is accepted
    when its status class is 5 (permanent) or when its diagnostic code
    starts with 'X-Postfix'. The reason for each rejection is printed.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    bounce_type = bounce.get_content_type()
    if bounce_type != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce_type)
        return None
    # Extract the second component of the multipart/report
    parts = _subparts(bounce)
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status = parts[1]
    status_type = status.get_content_type()
    if status_type != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_type)
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = _subparts(status)
    if len(blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(blocks))
        return None
    content = blocks[1]
    content_type = content.get_content_type()
    if content_type != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content_type)
        return None
    # Extract the faulty email address; the field may be absent altogether
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field (its value is case-insensitive per the RFC)
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    permanent = status_code is not None and status_code[:1] == '5'
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
| 165 | |
| 166 | #----------------------------------------------------------------------------# |
| 167 | |
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces that carry no X-Spam-Flag header.

    Confirmed bounces are saved in '<mbox>.bounced' and the bounced
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Opens and empties the output mbox, and resets the counters."""
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []
        self.seen = 0

    def process(self, message):
        """Accepts the message if it is an unflagged, well-formed bounce."""
        if message['X-Spam-Flag'] is not None:
            return False
        # Counted so that finalization can verify that every unflagged
        # message was recognized as a bounce.
        self.seen += 1
        # Additional checks, just to be sure
        if not (message['From'] == 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
                and message['Subject'] == 'Undelivered Mail Returned to Sender'):
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the counters and the bounced addresses, then closes the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        confirmed = len(self.mbox)
        print('Found %d of them that are confirmed bounces.' % confirmed)
        if confirmed != self.seen:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
| 205 | |
| 206 | #----------------------------------------------------------------------------# |
| 207 | |
class SpamFilter(MboxFilter):
    """Saves in '<mbox>.spam' the messages flagged as spam.

    A message is considered spam when its X-Spam-Flag header starts with
    'Yes, tests=bogofilter'.
    """

    def initialize(self, mbox_file):
        """Opens and empties the output mbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Accepts the message if it is flagged as spam."""
        # The header may be missing: an unflagged message that the previous
        # filter did not accept still reaches this one.
        flag = message['X-Spam-Flag'] or ''
        if flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of spams found, then closes the mbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
| 226 | |
| 227 | #----------------------------------------------------------------------------# |
| 228 | |
class UnsureFilter(MboxFilter):
    """Saves in '<mbox>.unsure' the messages the antispam could not classify.

    A message is considered unsure when its X-Spam-Flag header starts with
    'Unsure, tests=bogofilter'.
    """

    def initialize(self, mbox_file):
        """Opens and empties the output mbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Accepts the message if it is flagged as unsure."""
        # The header may be missing: an unflagged message that no previous
        # filter accepted still reaches this one.
        flag = message['X-Spam-Flag'] or ''
        if flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of unsure messages, then closes the mbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
| 247 | |
| 248 | #----------------------------------------------------------------------------# |
| 249 | |
class CheckNonSpamFilter(MboxFilter):
    """Sanity check on the X-Spam-Flag of the messages that reach it.

    It counts the messages whose X-Spam-Flag header does not start with
    'No, tests=bogofilter' (including messages with no such header) and
    reports them during finalization. It never accepts a message.
    """

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message if it is not flagged as non-spam."""
        # The header may be missing; such a message is unexpected too.
        flag = message['X-Spam-Flag'] or ''
        if not flag.startswith('No, tests=bogofilter'):
            self.seen += 1
        return False

    def finalize(self):
        """Prints whether unexpected messages were encountered."""
        if self.seen > 0:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
| 266 | |
| 267 | #----------------------------------------------------------------------------# |
| 268 | |
class OutOfOfficeFilter(MboxFilter):
    """Saves in '<mbox>.ooo' the out-of-office auto-replies.

    Auto-replies are detected by matching the decoded subject against a
    list of case-insensitive regular expressions.
    """

    def initialize(self, mbox_file):
        """Opens and empties the output mbox, and compiles the patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_patterns = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r"automatique d'absence",
            r"Notification d'absence",
            # Unicode pattern ("Reponse" with an acute accent), written with
            # an escape so that it does not depend on the file encoding.
            u'R\xe9ponse automatique :',
            r'AutoReply',
        ]
        # A real list (not a one-shot iterator) since it is reused for
        # every message.
        self.subject_regexes = [re.compile(pattern, re.I | re.U)
                                for pattern in subject_patterns]

    def process(self, message):
        """Accepts the message if its subject looks like an auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False
        for regex in self.subject_regexes:
            if regex.search(subject):
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the number of auto-replies found, then closes the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
| 300 | |
| 301 | #----------------------------------------------------------------------------# |
| 302 | |
class DeliveryStatusNotificationFilter(MboxFilter):
    """Saves in '<mbox>.dsn' the delivery status notifications.

    Only 'multipart/report' messages from which a bounced address could be
    extracted are accepted; the addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Opens and empties the output mbox, and resets the address list."""
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []

    def process(self, message):
        """Accepts the message if it is a report naming a bounced address."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the count and the bounced addresses, then closes the mbox."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
| 330 | |
| 331 | #----------------------------------------------------------------------------# |
| 332 | |
class CatchAllFilter(MboxFilter):
    """Saves in '<mbox>.catchall' every message that reaches it."""

    def initialize(self, mbox_file):
        """Opens and empties the output mbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Accepts the message unconditionally."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the leftovers; removes the output mbox if it is empty."""
        leftovers = len(self.mbox)
        if leftovers <= 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing was saved: do not leave an empty file behind.
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % leftovers)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
| 354 | |
| 355 | #----------------------------------------------------------------------------# |
| 356 | |
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()