18f0cc72689c52976fe07841d5d80c7bd8633eb6
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email
36 import mailbox
37 import os
38 import re
39 import sys
40 import time
41
42 #----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; the first filter whose
    process() method returns a true value stops the chain for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox stored at path `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: a message is handed to each filter in turn until one
        # of them claims it. CatchAllFilter claims whatever is left.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initializes every filter and records the processing start time."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() is available on
        # every Python 2 and Python 3 version.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Runs `message` through the chain.

        Returns True if one of the filters claimed the message, else False.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the global summary, then lets each filter print its report."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock.

        The mbox is unlocked once it has been locked, and it is closed in all
        cases, including when acquiring the lock fails.
        """
        try:
            self.mbox.lock()
            try:
                self.initialize_filters()
                for message in self.mbox:
                    self.apply_filters(message)
                self.finalize_filters()
            finally:
                self.mbox.unlock()
        finally:
            self.mbox.close()
86
87 #----------------------------------------------------------------------------#
88
class MboxFilter:
    """Interface shared by all the filters of the processing chain.

    The three hooks below do nothing by default; concrete filters override
    the ones they need.
    """

    def initialize(self, mbox_file):
        """Hook run by the processor once, before any message is handled.

        `mbox_file` is the path of the mbox being processed. Filters open the
        descriptors they need during processing here.
        """

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        hands the message over to the next filter.
        """

    def finalize(self):
        """Hook run by the processor once all messages have been handled.

        Filters display their results and close their descriptors here.
        """
109
110 #----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header. RFC 2047 encoded
    words are decoded; chunks without a declared charset are read as UTF-8.
    When the header cannot be decoded (unknown charset, bytes that do not
    match the declared charset, malformed encoded word), the raw header value
    is returned instead of raising, so that one broken message cannot abort
    the processing of the whole mailbox.
    """
    # Imported explicitly: a bare "import email" does not guarantee that the
    # email.header and email.errors submodules are loaded.
    from email.errors import MessageError
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    text_type = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs
        decoded_seq = [(chunk, charset or 'utf-8')
                       for chunk, charset in decode_header(raw_subject)]
        return text_type(make_header(decoded_seq))
    except (LookupError, UnicodeError, MessageError):
        # Undecodable subject: fall back to the header as it was received.
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return text_type(raw_subject)
123
124
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@example.com"
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _payload_parts(message):
    """Returns the list of sub-parts of a message, or [] if it has none."""
    return message.get_payload() if message.is_multipart() else []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    The bounce must be a multipart/report whose second part is a
    message/delivery-status; only the first per-recipient block of that part
    is examined. The address is returned when the Action field is "failed"
    and one of the following holds: the Status starts with 5, the
    Diagnostic-Code starts with "X-Postfix", the Status mentions a quota, or
    the Diagnostic-Code contains a known failure hint.

    Returns None or the email address. A message is printed whenever None is
    returned. Missing or malformed fields make the function return None
    instead of raising.
    """

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report. A message that
    # claims to be multipart but was not parsed as such has a string payload,
    # which must not be mistaken for a list of parts.
    parts = _payload_parts(bounce)
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status_part = parts[1]
    if status_part.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_part.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    field_blocks = _payload_parts(status_part)
    if len(field_blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(field_blocks))
        return None
    content = field_blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address (the field may be absent altogether)
    recipient_match = _recipient_re.search(content['Final-Recipient'] or '')
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status = content['Status']
    diag_code = content['Diagnostic-Code']
    # A missing Status field is handled like a non-permanent status
    status_text = status or ''

    # Permanent failure state
    if status_text[:1] == '5':
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    ]
    if 'quota' in status_text.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
194
195
# An address between angle brackets anywhere in a line: <email@example.com>
_bracketed_address_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
# A line made of a single address, possibly surrounded by spaces
_bare_address_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')


def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The message must come from a MAILER-DAEMON address, be text/plain and
    carry one of the known bounce subjects. Its first 15 body lines must
    contain both a non-delivery hint and a permanent-error hint. The first
    address found in those lines is returned, unless it belongs to
    polytechnique.org, in which case the search is abandoned.

    Returns None or the email address. A message is printed whenever None is
    returned. A missing From or Subject header makes the function return None
    instead of raising.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # A message without subject is handled like one with an unknown subject
    subject = (findSubject(bounce) or '').lower()
    known_subjects = (
        'failure notice',
        'undeliverable message',
        'delivery status notification (failure)',
    )
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurrence of <email@example.com>
    for line in lines:
        match = _bracketed_address_re.match(line) or _bare_address_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
262
263 #----------------------------------------------------------------------------#
264
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header.

    Confirmed bounces are saved in "<mbox>.bounced" and their faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Resets the counters and empties the "<mbox>.bounced" mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims mailman notifications and direct bounces with an address.

        Returns True when the message was dropped (mailman notification) or
        saved as a confirmed bounce, False in every other case.
        """
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']
        subject = message['Subject']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or subject != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, subject))
            return False

        address = findAddressInBounce(message)
        if address is not None:
            self.emails.append(address)
            self.mbox.add(message)
            return True

        print('! => No email found in direct bounce, this is really bad.')
        self.bad_problems += 1
        return False

    def finalize(self):
        """Prints the counters and the bounced addresses, then closes the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # The backslashes are escaped: "\ " is an invalid escape sequence.
            # The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
315
316 #----------------------------------------------------------------------------#
317
class SpamFilter(MboxFilter):
    """Saves in "<mbox>.spam" the messages flagged as spam by bogofilter."""

    def initialize(self, mbox_file):
        """Opens the "<mbox>.spam" mailbox and empties it."""
        self.mbox_file = '%s.spam' % mbox_file
        spam_box = mailbox.mbox(self.mbox_file)
        spam_box.clear()
        self.mbox = spam_box

    def process(self, message):
        """Claims the message when X-Spam-Flag starts with "Yes, tests=bogofilter"."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of saved spams and closes the mailbox."""
        report = [
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        ]
        for line in report:
            print(line)
        self.mbox.close()
337
338 #----------------------------------------------------------------------------#
339
class UnsureFilter(MboxFilter):
    """Saves in "<mbox>.unsure" the messages bogofilter could not classify."""

    def initialize(self, mbox_file):
        """Opens the "<mbox>.unsure" mailbox and empties it."""
        self.mbox_file = '%s.unsure' % mbox_file
        unsure_box = mailbox.mbox(self.mbox_file)
        unsure_box.clear()
        self.mbox = unsure_box

    def process(self, message):
        """Claims the message when X-Spam-Flag starts with "Unsure, tests=bogofilter"."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of saved messages and closes the mailbox."""
        report = [
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        ]
        for line in report:
            print(line)
        self.mbox.close()
359
360 #----------------------------------------------------------------------------#
361
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages that bogofilter did not explicitly mark as non-spam.

    This filter only keeps a count: it never claims a message.
    """

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message unless X-Spam-Flag starts with "No, tests=bogofilter".

        Always returns False so that the next filter gets the message.
        """
        flag = message['X-Spam-Flag']
        marked_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not marked_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
379
380 #----------------------------------------------------------------------------#
381
class OutOfOfficeFilter(MboxFilter):
    """Saves in "<mbox>.ooo" the out-of-office auto-replies.

    Detection relies on the subject and, for replies starting with "Re: ",
    on the Delivered-To headers and on the first line of the body.
    """

    def initialize(self, mbox_file):
        """Empties the "<mbox>.ooo" mailbox and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claims the message when it looks like an out-of-office reply.

        Returns True when the message was saved, False otherwise.
        """
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the default
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                # An empty body has no first line
                body_lines = message.get_payload().splitlines()
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Prints the number of saved auto-replies and closes the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
431
432 #----------------------------------------------------------------------------#
433
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications.

    Notifications from which a faulty address could be extracted are saved in
    "<mbox>.dsn"; the multipart/report messages from which none could be
    extracted are saved in "<mbox>.dsn-temp".
    """

    def initialize(self, mbox_file):
        """Empties both mailboxes and resets the list of faulty addresses."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claims multipart/report messages and recognized plain-text bounces.

        Returns True when the message was saved in one of the two mailboxes,
        False otherwise.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. The payload is only indexed
        # when it really is a non-empty list of parts.
        if message.get_content_type() == 'multipart/mixed' \
                and message.is_multipart() and message.get_payload():
            report_message = message.get_payload(0)

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email.
        # A message without From header cannot be one of them.
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in sender.upper():
            address = findAddressInPlainBounce(message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the counters and the faulty addresses, then closes both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
485
486 #----------------------------------------------------------------------------#
487
class CatchAllFilter(MboxFilter):
    """Saves in "<mbox>.catchall" every message that reaches it."""

    def initialize(self, mbox_file):
        """Opens the "<mbox>.catchall" mailbox and empties it."""
        self.mbox_file = '%s.catchall' % mbox_file
        leftovers = mailbox.mbox(self.mbox_file)
        leftovers.clear()
        self.mbox = leftovers

    def process(self, message):
        """Saves the message unconditionally and stops the chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the saved messages; removes the mailbox file when it is empty."""
        if not len(self.mbox) > 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
509
510 #----------------------------------------------------------------------------#
511
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    # Give up early when the mbox does not exist
    if not os.path.exists(sys.argv[1]):
        print('No such file: %s' % sys.argv[1])
        sys.exit(1)

    processor = MboxProcessor(sys.argv[1])
    processor.run()