3aaa66054294f363bd474f35a81bb43d17780830
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2014 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email
36 import mailbox
37 import os
38 import re
39 import sys
40 import time
41
42 #----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; as soon as one of them
    returns a true value from process(), the remaining filters are skipped
    for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox stored at path `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts everything and must stay last
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    @staticmethod
    def _now():
        """Returns a timestamp in seconds, usable to measure a duration.

        time.clock() has been removed from Python 3.8, so use
        time.perf_counter() when it exists and time.time() otherwise
        (Python 2).
        """
        timer = getattr(time, 'perf_counter', None) or time.time
        return timer()

    def initialize_filters(self):
        """Initializes every filter and starts the timer."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._now()

    def apply_filters(self, message):
        """Runs `message` through the filters, stopping at the first match.

        Returns True if a filter claimed the message, False otherwise."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the global summary, then lets each filter print its own."""
        duration = self._now() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the mailbox, even if a filter raised
            self.mbox.unlock()
            self.mbox.close()
86
87 #----------------------------------------------------------------------------#
88
class MboxFilter:
    """Common interface of the filters run by the processor.

    Every hook below does nothing; concrete filters override the ones they
    need.
    """

    def initialize(self, mbox_file):
        """Hook run by the processor before any message is handled.

        `mbox_file` is the path of the mbox being processed. Descriptors
        required during processing should be opened here."""
        return None

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        A true return value stops the processing of the message, a false one
        hands it over to the next filter."""
        return None

    def finalize(self):
        """Hook run by the processor once all messages have been handled.

        Results should be displayed and descriptors closed here."""
        return None
109
110 #----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.
    Encoded words (RFC 2047) are decoded; chunks without an explicit charset
    are treated as UTF-8. If the header cannot be decoded (unknown charset,
    invalid bytes, malformed encoded word), the raw header value is returned
    instead of raising, so that one broken message does not abort the run.
    """
    # "import email" alone does not guarantee that these submodules are loaded
    import email.errors
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    text_type = unicode if sys.version_info < (3,) else str
    try:
        # decode_header returns a list of (decoded_string, charset) pairs
        decoded_seq = email.header.decode_header(raw_subject)
        decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
        return text_type(email.header.make_header(decoded_seq))
    except (LookupError, UnicodeError, email.errors.MessageError):
        # Fall back to the undecoded header value
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return text_type(raw_subject)
123
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    `bounce` is an email.Message expected to be a multipart/report whose
    second part is a message/delivery-status. When the second part is a
    message/rfc822 and the first one is text/plain, the text part is handed
    over to findAddressInPlainBounce instead.

    The address of the per-recipient fields is returned only if the action
    is "failed" and the failure looks permanent: status class 5, X-Postfix
    diagnostic code, "quota" in the status, or a known hint in the
    diagnostic code.

    Returns None or the email address. A reason is printed whenever None is
    returned; malformed reports (missing parts or headers) never raise."""

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # The payload is a plain string when the Content-Type header lies.
    parts = bounce.get_payload() if bounce.is_multipart() else []
    num_payloads = len(parts)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    report = parts[1]

    # If the second part is of type "message/rfc822" it is the undelivered message.
    # Let's try to understand the text part
    if report.get_content_type() == 'message/rfc822':
        text_bounce = parts[0]
        if text_bounce.get_content_type() == 'text/plain':
            return findAddressInPlainBounce(text_bounce, bounce)
        # If it's not a text message, let's continue to the next error message

    if report.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % report.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    fields = report.get_payload() if report.is_multipart() else []
    num_payloads = len(fields)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = fields[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    # Some MTA don't set Final-Recipient but use Remote-Recipient instead
    if 'Final-Recipient' in content:
        final_recipient = content['Final-Recipient']
    elif 'Remote-Recipient' in content:
        final_recipient = content['Remote-Recipient']
    else:
        print('! Not a valid bounce (no Final-Recipient).')
        return None
    # Be nice, test another regexp if the first one fails
    recipient_match = _recipient_re.search(final_recipient) or _recipient_re2.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field (it may be missing in broken reports)
    action = content['Action']
    if action is None or action.lower().strip() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # Both headers may be missing, in which case they are None
    status = content['Status']
    diag_code = content['Diagnostic-Code']

    # Permanent failure state
    if status is not None and status.lstrip()[:1] == '5':
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "mailbox recipient does not have a mailbox database",
        "over quota",
        "requested action aborted: local error in processing",
        "user unknown",
    ]
    if status is not None and 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
219
220
def findAddressInWeirdDeliveryStatus(message):
    """Finds the faulty email address in the delivery-status part of an email

    Unlike findAddressInBounce, the status does NOT follow RFC 1894, so
    try to get the data nevertheless...

    `message` is the message/delivery-status part itself. The recipient is
    read from Final-Recipient or, failing that, from Original-Recipient. It
    is returned only if the action is "failed" and the failure looks
    permanent (status class 5, X-Postfix diagnostic code, "quota" in the
    status, or a known hint in the diagnostic code).

    Returns None or the email address. A reason is printed whenever None is
    returned; missing headers or parts never raise.
    """
    if message.get_content_type() != 'message/delivery-status':
        print('! Not a valid weird bounce (expected message/delivery-status, found %s).' % message.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields.
    # The payload is a plain string when the part could not be split into fields.
    fields = message.get_payload() if message.is_multipart() else []
    num_payloads = len(fields)
    if num_payloads < 2:
        print('! Not a valid weird bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = fields[1]
    # The content may be missing, but interesting headers still present in the first payload...
    if not content:
        content = fields[0]
        if 'Action' not in content:
            print('! Not a valid weird bounce (unable to find content).')
            return None
    elif content.get_content_type() != 'text/plain':
        print('! Not a valid weird bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    if 'Final-Recipient' in content:
        recipient = content['Final-Recipient']
        # Be nice, test another regexp if the first one fails
        recipient_match = _recipient_re.search(recipient) or _recipient_re2.search(recipient)
        if recipient_match is None:
            print('! Unknown final recipient in weird bounce.')
            return None
    elif 'Original-Recipient' in content:
        recipient = content['Original-Recipient']
        # Be nice, test other regexps; the last one accepts a bare "<user@domain>"
        recipient_match = (
            _recipient_re.search(recipient)
            or _recipient_re2.search(recipient)
            or re.match(r'<([^>]+@[^@>]+)>', recipient)
        )
        if recipient_match is None:
            print('! Unknown original recipient in weird bounce.')
            return None
    else:
        print('! Missing recipient in weird bounce.')
        return None
    address = recipient_match.group(1)

    # Check the action field (it may be missing in broken reports)
    action = content['Action']
    if action is None or action.lower().strip() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # Both headers may be missing, in which case they are None
    status = content['Status']
    diag_code = content['Diagnostic-Code']

    # Permanent failure state
    if status and status.lstrip()[:1] == '5':
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "requested action aborted: local error in processing",
        "sender address rejected",
        "user unknown",
    ]
    if status and 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
307
308
def findAddressInPlainBounce(bounce, real_bounce=None):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    `bounce` is the text/plain message (or MIME part) holding the error text.
    `real_bounce` is the full email when `bounce` is only one of its MIME
    parts; the From and Subject headers are read from it. It defaults to
    `bounce`.

    Only the first 15 lines of the text are examined. They must contain both
    a non-delivery hint and a permanent-error hint before an address is
    looked for.

    Returns None or the email address. A reason is printed whenever None is
    returned; missing From or Subject headers never raise.
    """
    # real_bounce is the full email and bounce only the text/plain part, if email have several MIME parts
    real_bounce = real_bounce or bounce
    sender = real_bounce['From']
    lower_from = '' if sender is None else str(sender).lower()
    if 'mailer-daemon@' not in lower_from and 'postmaster' not in lower_from:
        # Report the header that was actually tested
        print('! Not a valid plain bounce (expected from MAILER-DAEMON or postmaster, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    subject = findSubject(real_bounce)
    if subject is None:
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None
    subject = subject.lower()
    known_subjects = [
        "delivery status notification (failure)",
        "failure notice",
        "mail delivery failure",
        "returned mail: see transcript for details",
        "undeliverable message",
        "undelivered mail returned to sender",
    ]
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # A text/plain part is not supposed to hold sub-parts
    if bounce.is_multipart():
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
    # but put this message in the dsn-temp mailbox so that it can be processed by hand.
    if any("ALTOSPAM which is used by the person" in line for line in lines):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Match:
    # A message that you sent could not be delivered to one or more of its recipients.
    # I'm afraid I wasn't able to deliver your message to the following addresses.
    # The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "could not be delivered to",
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "I wasn't able to deliver your message",
        "try to send your message again at a later time",
        "> was undeliverable.",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    # This is a permanent error; I've given up. Sorry it didn't work out.
    # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "failed due to an unavailable mailbox",
        "I'm sorry to have to inform you that your message could not",
        "This is a permanent error",
        "Unknown address error",
        "unreachable for too long",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of a line
    # holding only an (optionally quoted) address
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    alone_re = re.compile(r'^\s*"?([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)"?\s*$')
    for line in lines:
        match = bracketed_re.match(line) or alone_re.match(line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                break
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
394
395 #----------------------------------------------------------------------------#
396
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces which carry no X-Spam-Flag header.

    Confirmed bounces are saved in "<mbox>.bounced" and the faulty addresses
    are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Resets the counters and empties the "<mbox>.bounced" mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message if it is a direct bounce with a faulty address.

        Messages with an X-Spam-Flag header are left to the next filters.
        Mailman notifications from newsletter-externes-owner are dropped
        (claimed without being saved nor counted)."""
        if message['X-Spam-Flag'] is not None:
            return False

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if message['From'] == 'newsletter-externes-owner@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True

        # During finalization, we will verify that all messages were processed
        self.seen += 1

        # Additional checks, just to be sure
        if message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From=%r, Subject=%r).' % (message['From'], message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the collected addresses, then closes the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: '\ ' is an invalid escape sequence
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
445 print('')
446 self.mbox.close()
447
448 #----------------------------------------------------------------------------#
449
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Opens and empties the mailbox receiving the spams."""
        self.mbox_file = '{0}.spam'.format(mbox_file)
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header says "Yes"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports how many spams were saved and closes the mailbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
467 print('You might check the contents of this mbox.')
468 self.mbox.close()
469
470 #----------------------------------------------------------------------------#
471
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter was unsure about to "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Opens and empties the mailbox receiving the unsure messages."""
        self.mbox_file = '{0}.unsure'.format(mbox_file)
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports how many unsure messages were saved and closes the mailbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
489 print('You must check the contents of this mbox and feed the antispam.')
490 self.mbox.close()
491
492 #----------------------------------------------------------------------------#
493
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages reaching it without a "No" verdict from bogofilter.

    This filter never claims a message, it only gathers statistics.
    """

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message unless its X-Spam-Flag header says "No"."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether unexpected messages were encountered."""
        if not self.seen > 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
509 else:
510 print('All messages were either spam, or unsure, or non-spams. Good.')
511
512 #----------------------------------------------------------------------------#
513
class OutOfOfficeFilter(MboxFilter):
    """Moves the out-of-office auto-replies to "<mbox>.ooo".

    Detection relies on the subject and, for replies starting with "Re: ",
    on the Delivered-To headers and on the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Opens and empties the mailbox, and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'^En dehors du bureau',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'^Out of email reach',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
            r'^Respuesta de Estoy ausente:',
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claims the message if it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # (get_all returns None when the header is absent, hence the default)
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain' and not message.is_multipart():
                body_lines = message.get_payload().splitlines()
                # The body may be empty
                if body_lines:
                    firstline = body_lines[0].lower()
                    if (' absent du bureau ' in firstline
                            or ' away from my office ' in firstline):
                        self.mbox.add(message)
                        return True

        return False

    def finalize(self):
        """Reports how many auto-replies were saved and closes the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
565 print('You may check the contents of this mbox.')
566 self.mbox.close()
567
568 #----------------------------------------------------------------------------#
569
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications.

    Notifications from which a faulty address could be extracted are saved
    in "<mbox>.dsn"; multipart/report messages without such an address go
    to "<mbox>.dsn-temp" for manual processing.
    """

    def initialize(self, mbox_file):
        """Resets the address list and empties both mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claims the message if it is a delivery status notification.

        Returns True when the message was saved in one of the two mailboxes,
        False when it should go to the next filter."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment
        if message.get_content_type() == 'multipart/mixed':
            # The payload is a plain string when the Content-Type header lies
            parts = message.get_payload() if message.is_multipart() else []
            # Some MTA confuse multipart/mixed with multipart/report
            # Let's try to find a report!
            if len(parts) >= 2:
                try_status = parts[1]
                if try_status.get_content_type() == 'message/delivery-status':
                    # The world would be a nice place if delivery-status were
                    # formatted as expected...
                    address = findAddressInWeirdDeliveryStatus(try_status)
                    if address is not None:
                        self.emails.append(address)
                        self.mbox.add(message)
                        return True
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (str() because a header with raw non-ASCII data may not be a plain string)
        sender = str(message.get('From', ''))
        subject = str(message.get('Subject', ''))
        if report_message.get_content_type() == 'text/plain' and (
                'MAILER-DAEMON@' in sender.upper()
                or 'mail delivery failure' == subject.lower()):
            # Pass the full message too: a MIME part has no From/Subject headers
            address = findAddressInPlainBounce(report_message, message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the statistics and the collected addresses, then closes both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
635 print('They were saved in %s.' % self.mbox_temp_file)
636 self.mbox_temp.close()
637
638 #----------------------------------------------------------------------------#
639
class CatchAllFilter(MboxFilter):
    """Saves every message reaching it in "<mbox>.catchall"."""

    def initialize(self, mbox_file):
        """Opens and empties the catchall mailbox."""
        self.mbox_file = '{0}.catchall'.format(mbox_file)
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Saves the message and always claims it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the number of caught messages; removes the mailbox file if there are none."""
        caught = len(self.mbox)
        if not caught > 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing was caught, so don't leave an empty file behind
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % caught)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
659 self.mbox.close()
660 os.unlink(self.mbox_file)
661
662 #----------------------------------------------------------------------------#
663
if __name__ == '__main__':
    # Exactly one argument is expected: the path of the mbox to process
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()
674 processor = MboxProcessor(sys.argv[1])
675 processor.run()