NL bounces: introduce a findAddressInWeirdDeliveryStatus function to process weird...
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email
36 import mailbox
37 import os
38 import re
39 import sys
40 import time
41
42 #----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; as soon as one of them
    returns a true value from its process() method, the remaining filters are
    skipped for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox file located at path `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: the first filter that claims a message wins, and
        # CatchAllFilter claims everything that reaches it.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initializes every filter and records the start time of the run."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() is available on
        # both Python 2 and Python 3.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Runs the filters on `message` until one of them claims it.

        Returns True if a filter claimed the message, False otherwise."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the duration of the run, then lets each filter report."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised
            self.mbox.unlock()
            self.mbox.close()
86
87 #----------------------------------------------------------------------------#
88
class MboxFilter:
    """Interface shared by all filters; every hook does nothing by default."""

    def initialize(self, mbox_file):
        """Hook run by the processor once, before any message is processed.

        Filters open here the descriptors they need during processing."""

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        A true return value stops the processing of the message, a false
        one hands the message over to the next filter."""

    def finalize(self):
        """Hook run by the processor once all messages have been processed.

        Filters display their results and close their descriptors here."""
109
110 #----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.
    RFC 2047 encoded words are decoded. If the header cannot be decoded
    (unknown charset, invalid bytes, broken base64), the raw header text is
    returned instead of raising, so that one malformed message does not
    abort the processing of the whole mailbox."""
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    to_text = unicode if sys.version_info < (3,) else str
    try:
        # decode_header returns a list of (decoded_string, charset) pairs
        decoded_seq = email.header.decode_header(raw_subject)
        decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
        return to_text(email.header.make_header(decoded_seq))
    except (LookupError, UnicodeError, email.errors.HeaderParseError):
        # Undecodable subject: fall back to the header as it was received
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return to_text(raw_subject)
123
124
# Matches a recipient field value that uses the "rfc822" address type and
# captures what follows "rfc822;" (the address) in group 1.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)
128
129
def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    bounce is expected to be a multipart/report message whose second part is
    a message/delivery-status. When the second part is the undelivered
    message (message/rfc822) and the first part is text/plain, the work is
    delegated to findAddressInPlainBounce.

    Each rejection is explained by a line printed on the standard output.
    Missing headers or malformed parts are handled as rejections.
    Returns None or the email address."""

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    # (the payload is not a list when the message is not really multipart)
    parts = bounce.get_payload()
    num_payloads = len(parts) if isinstance(parts, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    report = parts[1]

    # If the second part is of type "message/rfc822" it is the undelivered message.
    # Let's try to understand the text part
    if report.get_content_type() == 'message/rfc822':
        text_bounce = parts[0]
        if text_bounce.get_content_type() == 'text/plain':
            return findAddressInPlainBounce(text_bounce, bounce)
        # If it's not a text message, let's continue to the next error message

    if report.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % report.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    fields = report.get_payload()
    num_payloads = len(fields) if isinstance(fields, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = fields[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address; an absent header is handled like an
    # unparsable one. Be nice, test a second regexp if the first one fails.
    final_recipient = content['Final-Recipient'] or ''
    recipient_match = _recipient_re.search(final_recipient) or _recipient_re2.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status_field = content['Status']
    status_code = (status_field or '').strip()
    diag_code = content['Diagnostic-Code']

    # Permanent failure state. A plain prefix test is used because the
    # status may be absent or may not start with a digit at all.
    if status_code.startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "requested action aborted: local error in processing",
        "user unknown",
    ]
    if 'quota' in status_code.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status_field)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
209
210
def findAddressInWeirdDeliveryStatus(message):
    """Finds the faulty email address in the delivery-status part of an email

    Unlike findAddressInBounce, the status does NOT follow RFC 1894, so
    try to get the data nevertheless: the per-recipient fields may be found
    in the first block of fields, and the recipient may only be given by an
    Original-Recipient field.

    message is the message/delivery-status part itself.
    Each rejection is explained by a line printed on the standard output.
    Returns None or the email address.
    """
    if message.get_content_type() != 'message/delivery-status':
        print('! Not a valid weird bounce (expected message/delivery-status, found %s).' % message.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    # (the payload is not a list when the part could not be split in blocks)
    fields = message.get_payload()
    num_payloads = len(fields) if isinstance(fields, list) else 0
    if num_payloads < 2:
        print('! Not a valid weird bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = fields[1]
    # The content may be missing, but interesting headers still present in the first payload...
    if not content:
        content = fields[0]
        if 'Action' not in content:
            print('! Not a valid weird bounce (unable to find content).')
            return None
    elif content.get_content_type() != 'text/plain':
        print('! Not a valid weird bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    if 'Final-Recipient' in content:
        recipient = content['Final-Recipient']
        # Be nice, test another regexp if the first one fails
        recipient_match = _recipient_re.search(recipient) or _recipient_re2.search(recipient)
        if recipient_match is None:
            print('! Unknown final recipient in weird bounce.')
            return None
    elif 'Original-Recipient' in content:
        recipient = content['Original-Recipient']
        # Same regexps as above, then try a bare "<user@domain>" value
        recipient_match = (_recipient_re.search(recipient)
                           or _recipient_re2.search(recipient)
                           or re.match(r'<([^>]+@[^@>]+)>', recipient))
        if recipient_match is None:
            print('! Unknown original recipient in weird bounce.')
            return None
    else:
        print('! Missing recipient in weird bounce.')
        return None
    address = recipient_match.group(1)

    # Check the action field (it may be absent when the second block was used)
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status_field = content['Status']
    status_code = (status_field or '').strip()
    diag_code = content['Diagnostic-Code']

    # Permanent failure state. A plain prefix test is used because the
    # status may be absent or may not start with a digit at all.
    if status_code.startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "requested action aborted: local error in processing",
        "sender address rejected",
        "user unknown",
    ]
    if 'quota' in status_code.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status_field)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
297
298
def findAddressInPlainBounce(bounce, real_bounce=None):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    bounce is the text/plain part to analyze. real_bounce is the full email
    when the email has several MIME parts: its From and Subject headers are
    the ones being checked. It defaults to bounce.

    Only the first 15 lines of the text are read. The address is returned
    only if these lines contain a hint for non-delivery, a hint for a
    permanent error and an email address.
    Each rejection is explained by a line printed on the standard output.
    Returns None or the email address.
    """
    if real_bounce is None:
        real_bounce = bounce
    # The From header may be absent: handle it like a foreign sender
    sender = real_bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # A missing subject is handled like an unknown one
    subject = (findSubject(real_bounce) or '').lower()
    known_subjects = [
        "delivery status notification (failure)",
        "failure notice",
        "returned mail: see transcript for details",
        "undeliverable message",
        "undelivered mail returned to sender",
    ]
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
    # but put this message in the dsn-temp mailbox so that it can be processed by hand.
    if any("ALTOSPAM which is used by the person" in line for line in lines):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "This is a permanent error",
        "Unknown address error",
        "unreachable for too long",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of an address
    # standing alone on its line
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    alone_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or alone_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
380
381 #----------------------------------------------------------------------------#
382
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces that carry no X-Spam-Flag header.

    Confirmed bounces are saved in the "<mbox>.bounced" mailbox and their
    faulty addresses are listed at the end of the run."""

    def initialize(self, mbox_file):
        """Resets the counters and empties the "<mbox>.bounced" mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the messages without X-Spam-Flag that are confirmed bounces.

        Mailman notifications for the mailing-list on which the NL is
        forwarded are claimed too, but they are dropped without being saved
        nor counted. Returns True when the message is claimed."""
        if message['X-Spam-Flag'] is not None:
            return False

        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-owner@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True

        # During finalization, we will verify that all messages were processed
        self.seen += 1

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the counters and the collected addresses, then closes the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Both backslashes are escaped: "\ " is not a valid escape sequence
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
433
434 #----------------------------------------------------------------------------#
435
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Opens the spam mailbox and empties it."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag starts with the spam marker."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the number of spams and closes the mailbox."""
        for report_line in (
                'Found %d spams. This is reliable.' % len(self.mbox),
                'They were saved in %s.' % self.mbox_file,
                'You might check the contents of this mbox.'):
            print(report_line)
        self.mbox.close()
455
456 #----------------------------------------------------------------------------#
457
class UnsureFilter(MboxFilter):
    """Moves the messages that bogofilter could not classify to "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Opens the unsure mailbox and empties it."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag starts with the unsure marker."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the number of unclassified messages and closes the mailbox."""
        for report_line in (
                'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
                'They were saved in %s.' % self.mbox_file,
                'You must check the contents of this mbox and feed the antispam.'):
            print(report_line)
        self.mbox.close()
477
478 #----------------------------------------------------------------------------#
479
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is not the non-spam marker.

    This filter never claims a message, it only reports."""

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message unless it is flagged as non-spam; always returns False."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether unexpected messages were encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
497
498 #----------------------------------------------------------------------------#
499
class OutOfOfficeFilter(MboxFilter):
    """Moves the out-of-office auto-replies to the "<mbox>.ooo" mailbox.

    The detection mostly relies on the subject of the message."""

    def initialize(self, mbox_file):
        """Empties the "<mbox>.ooo" mailbox and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'^En dehors du bureau',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'^Out of email reach',
            r'R.{1,2}ponse automatique( :)?', # There may be encoding error of e acute
            r'^Respuesta de Estoy ausente:',
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claims the message when it looks like an out-of-office auto-reply.

        Returns True when the message was saved, False otherwise."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # (get_all gives None without a default when the header is absent)
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # The body may be empty
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Reports the number of auto-replies and closes the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
553
554 #----------------------------------------------------------------------------#
555
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications.

    Notifications from which a faulty address was extracted are saved in
    "<mbox>.dsn"; reports that were recognized but did not give an address
    are saved in "<mbox>.dsn-temp" for human processing."""

    def initialize(self, mbox_file):
        """Empties both mailboxes and resets the list of faulty addresses."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claims the message when it is a delivery status notification.

        Returns True when the message was saved in one of the two mailboxes,
        False when it has to be handed over to the next filter."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment
        if message.get_content_type() == 'multipart/mixed':
            # The payload is not a list when the message is not really multipart
            parts = message.get_payload()
            if isinstance(parts, list) and parts:
                # Some MTA confuse multipart/mixed with multipart/report
                # Let's try to find a report!
                if len(parts) >= 2 and parts[1].get_content_type() == 'message/delivery-status':
                    # The world would be a nice place if delivery-status were
                    # formatted as expected...
                    address = findAddressInWeirdDeliveryStatus(parts[1])
                    if address is not None:
                        self.emails.append(address)
                        self.mbox.add(message)
                        return True
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (the From header may be absent)
        sender = message['From'] or ''
        if 'MAILER-DAEMON@' in sender.upper() and report_message.get_content_type() == 'text/plain':
            address = findAddressInPlainBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the collected addresses and both counters, then closes the mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
620
621 #----------------------------------------------------------------------------#
622
class CatchAllFilter(MboxFilter):
    """Saves every message it receives in the "<mbox>.catchall" mailbox."""

    def initialize(self, mbox_file):
        """Opens the catchall mailbox and empties it."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Saves the message and always claims it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the number of messages; removes the mailbox file when it is empty."""
        if not len(self.mbox) > 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
644
645 #----------------------------------------------------------------------------#
646
if __name__ == '__main__':

    # The script takes exactly one argument: the path of the mbox to process
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = arguments[0]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()