NL bounces: add some message hints
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; the first filter whose
    process() returns a true value claims the message and the remaining
    filters are skipped for it.
    """

    def __init__(self, mbox):
        """Open the mbox at path `mbox` and build the ordered filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts everything and must stay last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initialize every filter and record the processing start time."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() exists on both
        # Python 2 and Python 3 and reports the elapsed wall-clock time.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Run `message` through the filters until one of them claims it.

        Returns True if a filter claimed the message, False otherwise.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the global summary, then let each filter report and clean up."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Base interface implemented by every filter of the processing chain.

    The default implementations do nothing, so a subclass only overrides the
    hooks it needs.
    """

    def initialize(self, mbox_file):
        """Hook called once by the processor before any message is handled.

        `mbox_file` is the path of the mbox being processed. Open here any
        descriptor needed while processing."""

    def process(self, message):
        """Hook called by the processor for each message reaching this filter.

        Return a true value to stop the chain for this message, a false value
        to hand the message over to the next filter."""
        return None

    def finalize(self):
        """Hook called once by the processor after the last message.

        Display the results and close every descriptor here."""
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Return the decoded subject of an email message as a unicode string.

    Returns None when the message has no Subject header.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header yields (decoded_string, charset) pairs; the charset is
    # None for fragments that were not encoded, in which case utf-8 is assumed.
    fragments = []
    for fragment, charset in email.header.decode_header(raw_subject):
        fragments.append((fragment, charset or 'utf-8'))
    header = email.header.make_header(fragments)

    # Python 2 needs unicode(), Python 3 only has str().
    if sys.version_info < (3,):
        return unicode(header)
    return str(header)
123
58e64caf
AA
124
125_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
8c9c7d77
NI
126# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
127_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)
58e64caf 128
6208fd26 129
58e64caf
AA
def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    `bounce` is an email message object. When its second part is the
    undelivered message (message/rfc822) and its first part is text/plain,
    the analysis is delegated to findAddressInPlainBounce().

    Returns the email address when the bounce reports a failure considered
    permanent, None otherwise. A line starting with '!' is printed to explain
    every rejection. Missing or malformed fields are reported as a rejection
    instead of raising an exception.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    bounce_type = bounce.get_content_type()
    if bounce_type != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce_type)
        return None

    # Extract the second component of the multipart/report.
    # The payload is only a list when the message was parsed as multipart.
    parts = bounce.get_payload()
    num_payloads = len(parts) if isinstance(parts, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status_part = parts[1]

    # If the second part is of type "message/rfc822" it is the undelivered message.
    # Let's try to understand the text part
    if status_part.get_content_type() == 'message/rfc822':
        text_bounce = parts[0]
        if text_bounce.get_content_type() == 'text/plain':
            return findAddressInPlainBounce(text_bounce, bounce)
        # If it's not a text message, let's continue to the next error message

    if status_part.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_part.get_content_type())
        return None

    # The per-message-fields don't matter here, get only the per-recipient-fields
    field_blocks = status_part.get_payload()
    num_payloads = len(field_blocks) if isinstance(field_blocks, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = field_blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address. The field may be absent altogether,
    # in which case there is nothing to match against.
    final_recipient = content['Final-Recipient']
    if final_recipient is None:
        print('! Missing final recipient.')
        return None
    # Be nice, test the "local;" form when the "rfc822;" form does not match
    recipient_match = _recipient_re.search(final_recipient) or _recipient_re2.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # A missing Status field is handled as an empty, non-permanent status.
    status = content['Status'] or ''
    diag_code = content['Diagnostic-Code']

    # Permanent failure state: the status class digit is 5
    if status.strip().startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    # Diagnostics which are handled as failures even with a non-permanent status
    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "requested action aborted: local error in processing",
        "user unknown",
    ]
    if 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
58e64caf 209
15f4834d 210
0cec3fee 211def findAddressInPlainBounce(bounce, real_bounce=None):
15f4834d
NI
212 """Finds the faulty email address in a non-RFC-1894 bounced email
213 """
0cec3fee
NI
214 # real_bounce is the full email and bounce only the text/plain part, if email have several MIME parts
215 real_bounce = real_bounce or bounce
216 if 'MAILER-DAEMON@' not in real_bounce['From'].upper():
15f4834d
NI
217 print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % bounce['From'])
218 return None
219 if bounce.get_content_type() != 'text/plain':
220 print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
221 return None
0cec3fee 222 subject = findSubject(real_bounce).lower()
fa7bc030
NI
223 known_subjects = [
224 "delivery status notification (failure)",
225 "failure notice",
226 "returned mail: see transcript for details",
227 "undeliverable message",
228 "undelivered mail returned to sender",
229 ]
230 if subject not in known_subjects and not subject.startswith('mail delivery failed'):
15f4834d
NI
231 print('! Not a valid plain bounce (unknown subject: %s).' % subject)
232 return None
233
234 # Read the 15 first lines of content and find some relevant keywords to validate the bounce
235 lines = bounce.get_payload().splitlines()[:15]
236
222984e4
NI
237 # ALTOSPAM is a service which requires to click on a link when sending an email
238 # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
239 # but put this message in the dsn-temp mailbox so that it can be processed by hand.
240 if any("ALTOSPAM which is used by the person" in line for line in lines):
241 print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
242 return None
243
15f4834d
NI
244 # Match:
245 # A message that you sent could not be delivered to one or more of its recipients.
246 # I'm afraid I wasn't able to deliver your message to the following addresses.
247 # The following message to <email@example.com> was undeliverable.
248 non_delivery_hints = [
249 "Delivery to the following recipient failed permanently",
fa7bc030 250 "I'm sorry to have to inform you that your message could not",
15f4834d
NI
251 "I wasn't able to deliver your message",
252 "> was undeliverable.",
253 "could not be delivered to",
254 "we were unable to deliver your message",
255 ]
256 if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
257 print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
258 print('\n'.join(lines))
259 return None
260
261 # Match:
262 # This is a permanent error; I've given up. Sorry it didn't work out.
263 # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
264 permanent_error_hints = [
265 "Delivery to the following recipient failed permanently",
fa7bc030 266 "I'm sorry to have to inform you that your message could not",
15f4834d
NI
267 "This is a permanent error",
268 "Unknown address error",
95f9eb9c 269 "unreachable for too long",
15f4834d
NI
270 "550 Requested action not taken",
271 ]
272 if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
273 print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
274 print('\n'.join(lines))
275 return None
276
277 # Retrieve the first occurence of <email@example.com>
278 for line in lines:
279 match = re.match(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line)
280 if match is None:
281 match = re.match(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$', line)
282 if match is not None:
283 email = match.group(1)
284 if email.endswith('@polytechnique.org'):
285 # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
286 break
287 return email
288
289 print('! Unknown mailer-daemon message, unable to find email address:')
290 print('\n'.join(lines))
291 return None
292
58e64caf
AA
293#----------------------------------------------------------------------------#
294
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces which did not go through the antispam.

    Only messages without an X-Spam-Flag header are considered. Confirmed
    bounces are saved in "<mbox>.bounced" and their faulty addresses are
    listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an emptied "<mbox_file>.bounced" mbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a confirmed direct bounce.

        Returns True for a confirmed bounce and for mailman notifications of
        the newsletter-externes list (which are dropped), False otherwise.
        """
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is not None:
            self.emails.append(address)
            self.mbox.add(message)
            return True

        print('! => No email found in direct bounce, this is really bad.')
        self.bad_problems += 1
        return False

    def finalize(self):
        """Print the statistics and the list of bounced addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Both backslashes are escaped: '\ ' is an invalid escape sequence.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
345
346#----------------------------------------------------------------------------#
347
class SpamFilter(MboxFilter):
    """Saves in "<mbox>.spam" the messages flagged as spam by bogofilter."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.spam" mbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag header marks it as spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of spams found and close the mbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
367
368#----------------------------------------------------------------------------#
369
class UnsureFilter(MboxFilter):
    """Saves in "<mbox>.unsure" the messages bogofilter could not classify."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.unsure" mbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag header marks it as unsure."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of unclassified messages and close the mbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
389
390#----------------------------------------------------------------------------#
391
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages reaching this step without a "No" antispam verdict.

    This filter never claims a message, it only gathers a statistic.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count the message unless bogofilter flagged it as non-spam."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether unexpected messages were encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
409
410#----------------------------------------------------------------------------#
411
class OutOfOfficeFilter(MboxFilter):
    """Saves in "<mbox>.ooo" the out-of-office auto-replies.

    Auto-replies are detected from their subject, and for "Re: " replies from
    the Delivered-To headers or the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.ooo" mbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'^En dehors du bureau',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'^Out of email reach',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
            r'^Respuesta de Estoy ausente:',
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim the message when it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns its second argument when the header is
            # absent; an empty list keeps the membership test valid.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect.
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Print the number of auto-replies found and close the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
465
466#----------------------------------------------------------------------------#
467
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications (DSN).

    Reports from which a faulty address could be extracted are saved in
    "<mbox>.dsn"; multipart/report messages without a usable address are
    saved in "<mbox>.dsn-temp" for manual processing.
    """

    def initialize(self, mbox_file):
        """Open the emptied "<mbox_file>.dsn" and "<mbox_file>.dsn-temp" mboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim the message when it is a delivery status notification.

        Returns True for every multipart/report message and for plain text
        mailer-daemon bounces with a recognized address, False otherwise.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. The payload is only a list when
        # the message was actually parsed as multipart, and it may be empty.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            parts = message.get_payload()
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        sender = message['From'] or ''
        if 'MAILER-DAEMON@' in sender.upper() and report_message.get_content_type() == 'text/plain':
            # Pass the full message as well: when the text is an attachment,
            # the From and Subject headers are only present on the full message.
            address = findAddressInPlainBounce(report_message, message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
            # NOTE(review): findAddressInPlainBounce announces that ALTOSPAM
            # messages are moved to the dsn-temp mbox, but a plain bounce
            # without address is handed over to the next filter here -
            # confirm which behaviour is intended.
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, then close both mboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
519
520#----------------------------------------------------------------------------#
521
class CatchAllFilter(MboxFilter):
    """Saves in "<mbox>.catchall" every message no other filter claimed."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.catchall" mbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Store the message and always claim it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the unclaimed messages; remove the mbox file when there are none."""
        caught = len(self.mbox)
        if caught == 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Do not leave an empty mbox file behind.
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % caught)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
543
544#----------------------------------------------------------------------------#
545
if __name__ == '__main__':
    # Command line entry point: expects the path of the mbox as only argument.
    arguments = sys.argv

    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()