NL bounces: improve findAddressInBounce by adding new diagnostic codes
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; the first filter whose
    ``process`` method returns a true value claims the message and the
    remaining filters are not consulted for it.
    """

    def __init__(self, mbox):
        """Open the mbox located at path ``mbox`` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: each message stops at the first filter that claims it.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    def initialize_filters(self):
        """Initialize every filter and record when processing started."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() exists in both
        # Python 2 and Python 3.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Run ``message`` through the filters until one of them claims it.

        Returns True if a filter claimed the message, False otherwise."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the global summary, then let each filter print its own."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Interface implemented by every filter run by the processor.

    All three hooks do nothing by default, so a subclass only overrides
    the ones it needs.
    """

    def initialize(self, mbox_file):
        """Hook invoked by the processor before any message is processed.

        Descriptors needed during processing should be opened here."""
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this step.

        Return true to stop processing, and false to go to the next filter."""
        return None

    def finalize(self):
        """Hook invoked by the processor once processing is over.

        Results should be displayed and all descriptors closed here."""
        return None
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header. Encoded words
    (RFC 2047) are decoded; parts without a declared charset are assumed
    to be UTF-8. If the header declares an unknown charset or contains
    bytes that are invalid for it, the subject is decoded on a best-effort
    basis instead of raising.
    """
    # Import explicitly: "import email" alone does not guarantee that the
    # email.header submodule has been loaded.
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decode_header(raw_subject)]
    # Be Python 2 & 3 compatible ("unicode" is only evaluated on Python 2)
    text_type = str if sys.version_info >= (3,) else unicode
    try:
        return text_type(make_header(decoded_seq))
    except (LookupError, UnicodeError):
        # Unknown charset or undecodable bytes: do not abort the whole run
        # for one broken header, decode what can be decoded instead.
        chunks = []
        for subj, _ in decoded_seq:
            if isinstance(subj, bytes):
                subj = subj.decode('utf-8', 'replace')
            chunks.append(subj)
        return ''.join(chunks)
123
58e64caf
AA
124
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@example.com"
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _count_parts(part):
    """Returns the number of sub-parts of a message, 0 if it is not multipart."""
    return len(part.get_payload()) if part.is_multipart() else 0


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    The address is returned when the delivery status notification reports a
    failed action and either a permanent (5.x.x) status, an X-Postfix
    diagnostic code, or a status/diagnostic code containing one of the known
    failure hints. Malformed notifications (wrong structure, missing
    Final-Recipient, Action or Status field) yield None instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = _count_parts(bounce)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = _count_parts(status)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address (the field may be missing altogether)
    recipient = content['Final-Recipient']
    recipient_match = None
    if recipient is not None:
        recipient_match = _recipient_re.search(recipient.strip())
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # A missing Status field is handled like a non-permanent status
    status = (content['Status'] or '').strip()
    diag_code = content['Diagnostic-Code']

    # Permanent failure state
    if status.startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = (
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    )
    if 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
58e64caf 194
15f4834d
NI
195
# Address between angle brackets anywhere in a line: "... <user@example.com> ..."
_bracketed_address_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
# Address alone on its line, possibly surrounded by whitespace
_bare_address_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')


def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The message must come from a MAILER-DAEMON address, be text/plain and
    have one of the known bounce subjects. Only the first 15 lines of the
    body are inspected: they must contain a non-delivery hint and a
    permanent-error hint.
    Returns the first address found in these lines, or None when the
    message is not recognized (including when From or Subject is missing)
    or when the first address found belongs to polytechnique.org.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in str(sender).upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    subject = findSubject(bounce)
    if subject is not None:
        subject = subject.lower()
    if (subject is None
        or (subject != 'failure notice'
            and subject != 'undeliverable message'
            and not subject.startswith('mail delivery failed')
            and subject != 'delivery status notification (failure)')):

        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>
    for line in lines:
        match = _bracketed_address_re.match(line) or _bare_address_re.match(line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                break
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
262
58e64caf
AA
263#----------------------------------------------------------------------------#
264
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header.

    Messages sent by the local MAILER-DAEMON with the usual bounce subject
    are parsed with findAddressInBounce; those yielding an address are
    saved in '<mbox>.bounced' and the address is listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open (and empty) the '.bounced' mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a confirmed direct bounce.

        Mailman notifications for the newsletter-externes list are claimed
        too, but dropped without being saved or counted."""
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verifiy that all messages were processed
        self.seen += 1
        sender = message['From']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additionnal checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the counters and the bounced addresses, then close the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: '\ ' is an invalid escape sequence.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
315
316#----------------------------------------------------------------------------#
317
class SpamFilter(MboxFilter):
    """Saves in '<mbox>.spam' the messages whose X-Spam-Flag header starts
    with 'Yes, tests=bogofilter'."""

    def initialize(self, mbox_file):
        """Open the '.spam' mailbox and remove anything it already contains."""
        self.mbox_file = '%s.spam' % mbox_file
        spam_box = mailbox.mbox(self.mbox_file)
        self.mbox = spam_box
        spam_box.clear()

    def process(self, message):
        """Claim (and save) the message when it is flagged as spam."""
        flag = message['X-Spam-Flag']
        flagged_as_spam = flag is not None and flag.startswith('Yes, tests=bogofilter')
        if not flagged_as_spam:
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of spams found, then close the mailbox."""
        report = (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        )
        for line in report:
            print(line)
        self.mbox.close()
337
338#----------------------------------------------------------------------------#
339
class UnsureFilter(MboxFilter):
    """Saves in '<mbox>.unsure' the messages whose X-Spam-Flag header starts
    with 'Unsure, tests=bogofilter'."""

    def initialize(self, mbox_file):
        """Open the '.unsure' mailbox and remove anything it already contains."""
        self.mbox_file = '%s.unsure' % mbox_file
        unsure_box = mailbox.mbox(self.mbox_file)
        self.mbox = unsure_box
        unsure_box.clear()

    def process(self, message):
        """Claim (and save) the message when the antispam could not classify it."""
        flag = message['X-Spam-Flag']
        flagged_as_unsure = flag is not None and flag.startswith('Unsure, tests=bogofilter')
        if not flagged_as_unsure:
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of unclassified messages, then close the mailbox."""
        report = (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        )
        for line in report:
            print(line)
        self.mbox.close()
359
360#----------------------------------------------------------------------------#
361
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag header is missing or does not
    start with 'No, tests=bogofilter'. It never claims a message."""

    def initialize(self, mbox_file):
        """Reset the counter; the mbox path is not used by this filter."""
        self.seen = 0

    def process(self, message):
        """Count the message if it is not flagged as non-spam; always return False."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
379
380#----------------------------------------------------------------------------#
381
class OutOfOfficeFilter(MboxFilter):
    """Saves in '<mbox>.ooo' the messages that look like out-of-office
    auto-replies, based on their subject and, for "Re: " replies, on the
    Delivered-To header or the first line of a plain-text body."""

    def initialize(self, mbox_file):
        """Open (and empty) the '.ooo' mailbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim (and save) the message when it looks like an auto-reply."""
        subject = findSubject(message)
        if subject is None:
            # Every heuristic below needs a subject.
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # (get_all returns None when the header is absent, hence the default)
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # The body may be empty, in which case there is no first line
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Print the number of auto-replies found, then close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
431
432#----------------------------------------------------------------------------#
433
class DeliveryStatusNotificationFilter(MboxFilter):
    """Handles delivery status notifications.

    Reports from which a faulty address could be extracted are saved in
    '<mbox>.dsn'; multipart/report messages without a usable address are
    saved in '<mbox>.dsn-temp'.
    """

    def initialize(self, mbox_file):
        """Open (and empty) the '.dsn' and '.dsn-temp' mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim the message if it is a delivery status notification.

        Returns True for every multipart/report message and for every
        plain-text MAILER-DAEMON message in which an address was found."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment
        # (a malformed multipart/mixed message may have no sub-part at all)
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            parts = message.get_payload()
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (the From header may be missing)
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in str(sender).upper():
            address = findAddressInPlainBounce(message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the results and the bounced addresses, then close both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
485
486#----------------------------------------------------------------------------#
487
class CatchAllFilter(MboxFilter):
    """Claims every message it is given and saves it in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Open the '.catchall' mailbox and remove anything it already contains."""
        self.mbox_file = '%s.catchall' % mbox_file
        catchall_box = mailbox.mbox(self.mbox_file)
        self.mbox = catchall_box
        catchall_box.clear()

    def process(self, message):
        """Save the message and stop processing it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report how many messages were caught; delete the mailbox file if none."""
        nothing_caught = not len(self.mbox) > 0
        if nothing_caught:
            print('No messages reached the catchall. Nice.')
        else:
            print('%d messages reached the catchall.' % len(self.mbox))
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        self.mbox.close()
        if nothing_caught:
            # Nothing to review: do not leave an empty file behind.
            os.unlink(self.mbox_file)
509
510#----------------------------------------------------------------------------#
511
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    argv = sys.argv
    if len(argv) != 2:
        print('Usage: %s mbox' % argv[0])
        sys.exit(1)

    mbox_path = argv[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()