NL bounces: add another case to DSN filter
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    Filters are tried in order for every message; the first filter whose
    ``process`` method returns a true value claims the message and the
    remaining filters are not consulted for it.
    """

    def __init__(self, mbox):
        """Open the mbox stored at path *mbox* and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts every message, so it must
        # stay last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initialize every filter and record when processing started."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() is available on
        # both Python 2 and Python 3.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Run *message* through the chain.

        Returns True as soon as one filter claims the message, and False if
        no filter did.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the elapsed time, then let each filter print its report."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        try:
            self.mbox.lock()
            try:
                self.initialize_filters()
                for message in self.mbox:
                    self.apply_filters(message)
                self.finalize_filters()
            finally:
                self.mbox.unlock()
        finally:
            # Close the mailbox even when the lock could not be acquired.
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Interface shared by all filters; every hook is a no-op by default."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor before the first message.

        Subclasses open here whatever descriptors they need while
        processing. *mbox_file* is the path of the mbox being processed.
        """
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this step.

        A true return value stops the chain for this message; a false one
        hands the message over to the next filter.
        """
        return None

    def finalize(self):
        """Hook invoked by the processor once every message has been seen.

        Subclasses display their results and close their descriptors here.
        """
        return None
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header. When the header
    cannot be decoded (unknown charset, bytes that are invalid for the
    declared charset, malformed encoded word), the raw header value is
    returned instead of raising, so that one broken message does not abort
    the processing of the whole mbox.
    """
    # Import the submodules explicitly: "import email" alone does not
    # guarantee that email.header and email.errors are loaded.
    from email.errors import MessageError
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    text_type = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs;
        # parts without a declared charset are treated as UTF-8.
        decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decode_header(raw_subject)]
        return text_type(make_header(decoded_seq))
    except (MessageError, LookupError, UnicodeError):
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return text_type(raw_subject)
123
58e64caf
AA
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    *bounce* is an email.Message expected to be a multipart/report whose
    second part is a message/delivery-status. The address is reported when
    the action is "failed" and either the status is a permanent failure
    (5.x.x), the diagnostic code comes from Postfix itself, or the status or
    diagnostic code contains a known failure hint.

    Returns None or the email address. Missing or malformed fields make the
    function return None (after printing the reason) instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None

    # Extract the second component of the multipart/report. The payload is
    # only a list of parts when the message really is multipart.
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    delivery_status = bounce.get_payload(1)
    if delivery_status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % delivery_status.get_content_type())
        return None

    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(delivery_status.get_payload()) if delivery_status.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = delivery_status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        # Be nice, test the "LOCAL;" form when the standard one fails
        recipient_match = (_recipient_re.search(final_recipient)
                           or _recipient_re2.search(final_recipient))
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status_code = content['Status']
    diag_code = content['Diagnostic-Code']
    # A missing Status field is handled like an empty one
    status_text = (status_code or '').strip()

    # Permanent failure state
    if status_text.startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = (
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    )
    if 'quota' in status_text.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status_code)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
58e64caf 199
15f4834d
NI
200
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    *bounce* must be a text/plain message sent by a MAILER-DAEMON address,
    with one of the known bounce subjects. Only the 15 first lines of the
    body are inspected: they must contain a hint for non-delivery and a hint
    for a permanent error, and the first email address found in them is
    returned.

    Returns None (after printing the reason) when the message does not match
    or when no address could be extracted. Missing From or Subject headers
    are handled like non-matching ones.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None

    subject = findSubject(bounce)
    # A message without subject is handled like an unknown subject
    subject = '' if subject is None else subject.lower()
    known_subjects = (
        'failure notice',
        'undeliverable message',
        'delivery status notification (failure)',
    )
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(hint in line for line in lines for hint in non_delivery_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(hint in line for line in lines for hint in permanent_error_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of an address
    # standing alone on its line
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    standalone_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or standalone_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
267
58e64caf
AA
268#----------------------------------------------------------------------------#
269
class DirectBouncesFilter(MboxFilter):
    """Claims the bounces that carry no X-Spam-Flag header.

    Confirmed bounces are saved in "<mbox>.bounced" and their faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an emptied "<mbox_file>.bounced"."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim *message* when it is a direct bounce or a mailman notice.

        Returns True when the message was saved or deliberately dropped, and
        False when it must go through the next filters.
        """
        if message['X-Spam-Flag'] is None:
            # During finalization, we will verify that all messages were processed
            self.seen += 1
            # Special case: ignore mailman notifications for the mailing-list
            # on which the NL is forwarded
            if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
                print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
                self.seen -= 1
                return True
            # Additional checks, just to be sure
            elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                    or message['Subject'] != 'Undelivered Mail Returned to Sender':
                print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            else:
                address = findAddressInBounce(message)
                if address is not None:
                    self.emails.append(address)
                    self.mbox.add(message)
                    return True
                else:
                    print('! => No email found in direct bounce, this is really bad.')
                    self.bad_problems += 1
        return False

    def finalize(self):
        """Print the counters and the collected addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: a bare "\ " is an invalid
            # escape sequence. The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
320
321#----------------------------------------------------------------------------#
322
class SpamFilter(MboxFilter):
    """Moves the messages that bogofilter flagged as spam to "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.spam" mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim *message* when its X-Spam-Flag header says it is a spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of spams found, then close the mailbox."""
        report = (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        )
        for line in report:
            print(line)
        self.mbox.close()
342
343#----------------------------------------------------------------------------#
344
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter could not classify to "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.unsure" mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim *message* when its X-Spam-Flag header says "Unsure"."""
        flag = message['X-Spam-Flag']
        unsure = flag is not None and flag.startswith('Unsure, tests=bogofilter')
        if unsure:
            self.mbox.add(message)
        return unsure

    def finalize(self):
        """Print the number of unclassified messages, then close the mailbox."""
        report = (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        )
        for line in report:
            print(line)
        self.mbox.close()
364
365#----------------------------------------------------------------------------#
366
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages that reach this step without a "No" spam flag.

    This filter never claims a message: it only reports, during
    finalization, how many messages had an unexpected X-Spam-Flag header.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count *message* when it is not flagged as non-spam; always False."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Print whether every message carried one of the expected flags."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
384
385#----------------------------------------------------------------------------#
386
class OutOfOfficeFilter(MboxFilter):
    """Moves the out-of-office auto-replies to "<mbox>.ooo".

    Detection relies on the subject, and for "Re: " replies on the
    Delivered-To headers or on the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.ooo" and compile the subject regexes."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim *message* when it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the
            # explicit empty default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Print the number of auto-replies found, then close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
436
437#----------------------------------------------------------------------------#
438
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications.

    Notifications from which a faulty address was extracted are saved in
    "<mbox>.dsn"; multipart/report messages without a usable address are
    saved in "<mbox>.dsn-temp".
    """

    def initialize(self, mbox_file):
        """Open emptied "<mbox_file>.dsn" and "<mbox_file>.dsn-temp" mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim *message* when it is a delivery status notification.

        Returns True when the message was saved in one of the two mailboxes,
        and False when it must go through the next filters.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. The payload is only a list of
        # parts when the message really is multipart.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            attachments = message.get_payload()
            if attachments:
                report_message = attachments[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email.
        # A message without From header cannot come from a mailer daemon.
        sender = message['From'] or ''
        if 'MAILER-DAEMON@' in sender.upper() and report_message.get_content_type() == 'text/plain':
            address = findAddressInPlainBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the results and the collected addresses, then close both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
490
491#----------------------------------------------------------------------------#
492
class CatchAllFilter(MboxFilter):
    """Saves in "<mbox>.catchall" every message that reaches this step."""

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.catchall" mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save *message* unconditionally and stop the chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the results; the mailbox file is removed when it is empty."""
        caught = len(self.mbox)
        if not caught > 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % caught)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
514
515#----------------------------------------------------------------------------#
516
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    if not len(sys.argv) == 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    # Refuse to go further when that path does not exist.
    if os.path.exists(sys.argv[1]):
        processor = MboxProcessor(sys.argv[1])
        processor.run()
    else:
        print('No such file: %s' % sys.argv[1])
        sys.exit(1)