NL bounces: improve DeliveryStatusNotificationFilter with two new DSN types
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; the first filter whose
    process() returns a true value claims the message and the remaining
    filters are skipped for it."""

    def __init__(self, mbox):
        """Opens the mbox file named `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts everything and must stay last
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    def initialize_filters(self):
        """Initializes every filter and records the start time."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() is available on
        # every Python 2 and Python 3 version and measures elapsed wall time
        self.start_time = time.time()

    def apply_filters(self, message):
        """Runs `message` through the filters until one of them claims it.

        Returns True if a filter claimed the message, False otherwise."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the processing summary, then finalizes every filter."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter(object):
    """Defines an interface for filters.

    Subclasses override the three hooks below. The defaults do nothing and
    never claim a message."""

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing."""
        pass

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter."""
        # Explicit boolean so that the default honours the documented contract
        return False

    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
        pass
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header."""
    # `import email` alone does not guarantee that the email.header submodule
    # is loaded, so import it explicitly where it is used
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = email.header.decode_header(raw_subject)
    # Parts without a declared charset are treated as UTF-8
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
    header = email.header.make_header(decoded_seq)
    # Be Python 2 & 3 compatible
    return unicode(header) if sys.version_info < (3,) else str(header)
123
58e64caf
AA
124
# Captures the address of a "Final-Recipient: rfc822; <address>" field value
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    The bounce must be a multipart/report whose second part is a
    message/delivery-status; only the first per-recipient block is read.
    Every rejection is explained on standard output.
    Returns None or the email address."""

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A mislabelled message may carry a plain string payload: count it as 0 parts
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload()) if status.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address (the field may be missing altogether)
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    # A missing or malformed Status is treated as "not permanent" instead of crashing
    status_code = content['Status'] or ''
    if not postfix_error and status_code[:1] != '5':
        print('! Not a permanent failure status (%s).' % content['Status'])
        return None
    return address
174
15f4834d
NI
175
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The bounce must come from a MAILER-DAEMON address, be text/plain and have
    one of the known failure subjects. Only the first 15 lines of the body
    are inspected: they must contain a non-delivery hint, a permanent error
    hint and an email address.
    Every rejection is explained on standard output.
    Returns None or the email address.
    """
    sender = bounce['From']
    # The From header may be missing altogether
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # A missing subject is handled like an unknown one
    subject = (findSubject(bounce) or '').lower()
    known_subjects = (
        'failure notice',
        'undeliverable message',
        'delivery status notification (failure)',
    )
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = (bounce.get_payload() or '').splitlines()[:15]

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of an address
    # standing alone on its line
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    standalone_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or standalone_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
242
58e64caf
AA
243#----------------------------------------------------------------------------#
244
class DirectBouncesFilter(MboxFilter):
    """Handles messages that carry no X-Spam-Flag header.

    Such messages are expected to be bounces sent by the local
    MAILER-DAEMON; the confirmed ones are saved in '<mbox>.bounced'."""

    def initialize(self, mbox_file):
        """Resets the counters and empties the '<mbox>.bounced' mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message if it is a confirmed direct bounce.

        Mailman notifications for newsletter-externes are claimed too, but
        they are dropped instead of being saved."""
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verifiy that all messages were processed
        self.seen += 1
        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True
        # Additionnal checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the bounced addresses, then closes the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Both backslashes are escaped: '\ ' is an invalid escape sequence
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
295
296#----------------------------------------------------------------------------#
297
class SpamFilter(MboxFilter):
    """Saves the messages flagged as spam by bogofilter in '<mbox>.spam'."""

    def initialize(self, mbox_file):
        """Opens the '<mbox>.spam' mailbox and empties it."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header says "Yes"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of spams found and closes the mailbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
317
318#----------------------------------------------------------------------------#
319
class UnsureFilter(MboxFilter):
    """Saves the messages bogofilter could not classify in '<mbox>.unsure'."""

    def initialize(self, mbox_file):
        """Opens the '<mbox>.unsure' mailbox and empties it."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of unclassified messages and closes the mailbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
339
340#----------------------------------------------------------------------------#
341
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is missing or is not a "No".

    This filter never claims a message: it only gathers a sanity statistic."""

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message if it is not flagged as non-spam; always returns False."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
359
360#----------------------------------------------------------------------------#
361
class OutOfOfficeFilter(MboxFilter):
    """Saves out-of-office auto-replies in '<mbox>.ooo'.

    Detection relies on the subject and, for "Re: " replies, on the
    Delivered-To headers and the first line of a text/plain body."""

    def initialize(self, mbox_file):
        """Empties the '<mbox>.ooo' mailbox and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claims the message if it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False
        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if not subject.startswith('Re: '):
            return False

        # Delivered-To: Autoresponder
        # get_all() returns None when the header is absent, hence the default
        if 'Autoresponder' in message.get_all('Delivered-To', []):
            self.mbox.add(message)
            return True
        # Parse content if it is simple enough
        if message.get_content_type() == 'text/plain':
            body_lines = message.get_payload().splitlines()
            # The body may be empty, in which case there is no first line
            firstline = body_lines[0].lower() if body_lines else ''
            if (' absent du bureau ' in firstline
                    or ' away from my office ' in firstline):
                self.mbox.add(message)
                return True

        return False

    def finalize(self):
        """Prints the number of auto-replies found and closes the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
411
412#----------------------------------------------------------------------------#
413
class DeliveryStatusNotificationFilter(MboxFilter):
    """Saves delivery status notifications.

    Notifications from which a failed address could be extracted go to
    '<mbox>.dsn'; the other multipart/report messages go to
    '<mbox>.dsn-temp'."""

    def initialize(self, mbox_file):
        """Resets the address list and empties both DSN mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claims multipart/report messages and recognized plain-text bounces."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment (only if there really is a first part)
        if (message.get_content_type() == 'multipart/mixed'
                and message.is_multipart() and message.get_payload()):
            report_message = message.get_payload(0)

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (the From header may be missing altogether)
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in sender.upper():
            address = findAddressInPlainBounce(message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the statistics and the bounced addresses, then closes both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
465
466#----------------------------------------------------------------------------#
467
class CatchAllFilter(MboxFilter):
    """Saves every message that reaches it in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Opens the '<mbox>.catchall' mailbox and empties it."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Stores the message and always claims it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the caught messages; deletes the mailbox file when it is empty."""
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught <= 0:
            # Nothing was caught: do not leave an empty mbox file behind
            os.unlink(self.mbox_file)
489
490#----------------------------------------------------------------------------#
491
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = arguments[0]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()