95110b9923b761d9f42f3a84ad6e5b2655a5c5a4
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email
36 import mailbox
37 import os
38 import re
39 import sys
40 import time
41
42 #----------------------------------------------------------------------------#
43
class MboxProcessor(object):
    """Applies a series of filters to each message in a mbox.

    Filters are tried in the order of ``self.filters``; for a given message
    the chain stops at the first filter whose ``process`` returns a true value.
    """

    def __init__(self, mbox):
        """Opens the mbox and builds the filter chain.

        mbox is the path of the mbox file to process; the same path is later
        handed to the ``initialize`` method of every filter.
        """
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts every message it is given,
        # so it has to stay at the end of the chain.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    @staticmethod
    def _now():
        """Returns a timestamp in seconds, only meaningful to compute durations.

        time.clock() was removed in Python 3.8: use time.perf_counter() where
        it exists and fall back to time.time() on older interpreters.
        """
        return getattr(time, 'perf_counter', time.time)()

    def initialize_filters(self):
        """Initializes every filter and records the start time of the run."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._now()

    def apply_filters(self, message):
        """Runs message through the chain.

        Returns True as soon as one filter accepted the message (the
        remaining filters are not called), False if none did."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the global summary, then lets each filter print its own."""
        duration = self._now() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock.

        The mbox is unlocked and closed even if a filter raises."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            self.mbox.unlock()
            self.mbox.close()
86
87 #----------------------------------------------------------------------------#
88
class MboxFilter:
    """Base class listing the hooks a filter can implement.

    Every hook defaults to a no-op that returns None, so a subclass only
    overrides the ones it needs."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor once, before any message is seen.

        mbox_file is the path of the mbox being processed.
        Open here whatever descriptors the filter needs while processing."""
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        (such as the default None) hands the message to the next filter."""
        return None

    def finalize(self):
        """Hook invoked by the processor once, after the last message.

        Display the results and close every descriptor here."""
        return None
109
110 #----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.

    RFC 2047 encoded words are decoded. A subject that declares an unknown
    charset, or whose bytes are invalid for the declared charset, does not
    raise: it is decoded leniently, undecodable bytes being replaced by
    U+FFFD."""
    # Import the submodule explicitly: a bare "import email" does not load
    # email.header by itself.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    to_text = unicode if sys.version_info < (3,) else str

    # decode_header returns a list of (decoded_string, charset) pairs
    chunks = email.header.decode_header(raw_subject)
    try:
        header = email.header.make_header(
            [(text, charset or 'utf-8') for text, charset in chunks])
        return to_text(header)
    except (LookupError, UnicodeError):
        # LookupError: unknown charset name; UnicodeError: bytes that do not
        # match the declared charset. Fall back to a lenient decoding
        # instead of aborting the whole run on one broken header.
        pass

    def lenient(text, charset):
        # Chunks that are already text (Python 3, unencoded parts) are kept.
        if not isinstance(text, bytes):
            return text
        try:
            return text.decode(charset or 'utf-8', 'replace')
        except LookupError:
            return text.decode('utf-8', 'replace')

    return u''.join(lenient(text, charset) for text, charset in chunks)
123
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    bounce is an email.Message expected to be a multipart/report whose second
    part is a message/delivery-status. Only the first per-recipient block of
    that part is examined.

    Returns the address found in the Final-Recipient field when the failure
    looks permanent (Status starting with 5, X-Postfix diagnostic code, or a
    known hint such as a full mailbox). Returns None, after printing the
    reason, in every other case, including when a field is missing."""

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A message may declare multipart/report without having been parsed into
    # parts (e.g. missing boundary): its payload is then a plain string.
    parts = bounce.get_payload() if bounce.is_multipart() else []
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status_part = parts[1]
    if status_part.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_part.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = status_part.get_payload() if status_part.is_multipart() else []
    if len(blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(blocks))
        return None
    content = blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address; a missing field is treated like a
    # field that matches neither regexp.
    final_recipient = content['Final-Recipient'] or ''
    # Be nice, test another regexp if the first one fails
    recipient_match = (_recipient_re.search(final_recipient)
                       or _recipient_re2.search(final_recipient))
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status = content['Status']
    diag_code = content['Diagnostic-Code']
    # A missing Status field cannot denote a permanent failure, but the
    # diagnostic code may still carry a usable hint.
    status_text = status or ''

    # Permanent failure state
    if status_text[:1] == '5':
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = (
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    )
    if 'quota' in status_text.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
199
200
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email.

    bounce is an email.Message. It is only accepted when it comes from a
    MAILER-DAEMON address, is text/plain, has one of the known bounce
    subjects, and when the first 15 lines of its body contain both a
    non-delivery hint and a permanent-error hint.

    Returns the first email address found in those lines, or None (after
    printing the reason) when the message is rejected or when no usable
    address is found. A missing From or Subject header is a rejection, not
    an error."""
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # findSubject returns None when there is no Subject header
    subject = (findSubject(bounce) or '').lower()
    known_subjects = (
        "delivery status notification (failure)",
        "failure notice",
        "returned mail: see transcript for details",
        "undeliverable message",
        "undelivered mail returned to sender",
    )
    if subject not in known_subjects and not subject.startswith('mail delivery failed'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = (bounce.get_payload() or '').splitlines()[:15]

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure.
    # NOTE(review): the message below announces a move to the dsn-temp mbox,
    # but this function only returns None; where the message ends up is
    # decided by the caller - confirm that it matches the announcement.
    if any("ALTOSPAM which is used by the person" in line for line in lines):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = (
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    )
    if not any(hint in line for line in lines for hint in non_delivery_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = (
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    )
    if not any(hint in line for line in lines for hint in permanent_error_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of a line made
    # of a single address. The patterns are compiled once, outside the loop.
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    bare_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or bare_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>,
            # so we missed the real one: give up on this message
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
279
280 #----------------------------------------------------------------------------#
281
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header.

    Such messages are expected to be bounces generated by the local
    MAILER-DAEMON. Confirmed bounces are saved in "<mbox>.bounced" and the
    bounced addresses are listed during finalization."""

    def initialize(self, mbox_file):
        """Resets the counters and opens an emptied "<mbox_file>.bounced" mbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Accepts mailman notifications (dropped) and confirmed direct bounces.

        Returns True when the message was handled here, False to let the
        next filter examine it."""
        if message['X-Spam-Flag'] is not None:
            return False

        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded. They are not counted as seen.
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True

        # During finalization, we will verify that all messages were processed
        self.seen += 1

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the counters and the bounced addresses, then closes the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: "\ " is an invalid escape
            # sequence. The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
332
333 #----------------------------------------------------------------------------#
334
class SpamFilter(MboxFilter):
    """Puts aside, in "<mbox>.spam", the messages flagged as spam.

    A message is a spam when its X-Spam-Flag header starts with
    "Yes, tests=bogofilter"."""

    def initialize(self, mbox_file):
        """Opens an emptied "<mbox_file>.spam" mbox."""
        destination = '%s.spam' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Saves the message and returns True if it is flagged as spam."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of spams found and closes the mbox."""
        summary = (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        )
        for line in summary:
            print(line)
        self.mbox.close()
354
355 #----------------------------------------------------------------------------#
356
class UnsureFilter(MboxFilter):
    """Puts aside, in "<mbox>.unsure", the messages the antispam was unsure about.

    A message is unsure when its X-Spam-Flag header starts with
    "Unsure, tests=bogofilter"."""

    def initialize(self, mbox_file):
        """Opens an emptied "<mbox_file>.unsure" mbox."""
        destination = '%s.unsure' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Saves the message and returns True if it is flagged as unsure."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of unsure messages found and closes the mbox."""
        summary = (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        )
        for line in summary:
            print(line)
        self.mbox.close()
376
377 #----------------------------------------------------------------------------#
378
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is not "No, tests=bogofilter...".

    This filter never accepts a message: it only keeps a counter that is
    reported during finalization."""

    def initialize(self, mbox_file):
        """Resets the counter; mbox_file is not used."""
        self.seen = 0

    def process(self, message):
        """Counts the message if it is not flagged as non-spam. Always returns False."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether unexpected X-Spam-Flag values were encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
396
397 #----------------------------------------------------------------------------#
398
class OutOfOfficeFilter(MboxFilter):
    """Puts aside, in "<mbox>.ooo", the out-of-office auto-replies.

    Detection relies on the subject and, for replies starting with "Re: ",
    on the Delivered-To headers and on the first line of a text/plain body."""

    def initialize(self, mbox_file):
        """Opens an emptied "<mbox_file>.ooo" mbox and compiles the subject regexps."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?', # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Saves the message and returns True if it looks like an auto-reply.

        Returns False otherwise, in particular when the message has no
        subject."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all returns None when the header is absent unless a
            # default is given, and None cannot be searched with "in".
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                # An empty body has no first line at all
                body_lines = (message.get_payload() or '').splitlines()
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Prints the number of auto-replies found and closes the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
449
450 #----------------------------------------------------------------------------#
451
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the delivery status notifications (DSN).

    Notifications from which a bounced address could be extracted are saved
    in "<mbox>.dsn"; multipart/report messages that gave no address are saved
    in "<mbox>.dsn-temp"."""

    def initialize(self, mbox_file):
        """Opens emptied "<mbox_file>.dsn" and "<mbox_file>.dsn-temp" mboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Saves the message and returns True if it is a delivery report.

        Returns False when the message is neither a multipart/report nor a
        plain text bounce from which an address could be extracted."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. The payload is only indexed
        # when the message was really parsed into at least one part.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            parts = message.get_payload()
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email.
        # findAddressInPlainBounce reads the From header of the message it is
        # given, so an embedded part without that header is not passed to it.
        sender = message['From'] or ''
        if ('MAILER-DAEMON@' in sender.upper()
                and report_message.get_content_type() == 'text/plain'
                and report_message['From'] is not None):
            address = findAddressInPlainBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the results for both mboxes and closes them."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
503
504 #----------------------------------------------------------------------------#
505
class CatchAllFilter(MboxFilter):
    """Saves in "<mbox>.catchall" every message that reaches it."""

    def initialize(self, mbox_file):
        """Opens an emptied "<mbox_file>.catchall" mbox."""
        destination = '%s.catchall' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Saves the message unconditionally. Always returns True."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the number of messages caught and closes the mbox.

        The mbox file is deleted when no message was caught."""
        if not len(self.mbox) > 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
526 os.unlink(self.mbox_file)
527
528 #----------------------------------------------------------------------------#
529
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()