3335c1201acec0307392e5b9d92c4c83d849a71f
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email
36 import mailbox
37 import os
38 import re
39 import sys
40 import time
41
42 #----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; as soon as one of them
    returns a true value from ``process``, the remaining filters are skipped
    for that message.
    """

    def __init__(self, mbox):
        """Open the mbox located at path ``mbox`` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: the catch-all claims every message it sees, so it
        # has to stay last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initialize every filter and record when processing started."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.clock() was removed in Python 3.8; time.time() is available on
        # every Python version and is precise enough for a summary line.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Run ``message`` through the chain.

        Returns True if some filter claimed the message, False otherwise.
        ``any`` short-circuits, so filters after the claiming one never see
        the message.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the global summary, then let each filter print its own."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
86
87 #----------------------------------------------------------------------------#
88
class MboxFilter:
    """Base interface shared by all the filters of the processing chain.

    Every hook is a no-op here, so a subclass only overrides what it needs.
    """

    def initialize(self, mbox_file):
        """Hook invoked by the processor once, before any message is seen.

        ``mbox_file`` is the path of the mbox being processed. Descriptors
        needed during processing should be opened here.
        """

    def process(self, message):
        """Hook invoked by the processor for each message reaching this step.

        A true return value ends the processing of the message; a false one
        passes it on to the next filter.
        """

    def finalize(self):
        """Hook invoked by the processor once all messages were handled.

        Results should be displayed and descriptors closed here.
        """
109
110 #----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header. If the header
    cannot be decoded (malformed encoded word, unknown charset, bytes that
    are invalid for the declared charset), the raw header text is returned
    instead of raising, so that one broken message does not abort a whole
    mbox run.
    """
    # Imported here so the function does not depend on email.header having
    # been loaded as a side effect of another import.
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    text_type = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs;
        # parts without an explicit charset are assumed to be UTF-8.
        decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decode_header(raw_subject)]
        return text_type(make_header(decoded_seq))
    except (HeaderParseError, LookupError, UnicodeError):
        # Fall back to the undecoded header; replace undecodable bytes
        # (Python 2 byte strings) rather than failing again.
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return text_type(raw_subject)
123
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    ``bounce`` is expected to be a multipart/report message whose second
    part is a message/delivery-status. The address is read from the
    Final-Recipient field of the first per-recipient block, and is only
    returned for a failed action with a permanent (5.x.x) status or an
    X-Postfix diagnostic code.

    Returns None or the email address. Every rejection prints the reason;
    missing or malformed fields are rejected instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A malformed message may declare a multipart type yet carry a plain
    # string payload: count that as zero parts.
    parts = bounce.get_payload()
    num_payloads = len(parts) if isinstance(parts, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = status.get_payload()
    num_payloads = len(blocks) if isinstance(blocks, list) else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address (the field may be absent)
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    permanent = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
174
175
# Address patterns used by findAddressInPlainBounce, compiled once:
# an address between angle brackets anywhere in the line, or a line made of
# a single bare address.
_bracketed_address_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
_bare_address_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')


def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The message must come from a MAILER-DAEMON address, be text/plain and
    have one of the known failure subjects. Its first 15 lines must contain
    both a non-delivery hint and a permanent-error hint; the first address
    found in those lines is then returned.

    Returns None or the email address. Every rejection prints the reason;
    a missing From or Subject header is rejected instead of raising.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    subject = findSubject(bounce)
    if subject is not None:
        subject = subject.lower()
    known_subject = subject is not None and (
        subject in ('failure notice',
                    'undeliverable message',
                    'delivery status notification (failure)')
        or subject.startswith('mail delivery failed'))
    if not known_subject:
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurrence of <email@example.com>
    for line in lines:
        match = _bracketed_address_re.match(line) or _bare_address_re.match(line)
        if match is None:
            continue
        address = match.group(1)
        if address.endswith('@polytechnique.org'):
            # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
            break
        return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
242
243 #----------------------------------------------------------------------------#
244
class DirectBouncesFilter(MboxFilter):
    """Handles messages without an X-Spam-Flag header (direct bounces).

    Confirmed bounces are saved in ``<mbox>.bounced`` and their faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an empty ``<mbox_file>.bounced`` mbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a confirmed direct bounce.

        Returns True for confirmed bounces and for the mailman notifications
        that are deliberately dropped; False otherwise, so that the message
        goes to the next filter.
        """
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        if message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the statistics and the bounced addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: '\ ' is an invalid escape
            # sequence. The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
295
296 #----------------------------------------------------------------------------#
297
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to ``<mbox>.spam``."""

    def initialize(self, mbox_file):
        """Open ``<mbox_file>.spam`` and empty it."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag header says it is spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report how many spams were saved, then close the mbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
317
318 #----------------------------------------------------------------------------#
319
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter could not classify to ``<mbox>.unsure``."""

    def initialize(self, mbox_file):
        """Open ``<mbox_file>.unsure`` and empty it."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag header says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report how many unsure messages were saved, then close the mbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
339
340 #----------------------------------------------------------------------------#
341
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages that bogofilter did not explicitly mark as non-spam.

    This filter never claims a message: it only keeps a counter that is
    reported during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counter (``mbox_file`` is not used by this filter)."""
        self.seen = 0

    def process(self, message):
        """Count the message unless its X-Spam-Flag starts with "No"; always return False."""
        flag = message['X-Spam-Flag']
        marked_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not marked_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether unexpected messages were encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
359
360 #----------------------------------------------------------------------------#
361
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies to ``<mbox>.ooo``.

    Detection relies on the subject, and for "Re: " replies on the
    Delivered-To header or the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Open an empty ``<mbox_file>.ooo`` mbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim the message if it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the
            # explicit empty default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect.
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Report how many auto-replies were saved, then close the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
411
412 #----------------------------------------------------------------------------#
413
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts delivery status notifications.

    Notifications yielding a faulty address are saved in ``<mbox>.dsn``;
    multipart/report messages from which no address could be extracted go
    to ``<mbox>.dsn-temp``.
    """

    def initialize(self, mbox_file):
        """Open empty ``<mbox_file>.dsn`` and ``<mbox_file>.dsn-temp`` mboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim the message if it is a delivery status notification."""
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. A malformed multipart/mixed
        # message may have a plain string or an empty payload; in that case
        # the message itself is inspected.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            parts = message.get_payload()
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email.
        # The From header may be missing altogether.
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in sender.upper():
            address = findAddressInPlainBounce(message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, then close both mboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
465
466 #----------------------------------------------------------------------------#
467
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: stores every message in ``<mbox>.catchall``."""

    def initialize(self, mbox_file):
        """Open ``<mbox_file>.catchall`` and empty it."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Store the message and always claim it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the leftovers, or delete the mbox file if there are none."""
        leftovers = len(self.mbox)
        if leftovers > 0:
            print('%d messages reached the catchall.' % leftovers)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
            self.mbox.close()
            return

        print('No messages reached the catchall. Nice.')
        self.mbox.close()
        # Nothing was caught: do not leave an empty file behind.
        os.unlink(self.mbox_file)
488 os.unlink(self.mbox_file)
489
490 #----------------------------------------------------------------------------#
491
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = arguments[0]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()
502 processor = MboxProcessor(sys.argv[1])
503 processor.run()