NL bounces: try findAddressInPlainBounce if findAddressInBounce fails because the...
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in the order of ``self.filters``; the first one
    whose ``process`` returns a true value claims the message and the
    remaining filters are skipped for that message.
    """

    def __init__(self, mbox):
        """Open the mbox file at path ``mbox`` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    @staticmethod
    def _cpu_time():
        """Return a processor-time reading on both Python 2 and Python 3."""
        # time.clock() was removed in Python 3.8; time.process_time() is its
        # replacement. Fall back to time.clock() only where process_time
        # does not exist (Python 2).
        timer = getattr(time, 'process_time', None) or time.clock
        return timer()

    def initialize_filters(self):
        """Initialize every filter, then record the start time."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._cpu_time()

    def apply_filters(self, message):
        """Run ``message`` through the chain; return True if a filter claimed it."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the processing summary and let each filter print its report."""
        duration = self._cpu_time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Lock the mbox, filter every message, then unlock and close it."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Interface shared by all filters; every hook here does nothing."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor before any message is processed.

        Subclasses open here whatever descriptors they need while processing.
        """
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        hands the message to the next filter.
        """
        return None

    def finalize(self):
        """Hook invoked by the processor once processing is over.

        Subclasses display their results and close their descriptors here.
        """
        return None
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header. Header parts
    without a declared charset are treated as UTF-8.
    """
    # "import email" alone does not guarantee that the email.header submodule
    # is loaded, so import it explicitly instead of relying on another module
    # having pulled it in.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = email.header.decode_header(raw_subject)
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
    header = email.header.make_header(decoded_seq)
    # Be Python 2 & 3 compatible
    return unicode(header) if sys.version_info < (3,) else str(header)
123
58e64caf
AA
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    ``bounce`` is an email.Message; only the first per-recipient group of
    its delivery-status part is examined.
    Returns None or the email address. Missing or malformed Final-Recipient,
    Action or Status fields make the function return None instead of raising.
    """

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = len(bounce.get_payload())
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)

    # If the second part is of type "message/rfc822" it is the undelivered message.
    # Let's try to understand the text part
    if status.get_content_type() == 'message/rfc822':
        text_bounce = bounce.get_payload(0)
        if text_bounce.get_content_type() == 'text/plain':
            return findAddressInPlainBounce(text_bounce, bounce)
        # If it's not a text message, let's continue to the next error message

    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload())
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address. The header may be absent, in which
    # case there is nothing to match against.
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        # Be nice, test another regexp when the standard one fails
        recipient_match = (_recipient_re.search(final_recipient)
                           or _recipient_re2.search(final_recipient))
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status_code = content['Status']
    diag_code = content['Diagnostic-Code']

    # Permanent failure state (status class 5). Comparing the first character
    # avoids raising on an empty or non-numeric status.
    if status_code is not None and status_code.strip()[:1] == '5':
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    ]
    if status_code is not None and 'quota' in status_code.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status_code)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
58e64caf 208
15f4834d 209
def findAddressInPlainBounce(bounce, real_bounce=None):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    ``bounce`` is the text/plain part to scan. ``real_bounce`` is the full
    email carrying the From and Subject headers when ``bounce`` is only one
    MIME part of it; it defaults to ``bounce``.
    Returns None or the email address. A missing From or Subject header is
    reported as an invalid bounce instead of raising.
    """
    # Compare against None explicitly: the truth value of an email.Message is
    # its number of headers, so "real_bounce or bounce" is not a reliable test.
    if real_bounce is None:
        real_bounce = bounce

    sender = real_bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in str(sender).upper():
        # Report the header that was actually checked (the full email's one).
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None

    subject = findSubject(real_bounce)
    if subject is not None:
        subject = subject.lower()
    known_subjects = [
        "delivery status notification (failure)",
        "failure notice",
        "returned mail: see transcript for details",
        "undeliverable message",
        "undelivered mail returned to sender",
    ]
    if subject is None or (subject not in known_subjects
                           and not subject.startswith('mail delivery failed')):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    def _mentions(hints):
        """Tell whether any of the hints appears in any of the scanned lines."""
        return any(hint in line for line in lines for hint in hints)

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
    # but put this message in the dsn-temp mailbox so that it can be processed by hand.
    if _mentions(["ALTOSPAM which is used by the person"]):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not _mentions(non_delivery_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "I'm sorry to have to inform you that your message could not",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not _mentions(permanent_error_hints):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of a line made
    # of a bare address only. The patterns are compiled once, outside the loop.
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    bare_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or bare_re.match(line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                break
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
290
58e64caf
AA
291#----------------------------------------------------------------------------#
292
class DirectBouncesFilter(MboxFilter):
    """Handles messages that carry no X-Spam-Flag header.

    Confirmed bounces are saved in '<mbox>.bounced' and their faulty
    addresses are listed by finalize().
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an emptied '<mbox_file>.bounced' mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a confirmed direct bounce.

        Returns True when the message was saved as a bounce, or dropped as a
        mailman notification; returns False in every other case.
        """
        if message['X-Spam-Flag'] is None:
            # During finalization, we will verifiy that all messages were processed
            self.seen += 1
            # Special case: ignore mailman notifications for the mailing-list
            # on which the NL is forwarded
            if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
                print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
                self.seen -= 1
                return True
            # Additionnal checks, just to be sure
            elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                    or message['Subject'] != 'Undelivered Mail Returned to Sender':
                print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            else:
                # Named "address" so that the email module is not shadowed.
                address = findAddressInBounce(message)
                if address is not None:
                    self.emails.append(address)
                    self.mbox.add(message)
                    return True
                else:
                    print('! => No email found in direct bounce, this is really bad.')
                    self.bad_problems += 1
        return False

    def finalize(self):
        """Print the counters and the bounced addresses, then close the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Both backslashes are escaped: "\ " is an invalid escape sequence
            # that recent Python versions warn about. The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
343
344#----------------------------------------------------------------------------#
345
class SpamFilter(MboxFilter):
    """Saves messages flagged as spam by bogofilter in '<mbox>.spam'."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.spam' mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag starts with the spam verdict."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of saved spams, then close the mailbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
365
366#----------------------------------------------------------------------------#
367
class UnsureFilter(MboxFilter):
    """Saves messages bogofilter could not classify in '<mbox>.unsure'."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.unsure' mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag starts with the unsure verdict."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of saved messages, then close the mailbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
387
388#----------------------------------------------------------------------------#
389
class CheckNonSpamFilter(MboxFilter):
    """Counts messages whose X-Spam-Flag is not the non-spam verdict.

    This filter never claims a message: process() always returns False.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count the message unless its X-Spam-Flag starts with the non-spam verdict."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
407
408#----------------------------------------------------------------------------#
409
class OutOfOfficeFilter(MboxFilter):
    """Saves out-of-office auto-replies in '<mbox>.ooo'.

    Detection relies on the subject, and for "Re: " replies on the
    Delivered-To header and the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.ooo' mailbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim the message when it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        # Every heuristic below needs a subject.
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the
            # explicit empty-list default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect.
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Print the number of saved auto-replies, then close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
460
461#----------------------------------------------------------------------------#
462
class DeliveryStatusNotificationFilter(MboxFilter):
    """Saves delivery status notifications and collects the bounced addresses.

    Notifications yielding an address go to '<mbox>.dsn'; multipart/report
    messages from which no address could be extracted go to '<mbox>.dsn-temp'.
    """

    def initialize(self, mbox_file):
        """Open emptied '<mbox_file>.dsn' and '<mbox_file>.dsn-temp' mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim the message when it is a delivery status notification.

        Returns True for every multipart/report message, and for plain-text
        MAILER-DAEMON reports from which an address could be extracted.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment. is_multipart() guards against a
        # malformed multipart/mixed whose payload is a plain string, for which
        # get_payload(0) would raise.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            report_message = message.get_payload(0)

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in str(sender).upper() \
                and report_message.get_content_type() == 'text/plain':
            # Pass the full message too: when the report is only a MIME part,
            # the From and Subject headers live on the enclosing message.
            address = findAddressInPlainBounce(report_message, message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the results and the bounced addresses, then close both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
514
515#----------------------------------------------------------------------------#
516
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: saves every message it receives in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.catchall' mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save the message unconditionally and claim it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report what was caught; delete the mailbox file when nothing was."""
        caught = len(self.mbox)
        if caught <= 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing was saved, so the empty mailbox file is removed.
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % caught)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
538
539#----------------------------------------------------------------------------#
540
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = sys.argv[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()