NL bounces: add ALTOSPAM detection
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in list order for every message; as soon as one
    filter's process() returns a true value, the remaining filters are
    skipped for that message.
    """

    # time.clock() was removed in Python 3.8.  Use perf_counter() when the
    # interpreter provides it and fall back to time.time() otherwise, so the
    # script keeps running on both old and current interpreters.
    _timer = staticmethod(getattr(time, 'perf_counter', time.time))

    def __init__(self, mbox):
        """Open the mbox at path `mbox` and build the ordered filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: the first filter returning true claims the message.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    def initialize_filters(self):
        """Initialize every filter with the mbox path and start the timer."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._timer()

    def apply_filters(self, message):
        """Run `message` through the chain; return True if a filter took it."""
        # any() short-circuits, so filters after the first match are not run.
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the processing summary, then let each filter report."""
        duration = self._timer() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Lock the mbox, process every message, then unlock and close it."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Base interface implemented by every filter of the processing chain.

    The three hooks below do nothing by default; concrete filters override
    the ones they need.
    """

    def initialize(self, mbox_file):
        """Hook invoked by the processor before any message is handled.

        Filters open here whatever descriptors they need while processing.
        """
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for the message; a false one
        hands the message over to the next filter.
        """
        return None

    def finalize(self):
        """Hook invoked by the processor once every message has been handled.

        Filters display their results and close their descriptors here.
        """
        return None
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.  RFC 2047 encoded
    words are decoded; parts without a declared charset are treated as UTF-8.
    If the header declares an unknown charset or carries bytes that do not
    match its charset, the parts are decoded leniently instead of raising,
    so that one odd message cannot abort the processing of a whole mailbox.
    """
    # Imported explicitly: "import email" alone does not guarantee that the
    # email.header submodule has been loaded.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = email.header.decode_header(raw_subject)
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
    try:
        header = email.header.make_header(decoded_seq)
        # Be Python 2 & 3 compatible
        return unicode(header) if sys.version_info < (3,) else str(header)
    except (LookupError, UnicodeError):
        # Unknown charset name or undecodable bytes: decode each part on a
        # best-effort basis, replacing what cannot be decoded.
        parts = []
        for chunk, charset in decoded_seq:
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(charset, 'replace')
                except LookupError:
                    chunk = chunk.decode('utf-8', 'replace')
            parts.append(chunk)
        return ''.join(parts)
123
58e64caf
AA
124
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    The message must be a multipart/report whose second part is a
    message/delivery-status; only the first per-recipient block of that
    part is examined.  The address is returned when the action is "failed"
    and either the status class is 5, the diagnostic code starts with
    "X-Postfix", the status mentions a quota, or the diagnostic code
    contains one of the known failure hints.

    Missing or malformed fields are reported on stdout and make the
    function return None instead of raising.

    Returns None or the email address."""

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.  A message can
    # claim a multipart type and still carry a plain string payload, so only
    # trust the payload when it really is a list of parts.
    report_parts = bounce.get_payload() if bounce.is_multipart() else []
    if len(report_parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(report_parts))
        return None
    status_part = report_parts[1]
    if status_part.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_part.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    status_blocks = status_part.get_payload() if status_part.is_multipart() else []
    if len(status_blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(status_blocks))
        return None
    content = status_blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        # Be nice, test another regexp when the standard one does not match
        recipient_match = (_recipient_re.search(final_recipient)
                           or _recipient_re2.search(final_recipient))
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # A missing Status field is handled as an empty, non-permanent status.
    status = content['Status'] or ''
    diag_code = content['Diagnostic-Code']

    # Permanent failure state
    if status.strip().startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    failure_hints = [
        "insufficient system storage",
        "mailbox full",
        "user unknown",
    ]
    if 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
58e64caf 199
15f4834d
NI
200
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The message must come from a MAILER-DAEMON address, be text/plain and
    carry one of the known bounce subjects.  Only the first 15 lines of the
    body are inspected: they must contain a non-delivery hint and a
    permanent-error hint, and the first email address found in them is
    returned.

    Messages without a From or Subject header are rejected (None is
    returned) instead of raising.

    Returns None or the email address.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None

    subject = findSubject(bounce)
    if subject is not None:
        subject = subject.lower()
    known_subjects = (
        'failure notice',
        'undeliverable message',
        'delivery status notification (failure)',
    )
    if subject is None or not (subject in known_subjects
                               or subject.startswith('mail delivery failed')):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
    # but put this message in the dsn-temp mailbox so that it can be processed by hand.
    if any("ALTOSPAM which is used by the person" in line for line in lines):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Match:
    #   A message that you sent could not be delivered to one or more of its recipients.
    #   I'm afraid I wasn't able to deliver your message to the following addresses.
    #   The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Match:
    #   This is a permanent error; I've given up. Sorry it didn't work out.
    #   5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>, or of a line made
    # of a bare address.  Both patterns are compiled once, outside the loop.
    bracketed_re = re.compile(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>')
    bare_re = re.compile(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$')
    for line in lines:
        match = bracketed_re.match(line) or bare_re.match(line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                break
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
274
58e64caf
AA
275#----------------------------------------------------------------------------#
276
class DirectBouncesFilter(MboxFilter):
    """Handles messages carrying no X-Spam-Flag header.

    Such messages are expected to be direct bounces; the ones from which
    an address could be extracted are saved in "<mbox>.bounced".
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an emptied "<mbox_file>.bounced"."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a mailman notification or a bounce.

        Returns True when the message was dropped (mailman notification) or
        saved as a confirmed bounce, False in every other case.
        """
        if message['X-Spam-Flag'] is None:
            # During finalization, we will verifiy that all messages were processed
            self.seen += 1
            # Special case: ignore mailman notifications for the mailing-list
            # on which the NL is forwarded
            if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
                print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
                self.seen -= 1
                return True
            # Additionnal checks, just to be sure
            elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                    or message['Subject'] != 'Undelivered Mail Returned to Sender':
                print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            else:
                address = findAddressInBounce(message)
                if address is not None:
                    self.emails.append(address)
                    self.mbox.add(message)
                    return True
                else:
                    print('! => No email found in direct bounce, this is really bad.')
                    self.bad_problems += 1
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: "\ " is not a valid escape
            # sequence.  The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
327
328#----------------------------------------------------------------------------#
329
class SpamFilter(MboxFilter):
    """Collects the messages flagged as spam by bogofilter in "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Open "<mbox_file>.spam" and empty it."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save and claim the message when its X-Spam-Flag says "Yes"."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report how many spams were saved, then close the mbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
349
350#----------------------------------------------------------------------------#
351
class UnsureFilter(MboxFilter):
    """Collects the messages bogofilter was unsure about in "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Open "<mbox_file>.unsure" and empty it."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save and claim the message when its X-Spam-Flag says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None:
            return False
        if not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report how many unsure messages were saved, then close the mbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
371
372#----------------------------------------------------------------------------#
373
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is not a bogofilter "No".

    This filter never claims a message; it only keeps a counter.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count the message unless it is flagged as non-spam; never claim it."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any unexpectedly flagged message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
391
392#----------------------------------------------------------------------------#
393
class OutOfOfficeFilter(MboxFilter):
    """Detects out-of-office auto-replies and saves them in "<mbox>.ooo".

    Detection relies on the subject, and for "Re: " replies on the
    Delivered-To header or on the first line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Open an emptied "<mbox_file>.ooo" and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^AutoRe( ?):',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?', # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Save and claim the message when it looks like an auto-reply.

        Returns True when the message was saved, False otherwise.
        """
        subject = findSubject(message)
        if subject is None:
            # Every heuristic below needs a subject.
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the
            # explicit empty-list default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect.
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Report how many auto-replies were saved, then close the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
444
445#----------------------------------------------------------------------------#
446
class DeliveryStatusNotificationFilter(MboxFilter):
    """Detects delivery status notifications (DSN).

    Notifications from which a bounced address could be extracted are saved
    in "<mbox>.dsn"; multipart/report messages from which no address could
    be extracted are saved in "<mbox>.dsn-temp".
    """

    def initialize(self, mbox_file):
        """Open the emptied "<mbox_file>.dsn" and "<mbox_file>.dsn-temp"."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Save and claim the message when it is a delivery status report.

        Returns True when the message was saved in one of the two
        mailboxes, False otherwise.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment.  The payload is only indexed
        # when it really is a non-empty list of parts.
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            parts = message.get_payload()
            if parts:
                report_message = parts[0]

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # A message without From header cannot come from a mailer daemon.
        sender = message['From'] or ''
        if 'MAILER-DAEMON@' in sender.upper() and report_message.get_content_type() == 'text/plain':
            address = findAddressInPlainBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
            # NOTE(review): findAddressInPlainBounce logs that ALTOSPAM
            # messages are moved to the dsn-temp mbox, but a None result is
            # not saved here and the message goes on to the next filter --
            # confirm the intended routing.
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, close both mboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
498
499#----------------------------------------------------------------------------#
500
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: saves every message in "<mbox>.catchall"."""

    def initialize(self, mbox_file):
        """Open "<mbox_file>.catchall" and empty it."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save the message unconditionally and claim it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the leftovers; delete the mbox file when nothing was saved."""
        leftover = len(self.mbox)
        if leftover > 0:
            print('%d messages reached the catchall.' % leftover)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if leftover <= 0:
            # Nothing was saved: do not leave an empty file behind.
            os.unlink(self.mbox_file)
522
523#----------------------------------------------------------------------------#
524
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = sys.argv[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()
536 processor.run()