2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
47 def __init__(self
, mbox
):
49 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
51 DirectBouncesFilter(),
56 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepare every filter for a run and start the processing timer.

    Calls ``initialize(self.mbox_file)`` on each filter of ``self.filters``,
    in order, then stores the current timer reading in ``self.start_time``.
    """
    for mbox_filter in self.filters:
        mbox_filter.initialize(self.mbox_file)

    # time.clock() was removed in Python 3.8. It is still preferred where it
    # exists so that the stored reading stays comparable with code that
    # subtracts a time.clock() value from it; time.perf_counter() is used on
    # interpreters that no longer provide time.clock().
    timer = getattr(time, 'clock', None) or time.perf_counter
    self.start_time = timer()
def apply_filters(self, message):
    """Run ``message`` through the filter chain until one filter claims it.

    Filters are tried in the order of ``self.filters``. Returns True as soon
    as a filter's ``process`` returns a true value (the remaining filters are
    not called), and False when no filter does.
    """
    for mbox_filter in self.filters:
        if mbox_filter.process(message):
            return True
    return False
def finalize_filters(self):
    """Print the processing time, then let every filter wrap up.

    The duration is measured from the reading stored in ``self.start_time``.
    """
    # time.clock() was removed in Python 3.8. It is still preferred where it
    # exists so that the reading stays comparable with a start_time taken
    # from time.clock(); time.perf_counter() is used on interpreters that no
    # longer provide time.clock().
    timer = getattr(time, 'clock', None) or time.perf_counter
    duration = timer() - self.start_time

    print('Processed the %d messages of %s in %.2fs' %
          (len(self.mbox), self.mbox_file, duration))

    for mbox_filter in self.filters:
        # NOTE(review): the loop body was not visible in the reviewed excerpt.
        # finalize() is assumed from the initialize/process/finalize filter
        # interface -- confirm against the full file.
        mbox_filter.finalize()
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
87 #----------------------------------------------------------------------------#
90 """Defines an interface for filters."""
92 def initialize(self
, mbox_file
):
93 """Called by the processor before processing starts.
95 This is the place to open descriptors required during processing."""
98 def process(self
, message
):
99 """Called by the processor for each message that reaches this step.
101 Return true to stop processing, and false to go to the next filter."""
105 """Called by the processor after processing ends.
107 This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.

    Bounces and spam frequently carry malformed subjects (unknown charset,
    bytes that do not match the declared charset, broken base64). Those must
    not abort the processing of the whole mailbox, so they are decoded
    leniently instead of raising.
    """
    from email.errors import HeaderParseError

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    text_type = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs
        decoded_seq = email.header.decode_header(raw_subject)
    except HeaderParseError:
        # Undecodable encoded-word: keep the raw header text as a single chunk
        decoded_seq = [(raw_subject, None)]
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]

    try:
        return text_type(email.header.make_header(decoded_seq))
    except (LookupError, UnicodeError):
        # Unknown charset name or bytes invalid for the declared charset:
        # fall through to the lenient decoding below.
        pass

    chunks = []
    for subj, enc in decoded_seq:
        if isinstance(subj, bytes):
            try:
                subj = subj.decode(enc, 'replace')
            except LookupError:
                subj = subj.decode('utf-8', 'replace')
        chunks.append(text_type(subj))
    return text_type('').join(chunks)
# Recipient fields of a delivery status look like "rfc822; user@example.org";
# group 1 captures everything after the address-type prefix.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.IGNORECASE | re.UNICODE)
# Some MTAs set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.IGNORECASE | re.UNICODE)
130 def findAddressInBounce(bounce
):
131 """Finds the faulty email address in a bounced email.
133 See RFC 1894 for more information.
134 Returns None or the email address."""
136 # Check that it is a bounce - a few MTA fail to set this correctly :(
137 if bounce
.get_content_type() != 'multipart/report':
138 print('! Not a valid bounce (expected multipart/report, found %s).' % bounce
.get_content_type())
140 # Extract the second component of the multipart/report
141 num_payloads
= len(bounce
.get_payload())
143 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
145 status
= bounce
.get_payload(1)
147 # If the second part is of type "message/rfc822" it is the undelivered message.
148 # Let's try to understand the text part
149 if status
.get_content_type() == 'message/rfc822':
150 text_bounce
= bounce
.get_payload(0)
151 if text_bounce
.get_content_type() == 'text/plain':
152 return findAddressInPlainBounce(text_bounce
, bounce
)
153 # If it's not a text message, let's continue to the next error message
155 if status
.get_content_type() != 'message/delivery-status':
156 print('! Not a valid bounce (expected message/delivery-status, found %s).' % status
.get_content_type())
158 # The per-message-fields don't matter here, get only the per-recipient-fields
159 num_payloads
= len(status
.get_payload())
161 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
163 content
= status
.get_payload(1)
164 if content
.get_content_type() != 'text/plain':
165 print('! Not a valid bounce (expected text/plain, found %s).' % content
.get_content_type())
167 # Extract the faulty email address
168 recipient_match
= _recipient_re
.search(content
['Final-Recipient'])
169 if recipient_match
is None:
170 # Be nice, test another regexp
171 recipient_match
= _recipient_re2
.search(content
['Final-Recipient'])
172 if recipient_match
is None:
173 print('! Missing final recipient.')
175 email
= recipient_match
.group(1)
176 # Check the action field
177 if content
['Action'].lower() != 'failed':
178 print('! Not a failed action (%s).' % content
['Action'])
181 status
= content
['Status']
182 diag_code
= content
['Diagnostic-Code']
184 # Permanent failure state
185 if int(status
[:1]) == 5:
188 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
189 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
193 "insufficient system storage",
195 "requested action aborted: local error in processing",
198 if 'quota' in status
.lower():
200 if diag_code
is not None:
201 ldiag_code
= diag_code
.lower()
202 if any(hint
in ldiag_code
for hint
in failure_hints
):
205 print('! Not a permanent failure status (%s).' % status
)
206 if diag_code
is not None:
207 print('! Diagnostic code was: %s' % diag_code
)
211 def findAddressInWeirdDeliveryStatus(message
):
212 """Finds the faulty email address in the delivery-status part of an email
214 Unlike findAddressInBounce, the status does NOT follow RFC 1894, so
215 try to learn to get data nevertheless...
216 Returns None or the email address.
218 if message
.get_content_type() != 'message/delivery-status':
219 print('! Not a valid weird bounce (expected message/delivery-status, found %s).' % message
.get_content_type())
221 # The per-message-fields don't matter here, get only the per-recipient-fields
222 num_payloads
= len(message
.get_payload())
224 print('! Not a valid weird bounce (expected at least 2 parts, found %d).' % num_payloads
)
226 content
= message
.get_payload(1)
227 # The content may be missing, but interesting headers still present in the first payload...
229 content
= message
.get_payload(0)
230 if 'Action' not in content
:
231 print('! Not a valid weird bounce (unable to find content).')
233 elif content
.get_content_type() != 'text/plain':
234 print('! Not a valid weird bounce (expected text/plain, found %s).' % content
.get_content_type())
237 # Extract the faulty email address
238 if 'Final-Recipient' in content
:
239 recipient_match
= _recipient_re
.search(content
['Final-Recipient'])
240 if recipient_match
is None:
241 # Be nice, test another regexp
242 recipient_match
= _recipient_re2
.search(content
['Final-Recipient'])
243 if recipient_match
is None:
244 print('! Unknown final recipient in weird bounce.')
246 email
= recipient_match
.group(1)
247 elif 'Original-Recipient' in content
:
248 recipient
= content
['Original-Recipient']
249 recipient_match
= _recipient_re
.search(recipient
)
250 if recipient_match
is None:
251 # Be nice, test another regexp
252 recipient_match
= _recipient_re2
.search(recipient
)
253 if recipient_match
is None:
254 recipient_match
= re
.match(r
'<([^>]+@[^@>]+)>', recipient
)
255 if recipient_match
is None:
256 print('! Unknown original recipient in weird bounce.')
258 email
= recipient_match
.group(1)
260 print('! Missing recipient in weird bounce.')
263 # Check the action field
264 if content
['Action'].lower() != 'failed':
265 print('! Not a failed action (%s).' % content
['Action'])
268 status
= content
['Status']
269 diag_code
= content
['Diagnostic-Code']
271 # Permanent failure state
272 if status
and int(status
[:1]) == 5:
275 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
276 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
280 "insufficient system storage",
282 "requested action aborted: local error in processing",
283 "sender address rejected",
286 if status
and 'quota' in status
.lower():
288 if diag_code
is not None:
289 ldiag_code
= diag_code
.lower()
290 if any(hint
in ldiag_code
for hint
in failure_hints
):
293 print('! Not a permanent failure status (%s).' % status
)
294 if diag_code
is not None:
295 print('! Diagnostic code was: %s' % diag_code
)
299 def findAddressInPlainBounce(bounce
, real_bounce
=None):
300 """Finds the faulty email address in a non-RFC-1894 bounced email
302 # real_bounce is the full email and bounce only the text/plain part, if email have several MIME parts
303 real_bounce
= real_bounce
or bounce
304 lower_from
= real_bounce
['From'].lower()
305 if 'mailer-daemon@' not in lower_from
and 'postmaster' not in lower_from
:
306 print('! Not a valid plain bounce (expected from MAILER-DAEMON or postmaster, found %s).' % bounce
['From'])
308 if bounce
.get_content_type() != 'text/plain':
309 print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce
.get_content_type())
311 subject
= findSubject(real_bounce
).lower()
313 "delivery status notification (failure)",
315 "mail delivery failure",
316 "returned mail: see transcript for details",
317 "undeliverable message",
318 "undelivered mail returned to sender",
320 if subject
not in known_subjects
and not subject
.startswith('mail delivery failed'):
321 print('! Not a valid plain bounce (unknown subject: %s).' % subject
)
324 # Read the 15 first lines of content and find some relevant keywords to validate the bounce
325 lines
= bounce
.get_payload().splitlines()[:15]
327 # ALTOSPAM is a service which requires to click on a link when sending an email
328 # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
329 # but put this message in the dsn-temp mailbox so that it can be processed by hand.
330 if any("ALTOSPAM which is used by the person" in line
for line
in lines
):
331 print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
335 # A message that you sent could not be delivered to one or more of its recipients.
336 # I'm afraid I wasn't able to deliver your message to the following addresses.
337 # The following message to <email@example.com> was undeliverable.
338 non_delivery_hints
= [
339 "could not be delivered to",
340 "Delivery to the following recipient failed permanently",
341 "I'm sorry to have to inform you that your message could not",
342 "I wasn't able to deliver your message",
343 "try to send your message again at a later time",
344 "> was undeliverable.",
345 "we were unable to deliver your message",
347 if not any(any(hint
in line
for hint
in non_delivery_hints
) for line
in lines
):
348 print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
349 print('\n'.join(lines
))
353 # This is a permanent error; I've given up. Sorry it didn't work out.
354 # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
355 permanent_error_hints
= [
356 "Delivery to the following recipient failed permanently",
357 "failed due to an unavailable mailbox",
358 "I'm sorry to have to inform you that your message could not",
359 "This is a permanent error",
360 "Unknown address error",
361 "unreachable for too long",
362 "550 Requested action not taken",
364 if not any(any(hint
in line
for hint
in permanent_error_hints
) for line
in lines
):
365 print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
366 print('\n'.join(lines
))
369 # Retrieve the first occurrence of <email@example.com>
371 match
= re
.match(r
'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line
)
373 match
= re
.match(r
'^\s*"?([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)"?\s*$', line
)
374 if match
is not None:
375 email
= match
.group(1)
376 if email
.endswith('@polytechnique.org'):
377 # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
381 print('! Unknown mailer-daemon message, unable to find email address:')
382 print('\n'.join(lines
))
385 #----------------------------------------------------------------------------#
387 class DirectBouncesFilter(MboxFilter
):
389 def initialize(self
, mbox_file
):
391 self
.bad_problems
= 0
393 self
.mbox_file
= '%s.bounced' % mbox_file
394 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
397 def process(self
, message
):
398 if message
['X-Spam-Flag'] is None:
399 # During finalization, we will verify that all messages were processed
401 # Special case: ignore mailman notifications for the mailing-list
402 # on which the NL is forwarded
403 if message
['From'] == 'newsletter-externes-owner@polytechnique.org':
404 print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
407 # Additional checks, just to be sure
408 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
409 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
410 print('! Not an usual direct bounce (From="%s", Subject="%s").' %
(message
['From'], message
['Subject']))
412 email
= findAddressInBounce(message
)
413 if email
is not None:
414 self
.emails
.append(email
)
415 self
.mbox
.add(message
)
418 print('! => No email found in direct bounce, this is really bad.')
419 self
.bad_problems
+= 1
423 print('Found %d messages with no X-Spam-Flag header.' % self
.seen
)
424 print('Found %d of them that are confirmed bounces.' %
len(self
.mbox
))
425 print('They were saved in %s.' % self
.mbox_file
)
426 if self
.bad_problems
:
427 print('Found %d of them that are invalid.' % self
.bad_problems
)
428 if self
.seen
!= len(self
.mbox
) + self
.bad_problems
:
429 print(' /!\ These numbers shoud be equal! We have a problem! /!\\')
431 print('Here is the list of email adresses for these bounces:')
433 for email
in self
.emails
:
438 #----------------------------------------------------------------------------#
440 class SpamFilter(MboxFilter
):
442 def initialize(self
, mbox_file
):
443 self
.mbox_file
= '%s.spam' % mbox_file
444 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
447 def process(self
, message
):
448 if message
['X-Spam-Flag'] is not None \
449 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
450 self
.mbox
.add(message
)
455 print('Found %d spams. This is reliable.' %
len(self
.mbox
))
456 print('They were saved in %s.' % self
.mbox_file
)
457 print('You might check the contents of this mbox.')
460 #----------------------------------------------------------------------------#
462 class UnsureFilter(MboxFilter
):
464 def initialize(self
, mbox_file
):
465 self
.mbox_file
= '%s.unsure' % mbox_file
466 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
469 def process(self
, message
):
470 if message
['X-Spam-Flag'] is not None \
471 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
472 self
.mbox
.add(message
)
477 print('Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
))
478 print('They were saved in %s.' % self
.mbox_file
)
479 print('You must check the contents of this mbox and feed the antispam.')
482 #----------------------------------------------------------------------------#
484 class CheckNonSpamFilter(MboxFilter
):
486 def initialize(self
, mbox_file
):
489 def process(self
, message
):
490 if message
['X-Spam-Flag'] is None \
491 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
497 print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
)
498 print('Please investigate.')
500 print('All messages were either spam, or unsure, or non-spams. Good.')
502 #----------------------------------------------------------------------------#
504 class OutOfOfficeFilter(MboxFilter
):
506 def initialize(self
, mbox_file
):
507 self
.mbox_file
= '%s.ooo' % mbox_file
508 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
512 r
'^(AUTO: )?Out of (the )?office',
515 r
'^Automatic reply: ',
516 r
'automatique d\'absence
',
519 r'^En dehors du bureau
',
521 r'I am currently away
',
522 r'(am|
is) out
of (the
)?office
',
523 r'Notification d
\'absence
',
524 r'^Out of email reach
',
525 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
526 r'^Respuesta de Estoy ausente
:',
528 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
530 def process(self, message):
531 subject = findSubject(message)
532 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
533 self.mbox.add(message)
536 # Some systems reply with "Re: ". Be smart here!
537 if subject is not None and subject.startswith('Re
: '):
538 # Delivered-To: Autoresponder
539 if 'Autoresponder
' in message.get_all('Delivered
-To
'):
540 self.mbox.add(message)
542 # Parse content if it is simple enough
543 if message.get_content_type() == 'text
/plain
':
544 firstline = message.get_payload().splitlines()[0].lower()
545 if (' absent du bureau
' in firstline
546 or ' away
from my office
' in firstline):
547 self.mbox.add(message)
553 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
554 print('They were saved
in %s
.' % self.mbox_file)
555 print('You may check the contents of this mbox
.')
558 #----------------------------------------------------------------------------#
560 class DeliveryStatusNotificationFilter(MboxFilter):
562 def initialize(self, mbox_file):
564 self.mbox_file = '%s
.dsn
' % mbox_file
565 self.mbox = mailbox.mbox(self.mbox_file)
567 self.mbox_temp_file = '%s
.dsn
-temp
' % mbox_file
568 self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
569 self.mbox_temp.clear()
571 def process(self, message):
572 # Don't modify message variable
for "self.mbox.add(message)"
573 report_message
= message
574 # Find real report inside attachment
575 if message
.get_content_type() == 'multipart/mixed':
576 # Some MTA confuse multipart/mixed with multipart/report
577 # Let's try to find a report!
578 if len(message
.get_payload()) >= 2:
579 try_status
= message
.get_payload(1)
580 if try_status
.get_content_type() == 'message/delivery-status':
581 # The world would be a nice place if delivery-status were
582 # formatted as expected...
583 email
= findAddressInWeirdDeliveryStatus(try_status
)
584 if email
is not None:
585 self
.emails
.append(email
)
586 self
.mbox
.add(message
)
589 report_message
= message
.get_payload(0)
591 # Process report if its type is correct
592 if report_message
.get_content_type() == 'multipart/report':
593 email
= findAddressInBounce(report_message
)
594 if email
is not None:
595 self
.emails
.append(email
)
596 self
.mbox
.add(message
)
598 print("! => Moved to temporary DSN mailbox")
599 self
.mbox_temp
.add(message
)
602 # Detect ill-formatted reports, sent as plain text email
603 if report_message
.get_content_type() == 'text/plain' and (
604 'MAILER-DAEMON@' in message
['From'].upper() or
605 'mail delivery failure' == message
['Subject'].lower()
607 email
= findAddressInPlainBounce(report_message
)
608 if email
is not None:
609 self
.emails
.append(email
)
610 self
.mbox
.add(message
)
615 print('Found %d delivery status notifications. This is generally reliable.' %
len(self
.mbox
))
616 print('They were saved in %s.' % self
.mbox_file
)
618 print('Here is the list of email adresses for these bounces:')
620 for email
in self
.emails
:
624 print('Found %d temporary and invalid delivery status notifications.' %
len(self
.mbox_temp
))
625 print('They were saved in %s.' % self
.mbox_temp_file
)
626 self
.mbox_temp
.close()
628 #----------------------------------------------------------------------------#
630 class CatchAllFilter(MboxFilter
):
632 def initialize(self
, mbox_file
):
633 self
.mbox_file
= '%s.catchall' % mbox_file
634 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
637 def process(self
, message
):
638 self
.mbox
.add(message
)
642 if len(self
.mbox
) > 0:
643 print('%d messages reached the catchall.' %
len(self
.mbox
))
644 print('They were saved in %s.' % self
.mbox_file
)
645 print('You must process the contents of this mbox manually.')
648 print('No messages reached the catchall. Nice.')
650 os
.unlink(self
.mbox_file
)
652 #----------------------------------------------------------------------------#
654 if __name__
== '__main__':
656 if len(sys
.argv
) != 2:
657 print('Usage: %s mbox' % sys
.argv
[0])
660 if not os
.path
.exists(sys
.argv
[1]):
661 print('No such file: %s' % sys
.argv
[1])
664 processor
= MboxProcessor(sys
.argv
[1])