2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2014 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
47 def __init__(self
, mbox
):
49 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
51 DirectBouncesFilter(),
56 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepare every filter for a run and start the processing timer.

    Calls initialize(self.mbox_file) on each filter of self.filters, in
    order, then stores the current timer reading in self.start_time."""
    for mbox_filter in self.filters:
        mbox_filter.initialize(self.mbox_file)
    # time.clock() was removed in Python 3.8.  Keep using it where it still
    # exists, so that the reading stays comparable with the other
    # time.clock() readings taken in this file, and fall back to
    # time.perf_counter() only when it is gone.
    timer = getattr(time, 'clock', None) or time.perf_counter
    self.start_time = timer()
def apply_filters(self, message):
    """Run the message through the filters, in order, until one claims it.

    Returns True as soon as the process() method of a filter returns a true
    value (the remaining filters are not called for that message), and
    False when no filter does."""
    for mbox_filter in self.filters:
        if mbox_filter.process(message):
            return True
    return False
67 def finalize_filters(self
):
68 duration
= time
.clock() - self
.start_time
71 print('Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
))
73 for f
in self
.filters
:
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
87 #----------------------------------------------------------------------------#
90 """Defines an interface for filters."""
92 def initialize(self
, mbox_file
):
93 """Called by the processor before processing starts.
95 This is the place to open descriptors required during processing."""
98 def process(self
, message
):
99 """Called by the processor for each message that reaches this step.
101 Return true to stop processing, and false to go to the next filter."""
105 """Called by the processor after processing ends.
107 This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.  Encoded words are
    decoded; parts that do not declare a charset are read as UTF-8.  A
    subject whose declared charset is unknown, or whose bytes are not valid
    in that charset, is decoded leniently (UTF-8 with replacement
    characters) instead of raising, so that one malformed message does not
    abort the processing of the whole mbox."""
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = [(subj, enc or 'utf-8')
                   for subj, enc in email.header.decode_header(raw_subject)]
    # Be Python 2 & 3 compatible: this is unicode on Python 2, str on Python 3
    text_type = type(u'')
    try:
        return text_type(email.header.make_header(decoded_seq))
    except (LookupError, UnicodeError):
        # make_header decodes strictly: LookupError means the charset name is
        # unknown, UnicodeError means the bytes do not match the charset.
        return u''.join(
            subj if isinstance(subj, text_type) else subj.decode('utf-8', 'replace')
            for subj, _ in decoded_seq)
# A recipient field looks like "rfc822; user@example.com".  Whitespace around
# the address is not part of it, so it is kept out of group 1.
_recipient_re = re.compile(r'^rfc822;\s*(.+?)\s*$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local;\s*(.+?)\s*$', re.I | re.U)
130 def findAddressInBounce(bounce
):
131 """Finds the faulty email address in a bounced email.
133 See RFC 1894 for more information.
134 Returns None or the email address."""
136 # Check that it is a bounce - a few MTA fail to set this correctly :(
137 if bounce
.get_content_type() != 'multipart/report':
138 print('! Not a valid bounce (expected multipart/report, found %s).' % bounce
.get_content_type())
140 # Extract the second component of the multipart/report
141 num_payloads
= len(bounce
.get_payload())
143 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
145 status
= bounce
.get_payload(1)
147 # If the second part is of type "message/rfc822" it is the undelivered message.
148 # Let's try to understand the text part
149 if status
.get_content_type() == 'message/rfc822':
150 text_bounce
= bounce
.get_payload(0)
151 if text_bounce
.get_content_type() == 'text/plain':
152 return findAddressInPlainBounce(text_bounce
, bounce
)
153 # If it's not a text message, let's continue to the next error message
155 if status
.get_content_type() != 'message/delivery-status':
156 print('! Not a valid bounce (expected message/delivery-status, found %s).' % status
.get_content_type())
158 # The per-message-fields don't matter here, get only the per-recipient-fields
159 num_payloads
= len(status
.get_payload())
161 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
163 content
= status
.get_payload(1)
164 if content
.get_content_type() != 'text/plain':
165 print('! Not a valid bounce (expected text/plain, found %s).' % content
.get_content_type())
167 # Extract the faulty email address
168 # Some MTA don't set Final-Recipient but use Remote-Recipient instead
169 if 'Final-Recipient' in content
:
170 final_recipient
= content
['Final-Recipient']
171 elif 'Remote-Recipient' in content
:
172 final_recipient
= content
['Remote-Recipient']
174 print('! Not a valid bounce (no Final-Recipient).')
176 recipient_match
= _recipient_re
.search(final_recipient
)
177 if recipient_match
is None:
178 # Be nice, test another regexp
179 recipient_match
= _recipient_re2
.search(final_recipient
)
180 if recipient_match
is None:
181 print('! Missing final recipient.')
183 email
= recipient_match
.group(1)
184 # Check the action field
185 if content
['Action'].lower().strip() != 'failed':
186 print('! Not a failed action (%s).' % content
['Action'])
189 status
= content
['Status']
190 diag_code
= content
['Diagnostic-Code']
192 # Permanent failure state
193 if int(status
[:1]) == 5:
196 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
197 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
201 "insufficient system storage",
203 "mailbox recipient does not have a mailbox database",
205 "requested action aborted: local error in processing",
208 if 'quota' in status
.lower():
210 if diag_code
is not None:
211 ldiag_code
= diag_code
.lower()
212 if any(hint
in ldiag_code
for hint
in failure_hints
):
215 print('! Not a permanent failure status (%s).' % status
)
216 if diag_code
is not None:
217 print('! Diagnostic code was: %s' % diag_code
)
221 def findAddressInWeirdDeliveryStatus(message
):
222 """Finds the faulty email address in the delivery-status part of an email
224 Unlikely to findAddressInBounce, the status does NOT follow RFC 1894, so
225 try to learn to get data nevertheless...
226 Returns None or the email address.
228 if message
.get_content_type() != 'message/delivery-status':
229 print('! Not a valid weird bounce (expected message/delivery-status, found %s).' % message
.get_content_type())
231 # The per-message-fields don't matter here, get only the per-recipient-fields
232 num_payloads
= len(message
.get_payload())
234 print('! Not a valid weird bounce (expected at least 2 parts, found %d).' % num_payloads
)
236 content
= message
.get_payload(1)
237 # The content may be missing, but interesting headers still present in the first payload...
239 content
= message
.get_payload(0)
240 if 'Action' not in content
:
241 print('! Not a valid weird bounce (unable to find content).')
243 elif content
.get_content_type() != 'text/plain':
244 print('! Not a valid weird bounce (expected text/plain, found %s).' % content
.get_content_type())
247 # Extract the faulty email address
248 if 'Final-Recipient' in content
:
249 recipient_match
= _recipient_re
.search(content
['Final-Recipient'])
250 if recipient_match
is None:
251 # Be nice, test another regexp
252 recipient_match
= _recipient_re2
.search(content
['Final-Recipient'])
253 if recipient_match
is None:
254 print('! Unknown final recipient in weird bounce.')
256 email
= recipient_match
.group(1)
257 elif 'Original-Recipient' in content
:
258 recipient
= content
['Original-Recipient']
259 recipient_match
= _recipient_re
.search(recipient
)
260 if recipient_match
is None:
261 # Be nice, test another regexp
262 recipient_match
= _recipient_re2
.search(recipient
)
263 if recipient_match
is None:
264 recipient_match
= re
.match(r
'<([^>]+@[^@>]+)>', recipient
)
265 if recipient_match
is None:
266 print('! Unknown original recipient in weird bounce.')
268 email
= recipient_match
.group(1)
270 print('! Missing recipient in weird bounce.')
273 # Check the action field
274 if content
['Action'].lower() != 'failed':
275 print('! Not a failed action (%s).' % content
['Action'])
278 status
= content
['Status']
279 diag_code
= content
['Diagnostic-Code']
281 # Permanent failure state
282 if status
and int(status
[:1]) == 5:
285 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
286 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
290 "insufficient system storage",
292 "requested action aborted: local error in processing",
293 "sender address rejected",
296 if status
and 'quota' in status
.lower():
298 if diag_code
is not None:
299 ldiag_code
= diag_code
.lower()
300 if any(hint
in ldiag_code
for hint
in failure_hints
):
303 print('! Not a permanent failure status (%s).' % status
)
304 if diag_code
is not None:
305 print('! Diagnostic code was: %s' % diag_code
)
309 def findAddressInPlainBounce(bounce
, real_bounce
=None):
310 """Finds the faulty email address in a non-RFC-1894 bounced email
312 # real_bounce is the full email and bounce only the text/plain part, if email have several MIME parts
313 real_bounce
= real_bounce
or bounce
314 lower_from
= real_bounce
['From'].lower()
315 if 'mailer-daemon@' not in lower_from
and 'postmaster' not in lower_from
:
316 print('! Not a valid plain bounce (expected from MAILER-DAEMON or postmaster, found %s).' % bounce
['From'])
318 if bounce
.get_content_type() != 'text/plain':
319 print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce
.get_content_type())
321 subject
= findSubject(real_bounce
).lower()
323 "delivery status notification (failure)",
325 "mail delivery failure",
326 "returned mail: see transcript for details",
327 "undeliverable message",
328 "undelivered mail returned to sender",
330 if subject
not in known_subjects
and not subject
.startswith('mail delivery failed'):
331 print('! Not a valid plain bounce (unknown subject: %s).' % subject
)
334 # Read the 15 first lines of content and find some relevant keywords to validate the bounce
335 lines
= bounce
.get_payload().splitlines()[:15]
337 # ALTOSPAM is a service which requires to click on a link when sending an email
338 # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
339 # but put this message in the dsn-temp mailbox so that it can be processed by hand.
340 if any("ALTOSPAM which is used by the person" in line
for line
in lines
):
341 print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
345 # A message that you sent could not be delivered to one or more of its recipients.
346 # I'm afraid I wasn't able to deliver your message to the following addresses.
347 # The following message to <email@example.com> was undeliverable.
348 non_delivery_hints
= [
349 "could not be delivered to",
350 "Delivery to the following recipient failed permanently",
351 "I'm sorry to have to inform you that your message could not",
352 "I wasn't able to deliver your message",
353 "try to send your message again at a later time",
354 "User unknown in local recipient table",
355 "> was undeliverable.",
356 "we were unable to deliver your message",
358 if not any(any(hint
in line
for hint
in non_delivery_hints
) for line
in lines
):
359 print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
360 print('\n'.join(lines
))
364 # This is a permanent error; I've given up. Sorry it didn't work out.
365 # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
366 permanent_error_hints
= [
367 "Delivery to the following recipient failed permanently",
368 "failed due to an unavailable mailbox",
369 "following addresses had permanent fatal errors",
370 "I'm sorry to have to inform you that your message could not",
371 "The email account that you tried to reach does not exist",
372 "This is a permanent error",
373 "Unknown address error",
374 "unreachable for too long",
375 "550 Requested action not taken",
377 if not any(any(hint
in line
for hint
in permanent_error_hints
) for line
in lines
):
378 print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
379 print('\n'.join(lines
))
382 # Retrieve the first occurence of <email@example.com>
384 match
= re
.match(r
'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line
)
386 match
= re
.match(r
'^\s*"?([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)"?\s*$', line
)
387 if match
is not None:
388 email
= match
.group(1)
389 if email
.endswith('@polytechnique.org'):
390 # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
394 print('! Unknown mailer-daemon message, unable to find email address:')
395 print('\n'.join(lines
))
398 #----------------------------------------------------------------------------#
400 class DirectBouncesFilter(MboxFilter
):
402 def initialize(self
, mbox_file
):
404 self
.bad_problems
= 0
406 self
.mbox_file
= '%s.bounced' % mbox_file
407 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
410 def process(self
, message
):
411 if message
['X-Spam-Flag'] is None:
412 # During finalization, we will verifiy that all messages were processed
414 # Special case: ignore mailman notifications for the mailing-list
415 # on which the NL is forwarded
416 if message
['From'] == 'newsletter-externes-owner@polytechnique.org':
417 print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
420 # Additionnal checks, just to be sure
421 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
422 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
423 print('! Not an usual direct bounce (From=%r, Subject=%r).' %
(message
['From'], message
['Subject']))
425 email
= findAddressInBounce(message
)
426 if email
is not None:
427 self
.emails
.append(email
)
428 self
.mbox
.add(message
)
431 print('! => No email found in direct bounce, this is really bad.')
432 self
.bad_problems
+= 1
436 print('Found %d messages with no X-Spam-Flag header.' % self
.seen
)
437 print('Found %d of them that are confirmed bounces.' %
len(self
.mbox
))
438 print('They were saved in %s.' % self
.mbox_file
)
439 if self
.bad_problems
:
440 print('Found %d of them that are invalid.' % self
.bad_problems
)
441 if self
.seen
!= len(self
.mbox
) + self
.bad_problems
:
442 print(' /!\ These numbers shoud be equal! We have a problem! /!\\')
444 print('Here is the list of email adresses for these bounces:')
446 for email
in self
.emails
:
451 #----------------------------------------------------------------------------#
453 class SpamFilter(MboxFilter
):
455 def initialize(self
, mbox_file
):
456 self
.mbox_file
= '%s.spam' % mbox_file
457 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
460 def process(self
, message
):
461 if message
['X-Spam-Flag'] is not None \
462 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
463 self
.mbox
.add(message
)
468 print('Found %d spams. This is reliable.' %
len(self
.mbox
))
469 print('They were saved in %s.' % self
.mbox_file
)
470 print('You might check the contents of this mbox.')
473 #----------------------------------------------------------------------------#
475 class UnsureFilter(MboxFilter
):
477 def initialize(self
, mbox_file
):
478 self
.mbox_file
= '%s.unsure' % mbox_file
479 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
482 def process(self
, message
):
483 if message
['X-Spam-Flag'] is not None \
484 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
485 self
.mbox
.add(message
)
490 print('Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
))
491 print('They were saved in %s.' % self
.mbox_file
)
492 print('You must check the contents of this mbox and feed the antispam.')
495 #----------------------------------------------------------------------------#
497 class CheckNonSpamFilter(MboxFilter
):
499 def initialize(self
, mbox_file
):
502 def process(self
, message
):
503 if message
['X-Spam-Flag'] is None \
504 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
510 print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
)
511 print('Please investigate.')
513 print('All messages were either spam, or unsure, or non-spams. Good.')
515 #----------------------------------------------------------------------------#
517 class OutOfOfficeFilter(MboxFilter
):
519 def initialize(self
, mbox_file
):
520 self
.mbox_file
= '%s.ooo' % mbox_file
521 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
525 r
'^(AUTO: )?Out of (the )?office',
528 r
'^Automatic reply: ',
529 r
'automatique d\'absence
',
532 r'^En dehors du bureau
',
534 r'I am currently away
',
535 r'(am|
is) out
of (the
)?office
',
536 r'Notification d
\'absence
',
537 r'^Out of email reach
',
538 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
539 r'^Respuesta de Estoy ausente
:',
541 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
543 def process(self, message):
544 subject = findSubject(message)
545 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
546 self.mbox.add(message)
549 # Some systems reply with "Re: ". Be smart here!
550 if subject is not None and subject.startswith('Re
: '):
551 # Delivered-To: Autoresponder
552 if 'Autoresponder
' in message.get_all('Delivered
-To
'):
553 self.mbox.add(message)
555 # Parse content if it is simple enough
556 if message.get_content_type() == 'text
/plain
':
557 firstline = message.get_payload().splitlines()[0].lower()
558 if (' absent du bureau
' in firstline
559 or ' away
from my office
' in firstline):
560 self.mbox.add(message)
566 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
567 print('They were saved
in %s
.' % self.mbox_file)
568 print('You may check the contents of this mbox
.')
571 #----------------------------------------------------------------------------#
573 class DeliveryStatusNotificationFilter(MboxFilter):
575 def initialize(self, mbox_file):
577 self.mbox_file = '%s
.dsn
' % mbox_file
578 self.mbox = mailbox.mbox(self.mbox_file)
580 self.mbox_temp_file = '%s
.dsn
-temp
' % mbox_file
581 self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
582 self.mbox_temp.clear()
584 def process(self, message):
585 # Don't modify message variable
for "self.mbox.add(message)"
586 report_message
= message
587 # Find real report inside attachment
588 if message
.get_content_type() == 'multipart/mixed':
589 # Some MTA confuse multipart/mixed with multipart/report
590 # Let's try to find a report!
591 if len(message
.get_payload()) >= 2:
592 try_status
= message
.get_payload(1)
593 if try_status
.get_content_type() == 'message/delivery-status':
594 # The world would be a nice place if delivery-status were
595 # formatted as expected...
596 email
= findAddressInWeirdDeliveryStatus(try_status
)
597 if email
is not None:
598 self
.emails
.append(email
)
599 self
.mbox
.add(message
)
602 report_message
= message
.get_payload(0)
604 # Process report if its type is correct
605 if report_message
.get_content_type() == 'multipart/report':
606 email
= findAddressInBounce(report_message
)
607 if email
is not None:
608 self
.emails
.append(email
)
609 self
.mbox
.add(message
)
611 print("! => Moved to temporary DSN mailbox")
612 self
.mbox_temp
.add(message
)
615 # Detect ill-formatted reports, sent as plain text email
616 if report_message
.get_content_type() == 'text/plain' and (
617 'MAILER-DAEMON@' in message
.get('From', '').upper() or
618 'mail delivery failure' == message
.get('Subject', '').lower()
620 email
= findAddressInPlainBounce(report_message
)
621 if email
is not None:
622 self
.emails
.append(email
)
623 self
.mbox
.add(message
)
628 print('Found %d delivery status notifications. This is generally reliable.' %
len(self
.mbox
))
629 print('They were saved in %s.' % self
.mbox_file
)
631 print('Here is the list of email adresses for these bounces:')
633 for email
in self
.emails
:
637 print('Found %d temporary and invalid delivery status notifications.' %
len(self
.mbox_temp
))
638 print('They were saved in %s.' % self
.mbox_temp_file
)
639 self
.mbox_temp
.close()
641 #----------------------------------------------------------------------------#
643 class CatchAllFilter(MboxFilter
):
645 def initialize(self
, mbox_file
):
646 self
.mbox_file
= '%s.catchall' % mbox_file
647 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
650 def process(self
, message
):
651 self
.mbox
.add(message
)
655 if len(self
.mbox
) > 0:
656 print('%d messages reached the catchall.' %
len(self
.mbox
))
657 print('They were saved in %s.' % self
.mbox_file
)
658 print('You must process the contents of this mbox manually.')
661 print('No messages reached the catchall. Nice.')
663 os
.unlink(self
.mbox_file
)
665 #----------------------------------------------------------------------------#
667 if __name__
== '__main__':
669 if len(sys
.argv
) != 2:
670 print('Usage: %s mbox' % sys
.argv
[0])
673 if not os
.path
.exists(sys
.argv
[1]):
674 print('No such file: %s' % sys
.argv
[1])
677 processor
= MboxProcessor(sys
.argv
[1])