2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
47 def __init__(self
, mbox
):
49 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
51 DirectBouncesFilter(),
56 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepare every filter for a run and start the processing timer.

    Calls ``initialize(self.mbox_file)`` on each filter in ``self.filters``,
    in order, then stores the current timer value in ``self.start_time``.
    """
    for mbox_filter in self.filters:
        mbox_filter.initialize(self.mbox_file)

    # time.clock() was removed in Python 3.8.  Keep using it where it still
    # exists, so that start_time stays comparable with code that reads
    # time.clock() later, and fall back to time.perf_counter() otherwise.
    timer = getattr(time, 'clock', None) or time.perf_counter
    self.start_time = timer()
def apply_filters(self, message):
    """Run ``message`` through the filters until one of them claims it.

    Each filter in ``self.filters`` has its ``process(message)`` method
    called in order.  Evaluation stops at the first filter whose result is
    true, so later filters never see a message that was already handled.

    Returns True if some filter returned a true value, False otherwise
    (including when there are no filters).
    """
    return any(mbox_filter.process(message) for mbox_filter in self.filters)
67 def finalize_filters(self
):
68 duration
= time
.clock() - self
.start_time
71 print('Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
))
73 for f
in self
.filters
:
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
87 #----------------------------------------------------------------------------#
90 """Defines an interface for filters."""
92 def initialize(self
, mbox_file
):
93 """Called by the processor before processing starts.
95 This is the place to open descriptors required during processing."""
98 def process(self
, message
):
99 """Called by the processor for each message that reaches this step.
101 Return true to stop processing, and false to go to the next filter."""
105 """Called by the processor after processing ends.
107 This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Return the decoded subject of an email.Message as a text string.

    RFC 2047 encoded-words in the Subject header are decoded; parts that do
    not declare a charset are treated as UTF-8.  The result is ``unicode``
    on Python 2 and ``str`` on Python 3.

    Returns None when the message has no Subject header.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible: only the selected branch is evaluated, so
    # the name "unicode" is never looked up on Python 3.
    text_type = unicode if sys.version_info < (3,) else str

    # decode_header returns a list of (decoded_string, charset) pairs
    parts = [(chunk, charset or 'utf-8')
             for chunk, charset in email.header.decode_header(raw_subject)]
    try:
        return text_type(email.header.make_header(parts))
    except (LookupError, UnicodeError):
        # An unknown charset name or bytes that are invalid for the declared
        # charset make make_header() raise.  Degrade to a lossy UTF-8
        # decoding instead of aborting the processing of the whole mailbox.
        return text_type().join(
            chunk if isinstance(chunk, text_type)
            else chunk.decode('utf-8', 'replace')
            for chunk, _ in parts)
# A Final-Recipient field has the form "address-type; address".  Group 1 is
# the address, with the whitespace around it (including a folded line break
# after the semicolon) left out of the capture.
_recipient_re = re.compile(r'^rfc822;\s*(.+?)\s*$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local;\s*(.+?)\s*$', re.I | re.U)
130 def findAddressInBounce(bounce
):
131 """Finds the faulty email address in a bounced email.
133 See RFC 1894 for more information.
134 Returns None or the email address."""
136 # Check that it is a bounce - a few MTA fail to set this correctly :(
137 if bounce
.get_content_type() != 'multipart/report':
138 print('! Not a valid bounce (expected multipart/report, found %s).' % bounce
.get_content_type())
140 # Extract the second component of the multipart/report
141 num_payloads
= len(bounce
.get_payload())
143 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
145 status
= bounce
.get_payload(1)
147 # If the second part is of type "message/rfc822" it is the undelivered message.
148 # Let's try to understand the text part
149 if status
.get_content_type() == 'message/rfc822':
150 text_bounce
= bounce
.get_payload(0)
151 if text_bounce
.get_content_type() == 'text/plain':
152 return findAddressInPlainBounce(text_bounce
, bounce
)
153 # If it's not a text message, let's continue to the next error message
155 if status
.get_content_type() != 'message/delivery-status':
156 print('! Not a valid bounce (expected message/delivery-status, found %s).' % status
.get_content_type())
158 # The per-message-fields don't matter here, get only the per-recipient-fields
159 num_payloads
= len(status
.get_payload())
161 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
163 content
= status
.get_payload(1)
164 if content
.get_content_type() != 'text/plain':
165 print('! Not a valid bounce (expected text/plain, found %s).' % content
.get_content_type())
167 # Extract the faulty email address
168 recipient_match
= _recipient_re
.search(content
['Final-Recipient'])
169 if recipient_match
is None:
170 # Be nice, test another regexp
171 recipient_match
= _recipient_re2
.search(content
['Final-Recipient'])
172 if recipient_match
is None:
173 print('! Missing final recipient.')
175 email
= recipient_match
.group(1)
176 # Check the action field
177 if content
['Action'].lower() != 'failed':
178 print('! Not a failed action (%s).' % content
['Action'])
181 status
= content
['Status']
182 diag_code
= content
['Diagnostic-Code']
184 # Permanent failure state
185 if int(status
[:1]) == 5:
188 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
189 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
193 "insufficient system storage",
197 if 'quota' in status
.lower():
199 if diag_code
is not None:
200 ldiag_code
= diag_code
.lower()
201 if any(hint
in ldiag_code
for hint
in failure_hints
):
204 print('! Not a permanent failure status (%s).' % status
)
205 if diag_code
is not None:
206 print('! Diagnostic code was: %s' % diag_code
)
210 def findAddressInPlainBounce(bounce
, real_bounce
=None):
211 """Finds the faulty email address in a non-RFC-1894 bounced email
213 # real_bounce is the full email and bounce only the text/plain part, if email have several MIME parts
214 real_bounce
= real_bounce
or bounce
215 if 'MAILER-DAEMON@' not in real_bounce
['From'].upper():
216 print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % bounce
['From'])
218 if bounce
.get_content_type() != 'text/plain':
219 print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce
.get_content_type())
221 subject
= findSubject(real_bounce
).lower()
223 "delivery status notification (failure)",
225 "returned mail: see transcript for details",
226 "undeliverable message",
227 "undelivered mail returned to sender",
229 if subject
not in known_subjects
and not subject
.startswith('mail delivery failed'):
230 print('! Not a valid plain bounce (unknown subject: %s).' % subject
)
233 # Read the 15 first lines of content and find some relevant keywords to validate the bounce
234 lines
= bounce
.get_payload().splitlines()[:15]
236 # ALTOSPAM is a service which requires to click on a link when sending an email
237 # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
238 # but put this message in the dsn-temp mailbox so that it can be processed by hand.
239 if any("ALTOSPAM which is used by the person" in line
for line
in lines
):
240 print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
244 # A message that you sent could not be delivered to one or more of its recipients.
245 # I'm afraid I wasn't able to deliver your message to the following addresses.
246 # The following message to <email@example.com> was undeliverable.
247 non_delivery_hints
= [
248 "Delivery to the following recipient failed permanently",
249 "I'm sorry to have to inform you that your message could not",
250 "I wasn't able to deliver your message",
251 "> was undeliverable.",
252 "could not be delivered to",
253 "we were unable to deliver your message",
255 if not any(any(hint
in line
for hint
in non_delivery_hints
) for line
in lines
):
256 print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
257 print('\n'.join(lines
))
261 # This is a permanent error; I've given up. Sorry it didn't work out.
262 # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
263 permanent_error_hints
= [
264 "Delivery to the following recipient failed permanently",
265 "I'm sorry to have to inform you that your message could not",
266 "This is a permanent error",
267 "Unknown address error",
268 "550 Requested action not taken",
270 if not any(any(hint
in line
for hint
in permanent_error_hints
) for line
in lines
):
271 print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
272 print('\n'.join(lines
))
275 # Retrieve the first occurence of <email@example.com>
277 match
= re
.match(r
'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line
)
279 match
= re
.match(r
'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$', line
)
280 if match
is not None:
281 email
= match
.group(1)
282 if email
.endswith('@polytechnique.org'):
283 # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
287 print('! Unknown mailer-daemon message, unable to find email address:')
288 print('\n'.join(lines
))
291 #----------------------------------------------------------------------------#
293 class DirectBouncesFilter(MboxFilter
):
295 def initialize(self
, mbox_file
):
297 self
.bad_problems
= 0
299 self
.mbox_file
= '%s.bounced' % mbox_file
300 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
303 def process(self
, message
):
304 if message
['X-Spam-Flag'] is None:
305 # During finalization, we will verifiy that all messages were processed
307 # Special case: ignore mailman notifications for the mailing-list
308 # on which the NL is forwarded
309 if message
['From'] == 'newsletter-externes-bounces@polytechnique.org':
310 print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
313 # Additionnal checks, just to be sure
314 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
315 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
316 print('! Not an usual direct bounce (From="%s", Subject="%s").' %
(message
['From'], message
['Subject']))
318 email
= findAddressInBounce(message
)
319 if email
is not None:
320 self
.emails
.append(email
)
321 self
.mbox
.add(message
)
324 print('! => No email found in direct bounce, this is really bad.')
325 self
.bad_problems
+= 1
329 print('Found %d messages with no X-Spam-Flag header.' % self
.seen
)
330 print('Found %d of them that are confirmed bounces.' %
len(self
.mbox
))
331 print('They were saved in %s.' % self
.mbox_file
)
332 if self
.bad_problems
:
333 print('Found %d of them that are invalid.' % self
.bad_problems
)
334 if self
.seen
!= len(self
.mbox
) + self
.bad_problems
:
335 print(' /!\ These numbers shoud be equal! We have a problem! /!\\')
337 print('Here is the list of email adresses for these bounces:')
339 for email
in self
.emails
:
344 #----------------------------------------------------------------------------#
346 class SpamFilter(MboxFilter
):
348 def initialize(self
, mbox_file
):
349 self
.mbox_file
= '%s.spam' % mbox_file
350 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
353 def process(self
, message
):
354 if message
['X-Spam-Flag'] is not None \
355 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
356 self
.mbox
.add(message
)
361 print('Found %d spams. This is reliable.' %
len(self
.mbox
))
362 print('They were saved in %s.' % self
.mbox_file
)
363 print('You might check the contents of this mbox.')
366 #----------------------------------------------------------------------------#
368 class UnsureFilter(MboxFilter
):
370 def initialize(self
, mbox_file
):
371 self
.mbox_file
= '%s.unsure' % mbox_file
372 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
375 def process(self
, message
):
376 if message
['X-Spam-Flag'] is not None \
377 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
378 self
.mbox
.add(message
)
383 print('Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
))
384 print('They were saved in %s.' % self
.mbox_file
)
385 print('You must check the contents of this mbox and feed the antispam.')
388 #----------------------------------------------------------------------------#
390 class CheckNonSpamFilter(MboxFilter
):
392 def initialize(self
, mbox_file
):
395 def process(self
, message
):
396 if message
['X-Spam-Flag'] is None \
397 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
403 print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
)
404 print('Please investigate.')
406 print('All messages were either spam, or unsure, or non-spams. Good.')
408 #----------------------------------------------------------------------------#
410 class OutOfOfficeFilter(MboxFilter
):
412 def initialize(self
, mbox_file
):
413 self
.mbox_file
= '%s.ooo' % mbox_file
414 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
418 r
'^(AUTO: )?Out of (the )?office',
421 r
'^Automatic reply: ',
422 r
'automatique d\'absence
',
426 r'I am currently away
',
427 r'(am|
is) out
of (the
)?office
',
428 r'Notification d
\'absence
',
429 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
431 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
433 def process(self, message):
434 subject = findSubject(message)
435 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
436 self.mbox.add(message)
439 # Some systems reply with "Re: ". Be smart here!
440 if subject is not None and subject.startswith('Re
: '):
441 # Delivered-To: Autoresponder
442 if 'Autoresponder
' in message.get_all('Delivered
-To
'):
443 self.mbox.add(message)
445 # Parse content if it is simple enough
446 if message.get_content_type() == 'text
/plain
':
447 firstline = message.get_payload().splitlines()[0].lower()
448 if (' absent du bureau
' in firstline
449 or ' away
from my office
' in firstline):
450 self.mbox.add(message)
456 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
457 print('They were saved
in %s
.' % self.mbox_file)
458 print('You may check the contents of this mbox
.')
461 #----------------------------------------------------------------------------#
463 class DeliveryStatusNotificationFilter(MboxFilter):
465 def initialize(self, mbox_file):
467 self.mbox_file = '%s
.dsn
' % mbox_file
468 self.mbox = mailbox.mbox(self.mbox_file)
470 self.mbox_temp_file = '%s
.dsn
-temp
' % mbox_file
471 self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
472 self.mbox_temp.clear()
474 def process(self, message):
475 # Don't modify message variable
for "self.mbox.add(message)"
476 report_message
= message
477 # Find real report inside attachment
478 if message
.get_content_type() == 'multipart/mixed':
479 report_message
= message
.get_payload(0)
481 # Process report if its type is correct
482 if report_message
.get_content_type() == 'multipart/report':
483 email
= findAddressInBounce(report_message
)
484 if email
is not None:
485 self
.emails
.append(email
)
486 self
.mbox
.add(message
)
488 print("! => Moved to temporary DSN mailbox")
489 self
.mbox_temp
.add(message
)
492 # Detect ill-formatted reports, sent as plain text email
493 if 'MAILER-DAEMON@' in message
['From'].upper() and report_message
.get_content_type() == 'text/plain':
494 email
= findAddressInPlainBounce(report_message
)
495 if email
is not None:
496 self
.emails
.append(email
)
497 self
.mbox
.add(message
)
502 print('Found %d delivery status notifications. This is generally reliable.' %
len(self
.mbox
))
503 print('They were saved in %s.' % self
.mbox_file
)
505 print('Here is the list of email adresses for these bounces:')
507 for email
in self
.emails
:
511 print('Found %d temporary and invalid delivery status notifications.' %
len(self
.mbox_temp
))
512 print('They were saved in %s.' % self
.mbox_temp_file
)
513 self
.mbox_temp
.close()
515 #----------------------------------------------------------------------------#
517 class CatchAllFilter(MboxFilter
):
519 def initialize(self
, mbox_file
):
520 self
.mbox_file
= '%s.catchall' % mbox_file
521 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
524 def process(self
, message
):
525 self
.mbox
.add(message
)
529 if len(self
.mbox
) > 0:
530 print('%d messages reached the catchall.' %
len(self
.mbox
))
531 print('They were saved in %s.' % self
.mbox_file
)
532 print('You must process the contents of this mbox manually.')
535 print('No messages reached the catchall. Nice.')
537 os
.unlink(self
.mbox_file
)
539 #----------------------------------------------------------------------------#
541 if __name__
== '__main__':
543 if len(sys
.argv
) != 2:
544 print('Usage: %s mbox' % sys
.argv
[0])
547 if not os
.path
.exists(sys
.argv
[1]):
548 print('No such file: %s' % sys
.argv
[1])
551 processor
= MboxProcessor(sys
.argv
[1])