2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
def __init__(self, mbox):
    """Open the mailbox to process and build the ordered chain of filters.

    mbox is the path of the mbox file; it is kept in self.mbox_file and
    handed to every filter by initialize_filters().
    """
    self.mbox_file = mbox
    self.mbox = mailbox.mbox(self.mbox_file)
    # Order matters: apply_filters() stops at the first filter that claims
    # a message.
    # NOTE(review): the listing this was recovered from only shows the
    # DirectBouncesFilter and DeliveryStatusNotificationFilter entries; the
    # other entries and their positions are inferred from the filter
    # classes defined in this file - confirm against the real chain.
    self.filters = [
        DirectBouncesFilter(),
        SpamFilter(),
        UnsureFilter(),
        CheckNonSpamFilter(),
        OutOfOfficeFilter(),
        DeliveryStatusNotificationFilter(),
        CatchAllFilter(),
    ]
def initialize_filters(self):
    """Initialize every filter, then start the processing timer.

    Each filter receives the path of the processed mbox so that it can
    derive its own output file name from it.
    """
    for f in self.filters:
        f.initialize(self.mbox_file)
    # time.clock() was removed in Python 3.8.  Keep using it where it still
    # exists (so the value stays comparable with code that reads the clock
    # the same way) and fall back to perf_counter() everywhere else.
    timer = getattr(time, 'clock', None) or time.perf_counter
    self.start_time = timer()
def apply_filters(self, message):
    """Run message through the filter chain, in order.

    Stops at the first filter whose process() returns a true value and
    returns True; returns False when no filter claimed the message.
    """
    for f in self.filters:
        if f.process(message):
            return True
    return False
def finalize_filters(self):
    """Print the global summary, then let every filter report its results."""
    # time.clock() was removed in Python 3.8; use it only where it exists so
    # the reading stays comparable with a start time taken the same way.
    timer = getattr(time, 'clock', None) or time.perf_counter
    duration = timer() - self.start_time
    # NOTE(review): the listing this was recovered from skips a few lines
    # around this summary (possibly visual separators) and the body of the
    # loop below; only the statements that are visible or directly implied
    # by the filter interface are restored here.
    print('Processed the %d messages of %s in %.2fs' %
          (len(self.mbox), self.mbox_file, duration))
    for f in self.filters:
        f.finalize()
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
#----------------------------------------------------------------------------#

class MboxFilter(object):
    """Defines an interface for filters.

    The default implementation of every hook does nothing, so a subclass
    only overrides the steps it needs.
    """

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing."""

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter."""

    # NOTE(review): the "def" line of this third hook is missing from the
    # listing this was recovered from; the name follows the processor's
    # finalize_filters() - confirm.
    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def _decodeSubjectLeniently(decoded_seq):
    """Best-effort join of (chunk, charset) pairs that never raises on bad charsets."""
    chunks = []
    for subj, enc in decoded_seq:
        if isinstance(subj, bytes):
            try:
                subj = subj.decode(enc, 'replace')
            except LookupError:
                # Unknown charset name: keep whatever is plain ASCII
                subj = subj.decode('ascii', 'replace')
        chunks.append(subj.strip())
    return ' '.join(chunk for chunk in chunks if chunk)


def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.  RFC 2047 encoded
    words are decoded; chunks without a declared charset are read as UTF-8.
    When strict decoding is impossible (unknown charset name, bytes that are
    invalid for the declared charset), the subject is decoded on a
    best-effort basis instead of raising, because bounces and spam routinely
    carry such headers and one of them must not abort the whole run.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = email.header.decode_header(raw_subject)
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
    try:
        header = email.header.make_header(decoded_seq)
        # Be Python 2 & 3 compatible
        return unicode(header) if sys.version_info < (3,) else str(header)
    except (LookupError, UnicodeError, email.errors.MessageError):
        return _decodeSubjectLeniently(decoded_seq)
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)
# Some MTA set the Final-Recipient with "LOCAL;" instead of "rfc822;"
_recipient_re2 = re.compile(r'^local; ?(.+)$', re.I | re.U)


def _subparts(message):
    """Returns the sub-messages of message, or an empty list for a flat payload."""
    payload = message.get_payload()
    return payload if isinstance(payload, list) else []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    The address is returned when the per-recipient fields report a failed
    action together with a permanent (5.x.x) status, an X-Postfix diagnostic,
    a status mentioning a quota, or a diagnostic matching a known failure
    hint.  Every rejection is explained on standard output.  Malformed
    reports (missing fields, flat payloads) are rejected instead of raising.
    """

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = len(_subparts(bounce))
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status_part = bounce.get_payload(1)
    if status_part.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status_part.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(_subparts(status_part))
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status_part.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        # Be nice, test another regexp when the standard one fails
        recipient_match = (_recipient_re.search(final_recipient)
                           or _recipient_re2.search(final_recipient))
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()

    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    status = content['Status'] or ''
    diag_code = content['Diagnostic-Code']

    # Permanent failure state (class 5 status code)
    if status.strip().startswith('5'):
        return address

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    if diag_code is not None and diag_code.startswith('X-Postfix'):
        return address

    # NOTE(review): the listing this was recovered from shows only the first
    # entry of this list; two more entries are missing and must be restored
    # from the real file.
    failure_hints = (
        "insufficient system storage",
    )
    if 'quota' in status.lower():
        return address
    if diag_code is not None:
        ldiag_code = diag_code.lower()
        if any(hint in ldiag_code for hint in failure_hints):
            return address

    print('! Not a permanent failure status (%s).' % status)
    if diag_code is not None:
        print('! Diagnostic code was: %s' % diag_code)
    return None
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email.

    The message must come from a MAILER-DAEMON address, be text/plain, carry
    one of the known failure subjects, and its first 15 lines must contain
    both a non-delivery hint and a permanent-error hint.  The first address
    found in those lines that is not @polytechnique.org is returned.

    Returns None or the email address.  Every rejection is explained on
    standard output; a missing From or Subject header is a rejection, not an
    error.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # findSubject returns None when there is no Subject header at all
    subject = (findSubject(bounce) or '').lower()
    if (subject != 'failure notice'
            and subject != 'undeliverable message'
            and not subject.startswith('mail delivery failed')
            and subject != 'delivery status notification (failure)'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # ALTOSPAM is a service which requires to click on a link when sending an email
    # Don't consider the "554 5.0.0 Service unavailable" returned by ALTOSPAM as a failure
    # but put this message in the dsn-temp mailbox so that it can be processed by hand.
    if any("ALTOSPAM which is used by the person" in line for line in lines):
        print('! ALTOSPAM has been detected. Moving this message to the dsn-temp mbox')
        return None

    # Typical sentences announcing a non-delivery:
    # A message that you sent could not be delivered to one or more of its recipients.
    # I'm afraid I wasn't able to deliver your message to the following addresses.
    # The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Typical sentences announcing a permanent error:
    # This is a permanent error; I've given up. Sorry it didn't work out.
    # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>
    for line in lines:
        match = re.match(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line)
        if match is None:
            # Also accept a line made of nothing but a bare address
            match = re.match(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$', line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                continue
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
275 #----------------------------------------------------------------------------#
class DirectBouncesFilter(MboxFilter):
    """Handles messages without an X-Spam-Flag header as direct bounces.

    Confirmed bounces are saved in "<mbox>.bounced" and their faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # finalize() reports len(self.mbox) as a result of this run and
        # checks it against self.seen, so leftovers of a previous run must go.
        # NOTE(review): this line is not visible in the listing this was
        # recovered from; it mirrors what the DSN filter does for its
        # temporary mailbox - confirm.
        self.mbox.clear()

    def process(self, message):
        if message['X-Spam-Flag'] is not None:
            return False

        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        # NOTE(review): the listing omits the two lines following this
        # message; the notification is dropped here without being counted,
        # which keeps the consistency check of finalize() balanced - confirm.
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True

        # During finalization, we will verify that all messages were processed
        self.seen += 1

        # Additional checks, just to be sure
        if (sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
                or message['Subject'] != 'Undelivered Mail Returned to Sender'):
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))

        address = findAddressInBounce(message)
        if address is not None:
            self.emails.append(address)
            self.mbox.add(message)
            return True

        print('! => No email found in direct bounce, this is really bad.')
        self.bad_problems += 1
        # Let the next filters have a look at this message
        return False

    def finalize(self):
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('Here is the list of email adresses for these bounces:')
        for address in self.emails:
            print(address)
        self.mbox.close()
328 #----------------------------------------------------------------------------#
class SpamFilter(MboxFilter):
    """Saves messages flagged as spam by bogofilter in "<mbox>.spam"."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # finalize() reports len(self.mbox) as the number of spams of this
        # run, so leftovers of a previous run must go.
        # NOTE(review): not visible in the listing this was recovered from;
        # mirrors the DSN filter's handling of its temporary mailbox - confirm.
        self.mbox.clear()

    def process(self, message):
        spam_flag = message['X-Spam-Flag']
        if spam_flag is not None and spam_flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
350 #----------------------------------------------------------------------------#
class UnsureFilter(MboxFilter):
    """Saves messages bogofilter could not classify in "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # finalize() reports len(self.mbox) as a result of this run, so
        # leftovers of a previous run must go.
        # NOTE(review): not visible in the listing this was recovered from;
        # mirrors the DSN filter's handling of its temporary mailbox - confirm.
        self.mbox.clear()

    def process(self, message):
        spam_flag = message['X-Spam-Flag']
        if spam_flag is not None and spam_flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
372 #----------------------------------------------------------------------------#
class CheckNonSpamFilter(MboxFilter):
    """Counts messages whose X-Spam-Flag is absent or not a bogofilter "No".

    This filter never claims a message: it only counts and reports.
    """

    def initialize(self, mbox_file):
        self.seen = 0

    def process(self, message):
        spam_flag = message['X-Spam-Flag']
        if spam_flag is None or not spam_flag.startswith('No, tests=bogofilter'):
            self.seen += 1
        # NOTE(review): the return statement is missing from the listing
        # this was recovered from; this filter has no mailbox of its own,
        # so the message is handed to the next filter - confirm.
        return False

    def finalize(self):
        if self.seen:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
392 #----------------------------------------------------------------------------#
class OutOfOfficeFilter(MboxFilter):
    """Saves out-of-office auto-replies in "<mbox>.ooo".

    Detection relies on the subject and, for "Re: " replies, on the
    Delivered-To header and the first line of a plain text body.
    """

    def initialize(self, mbox_file):
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # finalize() reports len(self.mbox) as a result of this run, so
        # leftovers of a previous run must go.
        # NOTE(review): not visible in the listing this was recovered from;
        # mirrors the DSN filter's handling of its temporary mailbox - confirm.
        self.mbox.clear()

        # NOTE(review): five entries of this list are missing from the
        # listing this was recovered from, and line breaks were inserted
        # inside some of the visible patterns; restore the missing entries
        # and check the exact spacing against the real file.
        subject_re = [
            r'^(AUTO: )?Out of (the )?office',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        subject = findSubject(message)
        if subject is None:
            return False
        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        # NOTE(review): indentation was lost in the listing; the two checks
        # below are assumed to apply to "Re: " replies only - confirm.
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent
            if 'Autoresponder' in (message.get_all('Delivered-To') or []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                body_lines = message.get_payload().splitlines()
                # An empty body has no first line to look at
                firstline = body_lines[0].lower() if body_lines else ''
                # NOTE(review): a trailing space may have been lost in these
                # two literals - confirm.
                if (' absent du bureau' in firstline
                        or ' away from my office' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
445 #----------------------------------------------------------------------------#
class DeliveryStatusNotificationFilter(MboxFilter):
    """Saves delivery status notifications and collects the faulty addresses.

    Reports from which an address could be extracted go to "<mbox>.dsn";
    multipart/report messages that did not yield one (temporary or invalid
    notifications) go to "<mbox>.dsn-temp" for manual processing.
    """

    def initialize(self, mbox_file):
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # NOTE(review): not visible in the listing this was recovered from;
        # done for consistency with the temporary mailbox below - confirm.
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment
        if message.get_content_type() == 'multipart/mixed' and message.is_multipart():
            report_message = message.get_payload(0)

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (a message without a From header cannot be one of them)
        sender = message['From'] or ''
        if 'MAILER-DAEMON@' in sender.upper() and report_message.get_content_type() == 'text/plain':
            address = findAddressInPlainBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        # NOTE(review): the lines closing this method are missing from the
        # listing this was recovered from; unrecognized messages are assumed
        # to be handed to the next filter - confirm.
        return False

    def finalize(self):
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('Here is the list of email adresses for these bounces:')
        for address in self.emails:
            print(address)
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
499 #----------------------------------------------------------------------------#
class CatchAllFilter(MboxFilter):
    """Saves every message that reaches it in "<mbox>.catchall"."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # finalize() reports len(self.mbox) as a result of this run, so
        # leftovers of a previous run must go.
        # NOTE(review): not visible in the listing this was recovered from;
        # mirrors the DSN filter's handling of its temporary mailbox - confirm.
        self.mbox.clear()

    def process(self, message):
        self.mbox.add(message)
        # NOTE(review): the return statement is missing from the listing
        # this was recovered from; a catch-all claims every message - confirm.
        return True

    def finalize(self):
        count = len(self.mbox)
        # Close first so that the file is not open anymore when it is removed
        self.mbox.close()
        if count > 0:
            print('%d messages reached the catchall.' % count)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
            # Nothing to review by hand: do not leave an empty file behind
            os.unlink(self.mbox_file)
523 #----------------------------------------------------------------------------#
525 if __name__
== '__main__':
527 if len(sys
.argv
) != 2:
528 print('Usage: %s mbox' % sys
.argv
[0])
531 if not os
.path
.exists(sys
.argv
[1]):
532 print('No such file: %s' % sys
.argv
[1])
535 processor
= MboxProcessor(sys
.argv
[1])