2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
47 def __init__(self
, mbox
):
49 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
51 DirectBouncesFilter(),
56 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepare every filter for a run and record when processing started.

    Calls initialize(self.mbox_file) on each entry of self.filters, in order,
    then stores a clock reading in self.start_time.
    """
    for mbox_filter in self.filters:
        mbox_filter.initialize(self.mbox_file)

    # time.clock() was removed in Python 3.8. Keep using it wherever it still
    # exists, so the stored value stays comparable with code that subtracts it
    # from another time.clock() reading, and fall back to time.perf_counter()
    # only on interpreters that no longer provide it.
    clock = getattr(time, 'clock', None) or time.perf_counter
    self.start_time = clock()
def apply_filters(self, message):
    """Run `message` through the filter chain until one filter claims it.

    Filters are tried in the order of self.filters; the first one whose
    process() returns a true value ends the chain and later filters are not
    called. Returns True if some filter claimed the message, False otherwise.
    """
    for mbox_filter in self.filters:
        if mbox_filter.process(message):
            return True
    return False
67 def finalize_filters(self
):
68 duration
= time
.clock() - self
.start_time
71 print('Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
))
73 for f
in self
.filters
:
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
87 #----------------------------------------------------------------------------#
90 """Defines an interface for filters."""
92 def initialize(self
, mbox_file
):
93 """Called by the processor before processing starts.
95 This is the place to open descriptors required during processing."""
98 def process(self
, message
):
99 """Called by the processor for each message that reaches this step.
101 Return true to stop processing, and false to go to the next filter."""
105 """Called by the processor after processing ends.
107 This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def _decodeSubjectLossily(chunks):
    """Decode (value, charset) pairs to text without ever raising on bad data."""
    pieces = []
    for value, charset in chunks:
        if isinstance(value, bytes):
            try:
                value = value.decode(charset)
            except (LookupError, UnicodeError):
                # Unknown codec or bytes invalid for it: keep what can be read
                value = value.decode('utf-8', 'replace')
        pieces.append(value)
    return u''.join(pieces)


def findSubject(message):
    """Return the decoded subject of an email message as a text string.

    The Subject header is decoded with email.header.decode_header; chunks that
    carry no charset are treated as UTF-8. The result is ``unicode`` on
    Python 2 and ``str`` on Python 3.

    If the header names a codec Python does not know, or contains bytes that
    are invalid in the declared charset, the affected chunks are decoded as
    UTF-8 with undecodable bytes replaced by U+FFFD instead of raising.

    Returns None when the message has no Subject header.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    chunks = [(value, charset or 'utf-8')
              for value, charset in email.header.decode_header(raw_subject)]

    # Be Python 2 & 3 compatible
    to_text = unicode if sys.version_info < (3,) else str
    try:
        return to_text(email.header.make_header(chunks))
    except (LookupError, UnicodeError):
        # One malformed Subject must not abort the processing of a whole mbox
        return _decodeSubjectLossily(chunks)
125 _recipient_re
= re
.compile(r
'^rfc822; ?(.+)$', re
.I | re
.U
)
128 def findAddressInBounce(bounce
):
129 """Finds the faulty email address in a bounced email.
131 See RFC 1894 for more information.
132 Returns None or the email address."""
134 # Check that it is a bounce - a few MTA fail to set this correctly :(
135 if bounce
.get_content_type() != 'multipart/report':
136 print('! Not a valid bounce (expected multipart/report, found %s).' % bounce
.get_content_type())
138 # Extract the second component of the multipart/report
139 num_payloads
= len(bounce
.get_payload())
141 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
143 status
= bounce
.get_payload(1)
144 if status
.get_content_type() != 'message/delivery-status':
145 print('! Not a valid bounce (expected message/delivery-status, found %s).' % status
.get_content_type())
147 # The per-message-fields don't matter here, get only the per-recipient-fields
148 num_payloads
= len(status
.get_payload())
150 print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads
)
152 content
= status
.get_payload(1)
153 if content
.get_content_type() != 'text/plain':
154 print('! Not a valid bounce (expected text/plain, found %s).' % content
.get_content_type())
156 # Extract the faulty email address
157 recipient_match
= _recipient_re
.search(content
['Final-Recipient'])
158 if recipient_match
is None:
159 print('! Missing final recipient.')
161 email
= recipient_match
.group(1)
162 # Check the action field
163 if content
['Action'].lower() != 'failed':
164 print('! Not a failed action (%s).' % content
['Action'])
167 status
= content
['Status']
168 diag_code
= content
['Diagnostic-Code']
170 # Permanent failure state
171 if int(status
[:1]) == 5:
174 # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
175 if diag_code
is not None and diag_code
.startswith('X-Postfix'):
179 "insufficient system storage",
183 if 'quota' in status
.lower():
185 if diag_code
is not None:
186 ldiag_code
= diag_code
.lower()
187 if any(hint
in ldiag_code
for hint
in failure_hints
):
190 print('! Not a permanent failure status (%s).' % status
)
191 if diag_code
is not None:
192 print('! Diagnostic code was: %s' % diag_code
)
196 def findAddressInPlainBounce(bounce
):
197 """Finds the faulty email address in a non-RFC-1894 bounced email
199 if 'MAILER-DAEMON@' not in bounce
['From'].upper():
200 print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % bounce
['From'])
202 if bounce
.get_content_type() != 'text/plain':
203 print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce
.get_content_type())
205 subject
= findSubject(bounce
).lower()
206 if (subject
!= 'failure notice'
207 and subject
!= 'undeliverable message'
208 and not subject
.startswith('mail delivery failed')
209 and subject
!= 'delivery status notification (failure)'):
211 print('! Not a valid plain bounce (unknown subject: %s).' % subject
)
214 # Read the 15 first lines of content and find some relevant keywords to validate the bounce
215 lines
= bounce
.get_payload().splitlines()[:15]
218 # A message that you sent could not be delivered to one or more of its recipients.
219 # I'm afraid I wasn't able to deliver your message to the following addresses.
220 # The following message to <email@example.com> was undeliverable.
221 non_delivery_hints
= [
222 "Delivery to the following recipient failed permanently",
223 "I wasn't able to deliver your message",
224 "> was undeliverable.",
225 "could not be delivered to",
226 "we were unable to deliver your message",
228 if not any(any(hint
in line
for hint
in non_delivery_hints
) for line
in lines
):
229 print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
230 print('\n'.join(lines
))
234 # This is a permanent error; I've given up. Sorry it didn't work out.
235 # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
236 permanent_error_hints
= [
237 "Delivery to the following recipient failed permanently",
238 "This is a permanent error",
239 "Unknown address error",
240 "550 Requested action not taken",
242 if not any(any(hint
in line
for hint
in permanent_error_hints
) for line
in lines
):
243 print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
244 print('\n'.join(lines
))
247 # Retrieve the first occurrence of <email@example.com>
249 match
= re
.match(r
'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line
)
251 match
= re
.match(r
'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$', line
)
252 if match
is not None:
253 email
= match
.group(1)
254 if email
.endswith('@polytechnique.org'):
255 # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
259 print('! Unknown mailer-daemon message, unable to find email address:')
260 print('\n'.join(lines
))
263 #----------------------------------------------------------------------------#
265 class DirectBouncesFilter(MboxFilter
):
267 def initialize(self
, mbox_file
):
269 self
.bad_problems
= 0
271 self
.mbox_file
= '%s.bounced' % mbox_file
272 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
275 def process(self
, message
):
276 if message
['X-Spam-Flag'] is None:
277 # During finalization, we will verify that all messages were processed
279 # Special case: ignore mailman notifications for the mailing-list
280 # on which the NL is forwarded
281 if message
['From'] == 'newsletter-externes-bounces@polytechnique.org':
282 print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
285 # Additional checks, just to be sure
286 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
287 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
288 print('! Not an usual direct bounce (From="%s", Subject="%s").' %
(message
['From'], message
['Subject']))
290 email
= findAddressInBounce(message
)
291 if email
is not None:
292 self
.emails
.append(email
)
293 self
.mbox
.add(message
)
296 print('! => No email found in direct bounce, this is really bad.')
297 self
.bad_problems
+= 1
301 print('Found %d messages with no X-Spam-Flag header.' % self
.seen
)
302 print('Found %d of them that are confirmed bounces.' %
len(self
.mbox
))
303 print('They were saved in %s.' % self
.mbox_file
)
304 if self
.bad_problems
:
305 print('Found %d of them that are invalid.' % self
.bad_problems
)
306 if self
.seen
!= len(self
.mbox
) + self
.bad_problems
:
307 print(' /!\ These numbers shoud be equal! We have a problem! /!\\')
309 print('Here is the list of email adresses for these bounces:')
311 for email
in self
.emails
:
316 #----------------------------------------------------------------------------#
318 class SpamFilter(MboxFilter
):
320 def initialize(self
, mbox_file
):
321 self
.mbox_file
= '%s.spam' % mbox_file
322 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
325 def process(self
, message
):
326 if message
['X-Spam-Flag'] is not None \
327 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
328 self
.mbox
.add(message
)
333 print('Found %d spams. This is reliable.' %
len(self
.mbox
))
334 print('They were saved in %s.' % self
.mbox_file
)
335 print('You might check the contents of this mbox.')
338 #----------------------------------------------------------------------------#
340 class UnsureFilter(MboxFilter
):
342 def initialize(self
, mbox_file
):
343 self
.mbox_file
= '%s.unsure' % mbox_file
344 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
347 def process(self
, message
):
348 if message
['X-Spam-Flag'] is not None \
349 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
350 self
.mbox
.add(message
)
355 print('Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
))
356 print('They were saved in %s.' % self
.mbox_file
)
357 print('You must check the contents of this mbox and feed the antispam.')
360 #----------------------------------------------------------------------------#
362 class CheckNonSpamFilter(MboxFilter
):
364 def initialize(self
, mbox_file
):
367 def process(self
, message
):
368 if message
['X-Spam-Flag'] is None \
369 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
375 print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
)
376 print('Please investigate.')
378 print('All messages were either spam, or unsure, or non-spams. Good.')
380 #----------------------------------------------------------------------------#
382 class OutOfOfficeFilter(MboxFilter
):
384 def initialize(self
, mbox_file
):
385 self
.mbox_file
= '%s.ooo' % mbox_file
386 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
390 r
'^(AUTO: )?Out of (the )?office',
392 r
'^Automatic reply: ',
393 r
'automatique d\'absence
',
397 r'I am currently away
',
398 r'(am|
is) out
of (the
)?office
',
399 r'Notification d
\'absence
',
400 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
402 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
404 def process(self, message):
405 subject = findSubject(message)
406 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
407 self.mbox.add(message)
410 # Some systems reply with "Re: ". Be smart here!
411 if subject is not None and subject.startswith('Re
: '):
412 # Delivered-To: Autoresponder
413 if 'Autoresponder
' in message.get_all('Delivered
-To
'):
414 self.mbox.add(message)
416 # Parse content if it is simple enough
417 if message.get_content_type() == 'text
/plain
':
418 firstline = message.get_payload().splitlines()[0].lower()
419 if (' absent du bureau
' in firstline
420 or ' away
from my office
' in firstline):
421 self.mbox.add(message)
427 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
428 print('They were saved
in %s
.' % self.mbox_file)
429 print('You may check the contents of this mbox
.')
432 #----------------------------------------------------------------------------#
434 class DeliveryStatusNotificationFilter(MboxFilter):
436 def initialize(self, mbox_file):
438 self.mbox_file = '%s
.dsn
' % mbox_file
439 self.mbox = mailbox.mbox(self.mbox_file)
441 self.mbox_temp_file = '%s
.dsn
-temp
' % mbox_file
442 self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
443 self.mbox_temp.clear()
445 def process(self, message):
446 # Don't modify message variable
for "self.mbox.add(message)"
447 report_message
= message
448 # Find real report inside attachment
449 if message
.get_content_type() == 'multipart/mixed':
450 report_message
= message
.get_payload(0)
452 # Process report if its type is correct
453 if report_message
.get_content_type() == 'multipart/report':
454 email
= findAddressInBounce(report_message
)
455 if email
is not None:
456 self
.emails
.append(email
)
457 self
.mbox
.add(message
)
459 print("! => Moved to temporary DSN mailbox")
460 self
.mbox_temp
.add(message
)
463 # Detect ill-formatted reports, sent as plain text email
464 if 'MAILER-DAEMON@' in message
['From'].upper():
465 email
= findAddressInPlainBounce(message
)
466 if email
is not None:
467 self
.emails
.append(email
)
468 self
.mbox
.add(message
)
473 print('Found %d delivery status notifications. This is generally reliable.' %
len(self
.mbox
))
474 print('They were saved in %s.' % self
.mbox_file
)
476 print('Here is the list of email adresses for these bounces:')
478 for email
in self
.emails
:
482 print('Found %d temporary and invalid delivery status notifications.' %
len(self
.mbox_temp
))
483 print('They were saved in %s.' % self
.mbox_temp_file
)
484 self
.mbox_temp
.close()
486 #----------------------------------------------------------------------------#
488 class CatchAllFilter(MboxFilter
):
490 def initialize(self
, mbox_file
):
491 self
.mbox_file
= '%s.catchall' % mbox_file
492 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
495 def process(self
, message
):
496 self
.mbox
.add(message
)
500 if len(self
.mbox
) > 0:
501 print('%d messages reached the catchall.' %
len(self
.mbox
))
502 print('They were saved in %s.' % self
.mbox_file
)
503 print('You must process the contents of this mbox manually.')
506 print('No messages reached the catchall. Nice.')
508 os
.unlink(self
.mbox_file
)
510 #----------------------------------------------------------------------------#
512 if __name__
== '__main__':
514 if len(sys
.argv
) != 2:
515 print('Usage: %s mbox' % sys
.argv
[0])
518 if not os
.path
.exists(sys
.argv
[1]):
519 print('No such file: %s' % sys
.argv
[1])
522 processor
= MboxProcessor(sys
.argv
[1])