2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    Filters are tried in order for every message; the chain stops at the
    first filter whose ``process`` returns a true value.
    """

    def __init__(self, mbox):
        """Open the mbox located at path ``mbox`` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: a message stops at the first filter that claims it.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def _now(self):
        """Return a timestamp (in seconds) usable to measure a duration."""
        # time.clock() was removed in Python 3.8. Use perf_counter() when it
        # exists and only fall back to clock() on interpreters lacking it.
        timer = getattr(time, 'perf_counter', None) or time.clock
        return timer()

    def initialize_filters(self):
        """Let every filter open its resources, then start the timer."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._now()

    def apply_filters(self, message):
        """Run ``message`` through the chain.

        Returns True if some filter claimed the message, False otherwise.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the global summary, then let every filter report and clean up."""
        duration = self._now() - self.start_time
        # NOTE(review): the separator text is reconstructed - confirm.
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox: initialize, filter each message, finalize."""
        # NOTE(review): the lock/unlock/close calls are reconstructed - confirm.
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            self.mbox.unlock()
            self.mbox.close()
87 #----------------------------------------------------------------------------#
class MboxFilter:
    """Defines an interface for filters."""

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing.
        ``mbox_file`` is the path of the mbox being processed."""
        pass

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter."""
        # The default filter never claims a message; return an explicit
        # False so that the documented contract holds for subclasses that
        # do not override this method.
        return False

    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
        pass
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.  RFC 2047 encoded
    words are decoded; parts without a declared charset are treated as UTF-8.
    A subject declaring an unknown charset, or containing bytes that are
    invalid in its charset, is decoded leniently (invalid bytes are replaced)
    instead of raising.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = email.header.decode_header(raw_subject)
    decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decoded_seq]
    try:
        header = email.header.make_header(decoded_seq)
        # Be Python 2 & 3 compatible
        return unicode(header) if sys.version_info < (3,) else str(header)
    except (LookupError, UnicodeError):
        # Broken subjects are common in spam and auto-replies: one of them
        # must not abort the processing of the whole mailbox.
        pieces = []
        for subj, enc in decoded_seq:
            if isinstance(subj, bytes):
                try:
                    subj = subj.decode(enc, 'replace')
                except LookupError:
                    subj = subj.decode('utf-8', 'replace')
            pieces.append(subj)
        return ''.join(pieces)
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@example.com"
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    ``bounce`` is expected to be a multipart/report message whose second
    part is a message/delivery-status.  Only permanent failures (status 5.x.x)
    and X-Postfix diagnostics are accepted.  Every rejection prints a line
    starting with '!' explaining why.  Reports lacking the Final-Recipient,
    Action or Status field are rejected instead of raising.
    """

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = len(bounce.get_payload())
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload())
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    if not postfix_error and (status_code is None or status_code[:1] != '5'):
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
def findAddressInPlainBounce(bounce):
    """Finds the faulty email address in a non-RFC-1894 bounced email

    The message must come from a MAILER-DAEMON address, be text/plain, carry
    one of the known failure subjects and contain, within its 15 first lines,
    both a non-delivery hint and a permanent-error hint.  Returns the first
    address found in these lines, or None (after printing the reason) when the
    message cannot be trusted.  Messages lacking a From or Subject header are
    rejected instead of raising.
    """
    sender = bounce['From']
    if sender is None or 'MAILER-DAEMON@' not in sender.upper():
        print('! Not a valid plain bounce (expected from MAILER-DAEMON, found %s).' % sender)
        return None
    if bounce.get_content_type() != 'text/plain':
        print('! Not a valid plain bounce (expected text/plain, found %s).' % bounce.get_content_type())
        return None
    # findSubject returns None when the header is absent
    subject = (findSubject(bounce) or '').lower()
    if (subject != 'failure notice'
            and subject != 'undeliverable message'
            and not subject.startswith('mail delivery failed')
            and subject != 'delivery status notification (failure)'):
        print('! Not a valid plain bounce (unknown subject: %s).' % subject)
        return None

    # Read the 15 first lines of content and find some relevant keywords to validate the bounce
    lines = bounce.get_payload().splitlines()[:15]

    # Sample sentences that the hints below are meant to match:
    # A message that you sent could not be delivered to one or more of its recipients.
    # I'm afraid I wasn't able to deliver your message to the following addresses.
    # The following message to <email@example.com> was undeliverable.
    non_delivery_hints = [
        "Delivery to the following recipient failed permanently",
        "I wasn't able to deliver your message",
        "> was undeliverable.",
        "could not be delivered to",
        "we were unable to deliver your message",
    ]
    if not any(any(hint in line for hint in non_delivery_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for non-delivery in message:')
        print('\n'.join(lines))
        return None

    # Sample sentences that the hints below are meant to match:
    # This is a permanent error; I've given up. Sorry it didn't work out.
    # 5.1.0 - Unknown address error 550-'email@example.com... No such user'
    permanent_error_hints = [
        "Delivery to the following recipient failed permanently",
        "This is a permanent error",
        "Unknown address error",
        "550 Requested action not taken",
    ]
    if not any(any(hint in line for hint in permanent_error_hints) for line in lines):
        print('! Unknown mailer-daemon message, unable to find an hint for permanent error in message:')
        print('\n'.join(lines))
        return None

    # Retrieve the first occurence of <email@example.com>
    for line in lines:
        match = re.match(r'.*?<([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)>', line)
        if match is None:
            # Otherwise accept a line that only contains an address
            match = re.match(r'^\s*([0-9a-zA-Z_.-]+@[0-9a-zA-Z_.-]+)\s*$', line)
        if match is not None:
            address = match.group(1)
            if address.endswith('@polytechnique.org'):
                # First valid mail is something like <info_newsletter@polytechnique.org>, so we missed the real one
                # NOTE(review): giving up here (break) is reconstructed - confirm it was not "continue".
                break
            return address

    print('! Unknown mailer-daemon message, unable to find email address:')
    print('\n'.join(lines))
    return None
243 #----------------------------------------------------------------------------#
class DirectBouncesFilter(MboxFilter):
    """Handles messages without X-Spam-Flag header, expected to be direct bounces.

    Confirmed bounces are saved in ``<mbox>.bounced`` and the faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an empty ``<mbox_file>.bounced`` mailbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message when it is a direct bounce or a mailman notification.

        Returns True when the message was saved or deliberately dropped,
        False when the next filter must look at it.
        """
        if message['X-Spam-Flag'] is None:
            # During finalization, we will verify that all messages were processed
            self.seen += 1
            # Special case: ignore mailman notifications for the mailing-list
            # on which the NL is forwarded
            if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
                print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
                # NOTE(review): these two lines are reconstructed - confirm.
                # A dropped notification must not count as an unprocessed message.
                self.seen -= 1
                return True
            # Additional checks, just to be sure
            elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                    or message['Subject'] != 'Undelivered Mail Returned to Sender':
                print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            else:
                address = findAddressInBounce(message)
                if address is not None:
                    self.emails.append(address)
                    self.mbox.add(message)
                    return True
                else:
                    print('! => No email found in direct bounce, this is really bad.')
                    self.bad_problems += 1
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, then close the mailbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: '\ ' is an invalid escape sequence.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
296 #----------------------------------------------------------------------------#
class SpamFilter(MboxFilter):
    """Saves the messages that bogofilter flagged as spam in ``<mbox>.spam``."""

    def initialize(self, mbox_file):
        """Open an empty ``<mbox_file>.spam`` mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message (return True) when its X-Spam-Flag says "Yes"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of spams found and close the mailbox."""
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
318 #----------------------------------------------------------------------------#
class UnsureFilter(MboxFilter):
    """Saves the messages that bogofilter could not classify in ``<mbox>.unsure``."""

    def initialize(self, mbox_file):
        """Open an empty ``<mbox_file>.unsure`` mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim the message (return True) when its X-Spam-Flag says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of unclassified messages and close the mailbox."""
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
340 #----------------------------------------------------------------------------#
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is not a bogofilter "No".

    This filter never claims a message: it only checks that everything that
    reaches it was explicitly classified as non-spam.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count the message if it is not flagged "No"; always return False."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether unexpected messages were encountered."""
        if self.seen > 0:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
            return
        print('All messages were either spam, or unsure, or non-spams. Good.')
360 #----------------------------------------------------------------------------#
362 class OutOfOfficeFilter(MboxFilter
):
364 def initialize(self
, mbox_file
):
365 self
.mbox_file
= '%s.ooo' % mbox_file
366 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
370 r
'^(AUTO: )?Out of (the )?office',
372 r
'^Automatic reply: ',
373 r
'automatique d\'absence
',
377 r'I am currently away
',
378 r'(am|
is) out
of (the
)?office
',
379 r'Notification d
\'absence
',
380 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
382 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
384 def process(self, message):
385 subject = findSubject(message)
386 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
387 self.mbox.add(message)
390 # Some systems reply with "Re: ". Be smart here!
391 if subject is not None and subject.startswith('Re
: '):
392 # Delivered-To: Autoresponder
393 if 'Autoresponder
' in message.get_all('Delivered
-To
'):
394 self.mbox.add(message)
396 # Parse content if it is simple enough
397 if message.get_content_type() == 'text
/plain
':
398 firstline = message.get_payload().splitlines()[0].lower()
399 if (' absent du bureau
' in firstline
400 or ' away
from my office
' in firstline):
401 self.mbox.add(message)
407 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
408 print('They were saved
in %s
.' % self.mbox_file)
409 print('You may check the contents of this mbox
.')
412 #----------------------------------------------------------------------------#
class DeliveryStatusNotificationFilter(MboxFilter):
    """Handles delivery status notifications (RFC 1894 reports and plain-text bounces).

    Notifications from which a faulty address could be extracted are saved in
    ``<mbox>.dsn``; multipart/report messages that could not be validated go
    to ``<mbox>.dsn-temp``.
    """

    def initialize(self, mbox_file):
        """Open empty ``.dsn`` and ``.dsn-temp`` mailboxes next to ``mbox_file``."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim the message when it is a delivery status notification.

        Returns True when the message was saved in one of the two mailboxes,
        False when the next filter must look at it.
        """
        # Don't modify message variable for "self.mbox.add(message)"
        report_message = message
        # Find real report inside attachment
        if message.get_content_type() == 'multipart/mixed':
            report_message = message.get_payload(0)

        # Process report if its type is correct
        if report_message.get_content_type() == 'multipart/report':
            address = findAddressInBounce(report_message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
            else:
                print("! => Moved to temporary DSN mailbox")
                self.mbox_temp.add(message)
            return True

        # Detect ill-formatted reports, sent as plain text email
        # (a message without From header cannot be such a report)
        sender = message['From']
        if sender is not None and 'MAILER-DAEMON@' in sender.upper():
            address = findAddressInPlainBounce(message)
            if address is not None:
                self.emails.append(address)
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the statistics and the bounced addresses, then close both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
466 #----------------------------------------------------------------------------#
class CatchAllFilter(MboxFilter):
    """Saves every message that reaches it in ``<mbox>.catchall``."""

    def initialize(self, mbox_file):
        """Open an empty ``<mbox_file>.catchall`` mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save the message unconditionally and stop the chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report what was caught; delete the mailbox file when it is empty."""
        caught = len(self.mbox)
        if caught == 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing to review: do not leave an empty file behind
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % caught)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
490 #----------------------------------------------------------------------------#
if __name__ == '__main__':
    # Expect exactly one argument: the path of the mbox to process
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()