1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2008 polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
23 # Copyright (c) 2008 Aymeric Augustin
26 Process bounces from the newsletter as automatically as possible
28 The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
34 All emails are saved in different mailboxes to make human post-processing easier.
37 import email
, mailbox
, os
, re
, sys
, time
39 #----------------------------------------------------------------------------#
42 """Applies a series of filters to each message in a mbox."""
44 def __init__(self
, mbox
):
46 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
48 DirectBouncesFilter(),
53 DeliveryStatusNotificationFilter(),
57 def initialize_filters(self
):
58 for f
in self
.filters
: f
.initialize(self
.mbox_file
)
59 self
.start_time
= time
.clock()
61 def apply_filters(self
, message
):
62 return any(f
.process(message
) for f
in self
.filters
)
64 def finalize_filters(self
):
65 duration
= time
.clock() - self
.start_time
68 print 'Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
)
70 for f
in self
.filters
:
77 self
.initialize_filters()
78 for message
in self
.mbox
: self
.apply_filters(message
)
79 self
.finalize_filters()
84 #----------------------------------------------------------------------------#
87 """Defines an interface for filters."""
89 def initialize(self
, mbox_file
):
90 """Called by the processor before processing starts.
92 This is the place to open descriptors required during processing."""
95 def process(self
, message
):
96 """Called by the processor for each message that reaches this step.
98 Return true to stop processing, and false to go to the next filter."""
102 """Called by the processor after processing ends.
104 This is the place to display the results and close all descriptors."""
107 #----------------------------------------------------------------------------#
109 def findSubject(message
):
110 """Returns the subject of an email.Message as an unicode string."""
111 if message
['Subject'] is not None:
113 return unicode(email
.header
.make_header(email
.header
.decode_header(message
['Subject'])))
# Captures the address of an RFC 1894 "Final-Recipient" field whose
# address-type is rfc822, e.g. "rfc822; user@example.com".
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _countParts(message):
    """Returns the number of sub-parts of message, or 0 if it has none.

    A part can announce a multipart content type and still carry a plain
    string payload; indexing such a payload would raise TypeError."""
    if message.is_multipart():
        return len(message.get_payload())
    return 0


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    bounce is an email.Message. It is accepted only if it is a
    multipart/report whose second part is a message/delivery-status, whose
    own second part holds the per-recipient fields.

    Returns None or the email address. Whenever None is returned, a line
    starting with '!' is printed to explain why the bounce was rejected."""
    # Check that it is a bounce - a few MTAs fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).'
              % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = _countParts(bounce)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).'
              % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        # Report the type of the part that was actually inspected.
        print('! Not a valid bounce (expected message/delivery-status, found %s).'
              % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = _countParts(status)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).'
              % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).'
              % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    # Not named "email", which would hide the email module in this scope.
    address = recipient_match.group(1)
    # Check the action field; its value is case-insensitive
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    if not postfix_error:
        status_code = content['Status']
        # A missing or garbled Status cannot prove a permanent (5.x.x) failure.
        if status_code is None or status_code.strip()[:1] != '5':
            print('! Not a permanent failure status (%s).' % status_code)
            return None
    return address
166 #----------------------------------------------------------------------------#
168 class DirectBouncesFilter(MboxFilter
):
170 def initialize(self
, mbox_file
):
173 self
.mbox_file
= '%s.bounced' % mbox_file
174 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
177 def process(self
, message
):
178 if message
['X-Spam-Flag'] is None:
179 # During finalization, we will verify that all messages were processed
181 # Special case: ignore mailman notifications for the mailing-list
182 # on which the NL is forwarded
183 if message
['From'] == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
184 print '! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.'
187 # Additional checks, just to be sure
188 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
189 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
190 print '! Not an usual direct bounce (From="%s", Subject="%s").' %
(message
['From'], message
['Subject'])
192 email
= findAddressInBounce(message
)
193 if email
is not None:
194 self
.emails
.append(email
)
195 self
.mbox
.add(message
)
198 print '! No email found in direct bounce, this is really bad.'
202 print 'Found %d messages with no X-Spam-Flag header.' % self
.seen
203 print 'Found %d of them that are confirmed bounces.' %
len(self
.mbox
)
204 if self
.seen
!= len(self
.mbox
):
205 print ' /!\ These numbers shoud be equal! We have a problem! /!\\'
206 print 'They were saved in %s.' % self
.mbox_file
208 print 'Here is the list of email adresses for these bounces:'
210 for email
in self
.emails
:
215 #----------------------------------------------------------------------------#
217 class SpamFilter(MboxFilter
):
219 def initialize(self
, mbox_file
):
220 self
.mbox_file
= '%s.spam' % mbox_file
221 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
224 def process(self
, message
):
225 if message
['X-Spam-Flag'] is not None \
226 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
227 self
.mbox
.add(message
)
232 print 'Found %d spams. This is reliable.' %
len(self
.mbox
)
233 print 'They were saved in %s.' % self
.mbox_file
234 print 'You might check the contents of this mbox.'
237 #----------------------------------------------------------------------------#
239 class UnsureFilter(MboxFilter
):
241 def initialize(self
, mbox_file
):
242 self
.mbox_file
= '%s.unsure' % mbox_file
243 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
246 def process(self
, message
):
247 if message
['X-Spam-Flag'] is not None \
248 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
249 self
.mbox
.add(message
)
254 print 'Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
)
255 print 'They were saved in %s.' % self
.mbox_file
256 print 'You must check the contents of this mbox and feed the antispam.'
259 #----------------------------------------------------------------------------#
261 class CheckNonSpamFilter(MboxFilter
):
263 def initialize(self
, mbox_file
):
266 def process(self
, message
):
267 if message
['X-Spam-Flag'] is None \
268 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
274 print 'Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
275 print 'Please investigate.'
277 print 'All messages were either spam, or unsure, or non-spams. Good.'
279 #----------------------------------------------------------------------------#
281 class OutOfOfficeFilter(MboxFilter
):
283 def initialize(self
, mbox_file
):
284 self
.mbox_file
= '%s.ooo' % mbox_file
285 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
290 r
'^Out of (the )?office',
291 r
'is out of (the )?office',
293 r
'automatique d\'absence
',
294 r'Notification d
\'absence
'
295 u'RĂ©ponse automatique
:', #unicode!
298 self.subject_regexes = map(re.compile, subject_re, [re.I | re.U] * len(subject_re))
300 def process(self, message):
301 subject = findSubject(message)
302 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
303 self.mbox.add(message)
308 print 'Found %d
"out of office". This
is generally reliable
.' % len(self.mbox)
309 print 'They were saved
in %s
.' % self.mbox_file
310 print 'You may check the contents of this mbox
.'
313 #----------------------------------------------------------------------------#
315 class DeliveryStatusNotificationFilter(MboxFilter):
317 def initialize(self, mbox_file):
319 self.mbox_file = '%s
.dsn
' % mbox_file
320 self.mbox = mailbox.mbox(self.mbox_file)
323 def process(self, message):
324 if message.get_content_type() == 'multipart
/report
':
325 email = findAddressInBounce(message)
326 if email is not None:
327 self.emails.append(email)
328 self.mbox.add(message)
333 print 'Found %d delivery status notifications
. This
is generally reliable
.' % len(self.mbox)
334 print 'They were saved
in %s
.' % self.mbox_file
336 print 'Here
is the
list of email adresses
for these bounces
:'
338 for email in self.emails:
343 #----------------------------------------------------------------------------#
345 class CatchAllFilter(MboxFilter):
347 def initialize(self, mbox_file):
348 self.mbox_file = '%s
.catchall
' % mbox_file
349 self.mbox = mailbox.mbox(self.mbox_file)
352 def process(self, message):
353 self.mbox.add(message)
357 if len(self.mbox) > 0:
358 print '%d messages reached the catchall
.' % len(self.mbox)
359 print 'They were saved
in %s
.' % self.mbox_file
360 print 'You must process the contents of this mbox manually
.'
363 print 'No messages reached the catchall
. Nice
.'
365 os.unlink(self.mbox_file)
367 #----------------------------------------------------------------------------#
369 if __name__ == '__main__
':
371 if len(sys.argv) != 2:
372 print 'Usage
: %s mbox
' % sys.argv[0]
375 if not os.path.exists(sys.argv[1]):
376 print 'No such
file: %s
' % sys.argv[1]
379 processor = MboxProcessor(sys.argv[1])