1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2008 polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
23 # Copyright (c) 2008 Aymeric Augustin
26 Process as automatically as possible bounces from the newsletter
The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
34 All emails are saved in different mailboxes to make human post-processing easier.
37 import email
, mailbox
, os
, re
, sys
, time
39 #----------------------------------------------------------------------------#
class MboxProcessor(object):
    """Applies a series of filters to each message in a mbox.

    Filters are tried in order for every message. The first filter whose
    process() returns true claims the message and the remaining filters
    are skipped for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox file named `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # NOTE(review): the chain follows the order in which the filter
        # classes are defined in this file; the catch-all accepts every
        # message, so it has to stay last. Confirm the intended order.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Lets every filter set itself up, then starts the stopwatch."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.time() rather than time.clock(): clock() counts CPU time on
        # Unix, which under-reports a run that mostly waits on mailbox I/O.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Runs `message` through the chain.

        Returns True if one filter claimed the message, False otherwise.
        Filters after the claiming one are not called.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the overall summary, then lets every filter report."""
        duration = time.time() - self.start_time
        print('Processed the %d messages of %s in %.2fs'
              % (len(self.mbox), self.mbox_file, duration))
        for f in self.filters:
            f.finalize()

    def run(self):
        """Processes the whole mbox: initialize, filter each message, report."""
        self.initialize_filters()
        for message in self.mbox:
            self.apply_filters(message)
        self.finalize_filters()
84 #----------------------------------------------------------------------------#
class MboxFilter(object):
    """Defines an interface for filters."""

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing."""

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter."""
        # The base filter claims nothing, so the message goes on to the
        # next filter.
        return False

    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
107 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header, or when the header
    cannot be decoded (malformed encoded-word, unknown or wrong charset).
    """
    # Imported here so the submodules are guaranteed to be loaded: a bare
    # `import email` does not promise that email.header is available.
    import email.errors
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    try:
        chunks = email.header.decode_header(raw_subject)
        subject = email.header.make_header(chunks)
        # type(u'') is `unicode` on Python 2 and `str` on Python 3.
        return type(u'')(subject)
    except (email.errors.HeaderParseError, LookupError, UnicodeError,
            ValueError):
        # One undecodable subject must not abort the whole run.
        return None
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@host";
# group 1 is the address.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _bounce_parts(message):
    """Returns the sub-messages of `message`, or [] when it has none."""
    if message.is_multipart():
        return message.get_payload()
    return []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    Every rejection prints a one-line reason starting with '!'. A message
    that lacks an expected part or field is rejected, never raised on.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).'
              % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    report_parts = _bounce_parts(bounce)
    if len(report_parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).'
              % len(report_parts))
        return None
    status = report_parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).'
              % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the
    # per-recipient-fields
    status_parts = _bounce_parts(status)
    if len(status_parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).'
              % len(status_parts))
        return None
    content = status_parts[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).'
              % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()
    # Check the action field; its value is compared case-insensitively
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause
    # X-Postfix errors. Otherwise, the first sub-field of the status code
    # should indicate a permanent failure (class 5).
    diagnostic = content['Diagnostic-Code']
    postfix_error = (diagnostic is not None
                     and diagnostic.startswith('X-Postfix'))
    status_code = content['Status']
    permanent = status_code is not None and status_code.strip()[:1] == '5'
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
166 #----------------------------------------------------------------------------#
class DirectBouncesFilter(MboxFilter):
    """Catches bounces among the messages that carry no X-Spam-Flag header.

    A message is claimed only when it has no X-Spam-Flag header, comes from
    the expected MAILER-DAEMON sender with the expected subject, and a
    faulty address can be extracted from it. Claimed messages are saved in
    <mbox>.bounced.
    """

    def initialize(self, mbox_file):
        """Resets the counters and opens the output mailbox."""
        self.seen = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() reports len(self.mbox) as
        # the number of bounces confirmed during this run.
        self.mbox.clear()

    def process(self, message):
        """Claims `message` if it is a confirmed direct bounce."""
        if message['X-Spam-Flag'] is not None:
            return False
        # During finalization, we will verify that all messages were
        # processed
        self.seen += 1
        # Additional checks, just to be sure
        is_local_bounce = (
            message['From'] == 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
            and message['Subject'] == 'Undelivered Mail Returned to Sender')
        if not is_local_bounce:
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the counts and the bounced addresses, then closes the mbox."""
        confirmed = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % confirmed)
        if self.seen != confirmed:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('Here is the list of email adresses for these bounces:')
        for address in self.emails:
            print(address)
        # close() also flushes the added messages to disk.
        self.mbox.close()
206 #----------------------------------------------------------------------------#
class SpamFilter(MboxFilter):
    """Claims messages flagged as spam by bogofilter; saves them in <mbox>.spam."""

    def initialize(self, mbox_file):
        """Opens the output mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() reports len(self.mbox) as
        # the number of spams found during this run.
        self.mbox.clear()

    def process(self, message):
        """Claims `message` if its X-Spam-Flag header says it is spam."""
        flag = message['X-Spam-Flag']
        # The header may be absent: a flag-less message that the previous
        # filters did not claim still reaches this one.
        if flag is not None and flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of spams, then closes the mbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        # close() also flushes the added messages to disk.
        self.mbox.close()
227 #----------------------------------------------------------------------------#
class UnsureFilter(MboxFilter):
    """Claims messages bogofilter could not classify; saves them in <mbox>.unsure."""

    def initialize(self, mbox_file):
        """Opens the output mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() reports len(self.mbox) as
        # the number of unclassified messages found during this run.
        self.mbox.clear()

    def process(self, message):
        """Claims `message` if its X-Spam-Flag header says "unsure"."""
        flag = message['X-Spam-Flag']
        # The header may be absent: a flag-less message that the previous
        # filters did not claim still reaches this one.
        if flag is not None and flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of unclassified messages, then closes the mbox."""
        print('Found %d unclassified messages. Most of them should be spams.'
              % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        # close() also flushes the added messages to disk.
        self.mbox.close()
248 #----------------------------------------------------------------------------#
class CheckNonSpamFilter(MboxFilter):
    """Sanity check: counts messages whose X-Spam-Flag is not a "No" verdict.

    This filter never claims a message; it only counts the unexpected ones
    so that finalize() can ask for an investigation.
    """

    def initialize(self, mbox_file):
        """Resets the counter; this filter writes no mailbox."""
        self.counter = 0

    def process(self, message):
        """Counts `message` if it is not flagged as non-spam; never claims it."""
        flag = message['X-Spam-Flag']
        # A missing header is counted too: it is neither spam, nor unsure,
        # nor non-spam.
        if flag is None or not flag.startswith('No, tests=bogofilter'):
            self.counter += 1
        return False

    def finalize(self):
        """Prints whether any unexpected X-Spam-Flag value was met."""
        if self.counter > 0:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.'
                  % self.counter)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
267 #----------------------------------------------------------------------------#
class OutOfOfficeFilter(MboxFilter):
    """Claims auto-replies recognised by their subject; saves them in <mbox>.ooo."""

    def initialize(self, mbox_file):
        """Opens the output mailbox and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() reports len(self.mbox) as
        # the number of auto-replies found during this run.
        self.mbox.clear()
        # One pattern per entry. Every entry needs its trailing comma:
        # without it two adjacent literals are silently concatenated into
        # a single pattern that matches neither subject.
        subject_re = [
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'automatique d\'absence',
            r'Notification d\'absence',
            # unicode! Written with an escape so the pattern survives any
            # re-encoding of this file; the space before ':' is optional.
            u'R\xe9ponse automatique ?:',
        ]
        # A real list, so the patterns can be scanned once per message.
        self.subject_regexes = [re.compile(pattern, re.I | re.U)
                                for pattern in subject_re]

    def process(self, message):
        """Claims `message` if its subject matches one of the patterns."""
        subject = findSubject(message)
        if subject is None:
            return False
        for regex in self.subject_regexes:
            if regex.search(subject):
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Prints the number of auto-replies, then closes the mbox."""
        print('Found %d "out of office". This is generally reliable.'
              % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        # close() also flushes the added messages to disk.
        self.mbox.close()
301 #----------------------------------------------------------------------------#
class DeliveryStatusNotificationFilter(MboxFilter):
    """Claims multipart/report bounces with an extractable faulty address.

    Claimed messages are saved in <mbox>.dsn.
    """

    def initialize(self, mbox_file):
        """Resets the address list and opens the output mailbox."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() reports len(self.mbox) as
        # the number of notifications found during this run.
        self.mbox.clear()

    def process(self, message):
        """Claims `message` if it is a report naming a failed recipient."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the count and the bounced addresses, then closes the mbox."""
        print('Found %d delivery status notifications. This is generally reliable.'
              % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('Here is the list of email adresses for these bounces:')
        for address in self.emails:
            print(address)
        # close() also flushes the added messages to disk.
        self.mbox.close()
331 #----------------------------------------------------------------------------#
class CatchAllFilter(MboxFilter):
    """Claims every message that reaches it; saves them in <mbox>.catchall."""

    def initialize(self, mbox_file):
        """Opens the output mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Start from an empty mailbox: finalize() uses len(self.mbox) to
        # decide whether anything was caught during this run.
        self.mbox.clear()

    def process(self, message):
        """Saves `message` unconditionally and stops the chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints what was caught; removes the mailbox file if it is empty."""
        caught = len(self.mbox)
        # close() also flushes the added messages to disk; it has to happen
        # before the file may be unlinked.
        self.mbox.close()
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
            # Do not leave an empty mailbox file behind.
            os.unlink(self.mbox_file)
355 #----------------------------------------------------------------------------#
# Command-line entry point: process the mbox named by the single argument.
if __name__ == '__main__':

    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        # Stop here: sys.argv[1] is read just below.
        sys.exit(1)

    if not os.path.exists(sys.argv[1]):
        print('No such file: %s' % sys.argv[1])
        sys.exit(1)

    processor = MboxProcessor(sys.argv[1])
    processor.run()