e3972d6f610626d176c23dee742e20a0e4f4d74c
1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2008 polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
23 # Copyright (c) 2008 Aymeric Augustin
26 Process as automatically as possible bounces from the newsletter
28 The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
34 All emails are saved in different mailboxes to make human post-processing easier.
37 import email
, mailbox
, os
, re
, sys
, time
39 #----------------------------------------------------------------------------#
42 """Applies a series of filters to each message in a mbox."""
44 def __init__(self
, mbox
):
46 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
48 DirectBouncesFilter(),
53 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepares every filter for a run, then records the starting clock value.

    Each filter in self.filters is handed the path of the mbox being
    processed (self.mbox_file). The value of time.clock() taken afterwards
    is stored in self.start_time."""
    for current_filter in self.filters:
        current_filter.initialize(self.mbox_file)
    # Taken after the filters are ready, so their setup is not counted.
    started_at = time.clock()
    self.start_time = started_at
def apply_filters(self, message):
    """Runs the message through self.filters, in order, until one claims it.

    Returns True as soon as a filter's process() returns a true value; the
    remaining filters are not called. Returns False when no filter claims
    the message (or when there are no filters)."""
    for current_filter in self.filters:
        if current_filter.process(message):
            return True
    return False
64 def finalize_filters(self
):
65 duration
= time
.clock() - self
.start_time
68 print 'Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
)
70 for f
in self
.filters
:
77 self
.initialize_filters()
78 for message
in self
.mbox
: self
.apply_filters(message
)
79 self
.finalize_filters()
84 #----------------------------------------------------------------------------#
87 """Defines an interface for filters."""
89 def initialize(self
, mbox_file
):
90 """Called by the processor before processing starts.
92 This is the place to open descriptors required during processing."""
95 def process(self
, message
):
96 """Called by the processor for each message that reaches this step.
98 Return true to stop processing, and false to go to the next filter."""
102 """Called by the processor after processing ends.
104 This is the place to display the results and close all descriptors."""
107 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as a unicode string.

    Returns None when the message has no Subject header, or when the header
    cannot be decoded (malformed encoded-word, unknown or wrong charset)."""
    # Import the submodules explicitly: a bare "import email" does not
    # guarantee that email.header and email.errors are loaded.
    import email.errors
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    try:
        chunks = email.header.decode_header(raw_subject)
        return unicode(email.header.make_header(chunks))
    except (email.errors.MessageError, UnicodeError, LookupError):
        # One undecodable subject must not abort the whole run; callers
        # already test the result against None before using it.
        return None
# Matches the value of a Final-Recipient field whose address type is rfc822.
# Group 1 is the address itself, without the surrounding whitespace.
_recipient_re = re.compile(r'^rfc822;\s*(\S.*?)\s*$', re.I | re.U)


def _payloadParts(message):
    """Returns the list of sub-parts of a message, or [] if it has none."""
    if message.is_multipart():
        return message.get_payload()
    return []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    The bounce must be a multipart/report whose second part is a
    message/delivery-status; the address is read from the Final-Recipient
    field of the first per-recipient block. It is only returned when the
    Action field is "failed" and either the Diagnostic-Code starts with
    X-Postfix or the Status class is 5 (permanent failure).

    Every rejection prints a line starting with '! ' that gives the reason.
    Returns None or the email address."""
    # Check that it is a bounce - a few MTAs fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    report_parts = _payloadParts(bounce)
    if len(report_parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(report_parts))
        return None
    status = report_parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    status_blocks = _payloadParts(status)
    if len(status_blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(status_blocks))
        return None
    content = status_blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address; a missing field is handled like a
    # field that does not match.
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field (its value is case-insensitive)
    action = content['Action']
    if (action or '').strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    if not postfix_error and not (status_code or '').strip().startswith('5'):
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
164 #----------------------------------------------------------------------------#
166 class DirectBouncesFilter(MboxFilter
):
168 def initialize(self
, mbox_file
):
171 self
.mbox_file
= '%s.bounced' % mbox_file
172 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
175 def process(self
, message
):
176 if message
['X-Spam-Flag'] is None:
177 # During finalization, we will verify that all messages were processed
179 # Additional checks, just to be sure
180 if message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
181 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
183 email
= findAddressInBounce(message
)
184 if email
is not None:
185 self
.emails
.append(email
)
186 self
.mbox
.add(message
)
191 print 'Found %d messages with no X-Spam-Flag header.' % self
.seen
192 print 'Found %d of them that are confirmed bounces.' %
len(self
.mbox
)
193 if self
.seen
!= len(self
.mbox
):
194 print ' /!\ These numbers shoud be equal! We have a problem! /!\\'
195 print 'They were saved in %s.' % self
.mbox_file
197 print 'Here is the list of email adresses for these bounces:'
199 for email
in self
.emails
:
204 #----------------------------------------------------------------------------#
206 class SpamFilter(MboxFilter
):
208 def initialize(self
, mbox_file
):
209 self
.mbox_file
= '%s.spam' % mbox_file
210 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
213 def process(self
, message
):
214 if message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
215 self
.mbox
.add(message
)
220 print 'Found %d spams. This is reliable.' %
len(self
.mbox
)
221 print 'They were saved in %s.' % self
.mbox_file
222 print 'You might check the contents of this mbox.'
225 #----------------------------------------------------------------------------#
227 class UnsureFilter(MboxFilter
):
229 def initialize(self
, mbox_file
):
230 self
.mbox_file
= '%s.unsure' % mbox_file
231 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
234 def process(self
, message
):
235 if message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
236 self
.mbox
.add(message
)
241 print 'Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
)
242 print 'They were saved in %s.' % self
.mbox_file
243 print 'You must check the contents of this mbox and feed the antispam.'
246 #----------------------------------------------------------------------------#
248 class CheckNonSpamFilter(MboxFilter
):
250 def initialize(self
, mbox_file
):
253 def process(self
, message
):
254 if not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
260 print 'Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.counter
261 print 'Please investigate.'
263 print 'All messages were either spam, or unsure, or non-spams. Good.'
265 #----------------------------------------------------------------------------#
267 class OutOfOfficeFilter(MboxFilter
):
269 def initialize(self
, mbox_file
):
270 self
.mbox_file
= '%s.ooo' % mbox_file
271 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
277 r
'is out of (the )?office',
278 u
'^RĂ©ponse automatique d\'absence du bureau', # unicode!
280 self
.subject_regexes
= map(re
.compile, subject_re
, [re
.I | re
.U
] * len(subject_re
))
282 def process(self
, message
):
283 subject
= findSubject(message
)
284 if subject
is not None and any(regex
.search(subject
) for regex
in self
.subject_regexes
):
285 self
.mbox
.add(message
)
290 print 'Found %d "out of office". This is generally reliable.' %
len(self
.mbox
)
291 print 'They were saved in %s.' % self
.mbox_file
292 print 'You may check the contents of this mbox.'
295 #----------------------------------------------------------------------------#
297 class DeliveryStatusNotificationFilter(MboxFilter
):
299 def initialize(self
, mbox_file
):
301 self
.mbox_file
= '%s.dsn' % mbox_file
302 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
305 r
'^DELIVERY FAILURE: ',
306 r
'^Delivery Notification: Delivery has failed$',
307 r
'^Delivery Status Notification ?\(Failure\)$',
308 r
'^Mail delivery failed',
309 r
'^(Mail revenu en erreur / )?Undelivered Mail Returned to Sender$',
310 r
'^Returned mail: see transcript for details$',
311 r
'^Undeliverable( mail)?:',
312 r
'^Undelivered Mail Returned to Sender$',
314 self
.subject_regexes
= map(re
.compile, subject_re
, [re
.I | re
.U
] * len(subject_re
))
316 def process(self
, message
):
317 subject
= findSubject(message
)
318 if subject
is not None and any(regex
.search(subject
) for regex
in self
.subject_regexes
):
319 email
= findAddressInBounce(message
)
320 if email
is not None:
321 self
.emails
.append(email
)
322 self
.mbox
.add(message
)
327 print 'Found %d delivery status notifications. This is generally reliable.' %
len(self
.mbox
)
328 print 'They were saved in %s.' % self
.mbox_file
330 print 'Here is the list of email adresses for these bounces:'
332 for email
in self
.emails
:
337 #----------------------------------------------------------------------------#
339 class CatchAllFilter(MboxFilter
):
341 def initialize(self
, mbox_file
):
342 self
.mbox_file
= '%s.catchall' % mbox_file
343 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
346 def process(self
, message
):
347 self
.mbox
.add(message
)
351 if len(self
.mbox
) > 0:
352 print '%d messages reached the catchall.' %
len(self
.mbox
)
353 print 'They were saved in %s.' % self
.mbox_file
354 print 'You must process the contents of this mbox manually.'
357 print 'No messages reached the catchall. Nice.'
359 os
.unlink(self
.mbox_file
)
361 #----------------------------------------------------------------------------#
363 if __name__
== '__main__':
365 if len(sys
.argv
) != 2:
366 print 'Usage: %s mbox' % sys
.argv
[0]
369 if not os
.path
.exists(sys
.argv
[1]):
370 print 'No such file: %s' % sys
.argv
[1]
373 processor
= MboxProcessor(sys
.argv
[1])