2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
#----------------------------------------------------------------------------#

class MboxProcessor(object):
    """Applies a series of filters to each message in a mbox.

    The filters form a chain: every message is offered to each filter in
    order until one of them claims it by returning a true value.
    """

    def __init__(self, mbox):
        """Open the mbox file located at path `mbox` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: the first filter that claims a message stops the chain.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]
        self.start_time = None

    @staticmethod
    def _now():
        """Return a timestamp in seconds, suitable for measuring a duration.

        time.clock() was removed in Python 3.8; use perf_counter() when it
        exists and fall back to time.time() on older interpreters.
        """
        return getattr(time, 'perf_counter', time.time)()

    def initialize_filters(self):
        """Initialize every filter and start the timer."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._now()

    def apply_filters(self, message):
        """Offer `message` to each filter in turn.

        Returns True as soon as one filter claims the message (later filters
        are then not called), and False if no filter claimed it.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the global summary, then let each filter report its results."""
        duration = self._now() - self.start_time
        # NOTE(review): the exact separator was not visible in the reviewed
        # source; confirm against the version-controlled file.
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox: initialize, filter each message, finalize.

        NOTE(review): the method name and the locking around the loop were
        reconstructed; confirm against the version-controlled file.
        """
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised.
            self.mbox.unlock()
            self.mbox.close()
#----------------------------------------------------------------------------#

class MboxFilter(object):
    """Defines an interface for filters.

    The default implementations do nothing, so a subclass only overrides
    the hooks it needs.
    """

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing.
        `mbox_file` is the path of the mbox being processed."""
        pass

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter."""
        return False

    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
        pass
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as a unicode string.

    Returns None when the message has no Subject header. RFC 2047 encoded
    words are decoded; if the header names an unknown charset or contains
    bytes that cannot be decoded, the raw header text is returned instead
    of raising, so that one malformed message cannot abort a whole run.
    """
    # Imported here so the function does not rely on another module having
    # loaded the email.header submodule as a side effect.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    to_text = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs;
        # parts without an explicit charset are treated as UTF-8.
        decoded_seq = [(subj, enc or 'utf-8')
                       for subj, enc in email.header.decode_header(raw_subject)]
        return to_text(email.header.make_header(decoded_seq))
    except (LookupError, UnicodeError):
        # Unknown charset or undecodable bytes: fall back to the raw value.
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return to_text(raw_subject)
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@example.org"
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    The address is returned only when the message is a multipart/report
    whose second part is a message/delivery-status, whose second block of
    fields has a Final-Recipient, an Action of "failed", and either a
    status starting with 5 or an X-Postfix diagnostic code. Every other
    case prints the reason and returns None; missing fields are treated as
    a rejection instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None

    # Extract the second component of the multipart/report.
    # A non-multipart payload is a string, not a list of parts: count it as 0.
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None

    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload()) if status.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field
    action = content['Action']
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    permanent = status_code is not None and status_code[:1] == '5'
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None

    return address
175 #----------------------------------------------------------------------------#
class DirectBouncesFilter(MboxFilter):
    """Collects direct bounces: messages that carry no X-Spam-Flag header.

    Confirmed bounces are saved in "<mbox>.bounced" and their faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an empty "<mbox_file>.bounced" mailbox."""
        self.seen = 0           # messages without X-Spam-Flag
        self.bad_problems = 0   # expected bounces with no usable address
        self.emails = []        # faulty addresses extracted from the bounces
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Drop whatever a previous run left in the output mailbox.
        self.mbox.clear()

    def process(self, message):
        """Claim the message if it is a direct bounce with a faulty address.

        Returns True when the message was handled here (saved as a bounce,
        or dropped as a mailman notification), False otherwise.
        """
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            # NOTE(review): un-counting and claiming the message here was
            # reconstructed; confirm against the version-controlled file.
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the statistics and the list of bounced addresses, then close the mailbox."""
        confirmed = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % confirmed)
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != confirmed + self.bad_problems:
            # The backslashes are escaped explicitly; the printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
228 #----------------------------------------------------------------------------#
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Open an empty "<mbox_file>.spam" mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Drop whatever a previous run left in the output mailbox.
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag starts with the bogofilter "Yes" verdict."""
        flag = message['X-Spam-Flag']
        if flag is not None and flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print how many spams were found and close the mailbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
250 #----------------------------------------------------------------------------#
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter could not classify to "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Open an empty "<mbox_file>.unsure" mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Drop whatever a previous run left in the output mailbox.
        self.mbox.clear()

    def process(self, message):
        """Claim the message when its X-Spam-Flag starts with the bogofilter "Unsure" verdict."""
        flag = message['X-Spam-Flag']
        if flag is not None and flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print how many unclassified messages were found and close the mailbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
272 #----------------------------------------------------------------------------#
class CheckNonSpamFilter(MboxFilter):
    """Sanity check: counts messages whose X-Spam-Flag is not a bogofilter "No" verdict.

    This filter never claims a message; it only counts the unexpected ones.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count the message if its X-Spam-Flag is missing or is not a "No" verdict.

        Always returns False so that the next filter gets the message.
        """
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('No, tests=bogofilter'):
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any unexpected message was encountered."""
        if self.seen > 0:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
292 #----------------------------------------------------------------------------#
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies to "<mbox>.ooo".

    Detection is heuristic: it looks at the subject, at the Delivered-To
    headers and at the first line of simple text/plain bodies.
    """

    def initialize(self, mbox_file):
        """Open an empty "<mbox_file>.ooo" mailbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Drop whatever a previous run left in the output mailbox.
        self.mbox.clear()

        # NOTE(review): the reviewed source appears to have lost some entries
        # of this list; restore them from the version-controlled file.
        subject_re = [
            r'^(AUTO: )?Out of (the )?office',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?', # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim the message when it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                lines = message.get_payload().splitlines()
                # An empty body has no first line to inspect.
                firstline = lines[0].lower() if lines else ''
                if (' absent du bureau' in firstline
                        or ' away from my office' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Print how many auto-replies were found and close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
344 #----------------------------------------------------------------------------#
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts delivery status notifications (multipart/report messages).

    Reports from which a faulty address could be extracted go to
    "<mbox>.dsn"; all other reports go to "<mbox>.dsn-temp".
    """

    def initialize(self, mbox_file):
        """Open the two output mailboxes, both emptied first."""
        self.emails = []    # faulty addresses extracted from the reports
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim every multipart/report message.

        Returns True for any multipart/report (saved in one of the two
        mailboxes), False for every other content type.
        """
        if message.get_content_type() != 'multipart/report':
            return False

        address = findAddressInBounce(message)
        if address is not None:
            self.emails.append(address)
            self.mbox.add(message)
        else:
            print("! => Moved to temporary DSN mailbox")
            self.mbox_temp.add(message)
        return True

    def finalize(self):
        """Print the statistics and the bounced addresses, then close both mailboxes."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
384 #----------------------------------------------------------------------------#
class CatchAllFilter(MboxFilter):
    """Claims every message it is given and saves it in "<mbox>.catchall"."""

    def initialize(self, mbox_file):
        """Open an empty "<mbox_file>.catchall" mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        # Drop whatever a previous run left in the output mailbox.
        self.mbox.clear()

    def process(self, message):
        """Save the message and always claim it."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the number of unhandled messages; delete the mailbox file if it is empty."""
        remaining = len(self.mbox)
        self.mbox.close()
        if remaining > 0:
            print('%d messages reached the catchall.' % remaining)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
            # Nothing to review by hand: do not leave an empty file behind.
            os.unlink(self.mbox_file)
408 #----------------------------------------------------------------------------#
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    # Fail early instead of letting the mailbox module create an empty file.
    if not os.path.exists(sys.argv[1]):
        print('No such file: %s' % sys.argv[1])
        sys.exit(1)

    processor = MboxProcessor(sys.argv[1])
    # NOTE(review): the call that starts processing was not visible in the
    # reviewed source; confirm the method name against the MboxProcessor class.
    processor.run()