2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2013 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
24 Process as automatically as possible bounces from the newsletter
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
32 All emails are saved in different mailboxes to make human post-processing easier.
42 #----------------------------------------------------------------------------#
45 """Applies a series of filters to each message in a mbox."""
47 def __init__(self
, mbox
):
49 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
51 DirectBouncesFilter(),
56 DeliveryStatusNotificationFilter(),
def initialize_filters(self):
    """Prepare every filter, then start the processing timer.

    Each filter's initialize() is called, in order, with the path of the
    mbox being processed; the start time is recorded afterwards in
    self.start_time.
    """
    for mbox_filter in self.filters:
        mbox_filter.initialize(self.mbox_file)
    # NOTE(review): time.clock() was removed in Python 3.8. It is kept here
    # because finalize_filters() computes the duration with the same clock;
    # both calls have to be migrated together (e.g. to time.perf_counter()).
    self.start_time = time.clock()
def apply_filters(self, message):
    """Run `message` through the filters, in order, until one claims it.

    Returns True as soon as a filter's process() returns a true value (the
    remaining filters are not called), and False when no filter does.
    """
    for mbox_filter in self.filters:
        if mbox_filter.process(message):
            return True
    return False
67 def finalize_filters(self
):
68 duration
= time
.clock() - self
.start_time
71 print('Processed the %d messages of %s in %.2fs' %
(len(self
.mbox
), self
.mbox_file
, duration
))
73 for f
in self
.filters
:
80 self
.initialize_filters()
81 for message
in self
.mbox
: self
.apply_filters(message
)
82 self
.finalize_filters()
87 #----------------------------------------------------------------------------#
90 """Defines an interface for filters."""
92 def initialize(self
, mbox_file
):
93 """Called by the processor before processing starts.
95 This is the place to open descriptors required during processing."""
98 def process(self
, message
):
99 """Called by the processor for each message that reaches this step.
101 Return true to stop processing, and false to go to the next filter."""
105 """Called by the processor after processing ends.
107 This is the place to display the results and close all descriptors."""
110 #----------------------------------------------------------------------------#
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.

    RFC 2047 encoded words are decoded; parts without an explicit charset
    are treated as UTF-8. If the header declares an unknown charset, or its
    bytes do not match the declared charset, the subject is decoded
    leniently (undecodable bytes become U+FFFD) instead of raising, so that
    one malformed message cannot abort the processing of a whole mailbox.
    """
    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs
    decoded_seq = [(subj, enc or 'utf-8')
                   for subj, enc in email.header.decode_header(raw_subject)]
    try:
        header = email.header.make_header(decoded_seq)
        # Be Python 2 & 3 compatible
        return unicode(header) if sys.version_info < (3,) else str(header)
    except (LookupError, UnicodeError):
        # LookupError: unknown charset name; UnicodeError: bytes that are not
        # valid in the declared charset. Fall back to a best-effort decoding.
        pieces = []
        for subj, _ in decoded_seq:
            if isinstance(subj, bytes):
                subj = subj.decode('utf-8', 'replace')
            pieces.append(subj)
        return ''.join(pieces)
# Matches the value of a Final-Recipient field, e.g. "rfc822; user@example.org",
# and captures the address.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _dsn_field(part, name):
    """Returns header `name` of `part` as stripped text, or None when absent."""
    value = part[name]
    if value is None:
        return None
    return str(value).strip()


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    `bounce` is an email.Message. An address is returned only when the
    message is a multipart/report whose second part is a
    message/delivery-status with a per-recipient block that has a
    Final-Recipient, an Action of "failed" (compared case-insensitively) and
    either an X-Postfix diagnostic or a 5.x.x status. In every other case a
    line starting with '!' explaining the rejection is printed and None is
    returned; missing or malformed fields never raise.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None

    # Extract the second component of the multipart/report.
    # A non-multipart payload is a string, whose len() would count characters.
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None

    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload()) if status.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None

    # Extract the faulty email address
    final_recipient = _dsn_field(content, 'Final-Recipient')
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)

    # Check the action field (its value is case-insensitive)
    action = _dsn_field(content, 'Action')
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None

    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = _dsn_field(content, 'Diagnostic-Code')
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = _dsn_field(content, 'Status')
    permanent_failure = status_code is not None and status_code.startswith('5')
    if not postfix_error and not permanent_failure:
        print('! Not a permanent failure status (%s).' % status_code)
        return None

    return address
175 #----------------------------------------------------------------------------#
177 class DirectBouncesFilter(MboxFilter
):
179 def initialize(self
, mbox_file
):
181 self
.bad_problems
= 0
183 self
.mbox_file
= '%s.bounced' % mbox_file
184 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
187 def process(self
, message
):
188 if message
['X-Spam-Flag'] is None:
189 # During finalization, we will verify that all messages were processed
191 # Special case: ignore mailman notifications for the mailing-list
192 # on which the NL is forwarded
193 if message
['From'] == 'newsletter-externes-bounces@polytechnique.org':
194 print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
197 # Additional checks, just to be sure
198 elif message
['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
199 or message
['Subject'] != 'Undelivered Mail Returned to Sender':
200 print('! Not an usual direct bounce (From="%s", Subject="%s").' %
(message
['From'], message
['Subject']))
202 email
= findAddressInBounce(message
)
203 if email
is not None:
204 self
.emails
.append(email
)
205 self
.mbox
.add(message
)
208 print('! => No email found in direct bounce, this is really bad.')
209 self
.bad_problems
+= 1
213 print('Found %d messages with no X-Spam-Flag header.' % self
.seen
)
214 print('Found %d of them that are confirmed bounces.' %
len(self
.mbox
))
215 print('They were saved in %s.' % self
.mbox_file
)
216 if self
.bad_problems
:
217 print('Found %d of them that are invalid.' % self
.bad_problems
)
218 if self
.seen
!= len(self
.mbox
) + self
.bad_problems
:
219 print(' /!\ These numbers shoud be equal! We have a problem! /!\\')
221 print('Here is the list of email adresses for these bounces:')
223 for email
in self
.emails
:
228 #----------------------------------------------------------------------------#
230 class SpamFilter(MboxFilter
):
232 def initialize(self
, mbox_file
):
233 self
.mbox_file
= '%s.spam' % mbox_file
234 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
237 def process(self
, message
):
238 if message
['X-Spam-Flag'] is not None \
239 and message
['X-Spam-Flag'].startswith('Yes, tests=bogofilter'):
240 self
.mbox
.add(message
)
245 print('Found %d spams. This is reliable.' %
len(self
.mbox
))
246 print('They were saved in %s.' % self
.mbox_file
)
247 print('You might check the contents of this mbox.')
250 #----------------------------------------------------------------------------#
252 class UnsureFilter(MboxFilter
):
254 def initialize(self
, mbox_file
):
255 self
.mbox_file
= '%s.unsure' % mbox_file
256 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
259 def process(self
, message
):
260 if message
['X-Spam-Flag'] is not None \
261 and message
['X-Spam-Flag'].startswith('Unsure, tests=bogofilter'):
262 self
.mbox
.add(message
)
267 print('Found %d unclassified messages. Most of them should be spams.' %
len(self
.mbox
))
268 print('They were saved in %s.' % self
.mbox_file
)
269 print('You must check the contents of this mbox and feed the antispam.')
272 #----------------------------------------------------------------------------#
274 class CheckNonSpamFilter(MboxFilter
):
276 def initialize(self
, mbox_file
):
279 def process(self
, message
):
280 if message
['X-Spam-Flag'] is None \
281 or not message
['X-Spam-Flag'].startswith('No, tests=bogofilter'):
287 print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self
.seen
)
288 print('Please investigate.')
290 print('All messages were either spam, or unsure, or non-spams. Good.')
292 #----------------------------------------------------------------------------#
294 class OutOfOfficeFilter(MboxFilter
):
296 def initialize(self
, mbox_file
):
297 self
.mbox_file
= '%s.ooo' % mbox_file
298 self
.mbox
= mailbox
.mbox(self
.mbox_file
)
302 r
'^(AUTO: )?Out of (the )?office',
303 r
'^Automatic reply: ',
304 r
'automatique d\'absence
',
308 r'I am currently away
',
309 r'is out
of (the
)?office
',
310 r'Notification d
\'absence
',
311 r'R
.{1,2}ponse
automatique( :)?
', # There may be encoding error of e acute
313 self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]
315 def process(self, message):
316 subject = findSubject(message)
317 if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
318 self.mbox.add(message)
323 print('Found %d
"out of office". This
is generally reliable
.' % len(self.mbox))
324 print('They were saved
in %s
.' % self.mbox_file)
325 print('You may check the contents of this mbox
.')
328 #----------------------------------------------------------------------------#
330 class DeliveryStatusNotificationFilter(MboxFilter):
332 def initialize(self, mbox_file):
334 self.mbox_file = '%s
.dsn
' % mbox_file
335 self.mbox = mailbox.mbox(self.mbox_file)
337 self.mbox_temp_file = '%s
.dsn
-temp
' % mbox_file
338 self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
339 self.mbox_temp.clear()
341 def process(self, message):
342 if message.get_content_type() == 'multipart
/report
':
343 email = findAddressInBounce(message)
344 if email is not None:
345 self.emails.append(email)
346 self.mbox.add(message)
349 print("! => Moved to temporary DSN mailbox")
350 self.mbox_temp.add(message)
355 print('Found %d delivery status notifications
. This
is generally reliable
.' % len(self.mbox))
356 print('They were saved
in %s
.' % self.mbox_file)
358 print('Here
is the
list of email adresses
for these bounces
:')
360 for email in self.emails:
364 print('Found %d temporary
and invalid delivery status notifications
.' % len(self.mbox_temp))
365 print('They were saved
in %s
.' % self.mbox_temp_file)
366 self.mbox_temp.close()
368 #----------------------------------------------------------------------------#
370 class CatchAllFilter(MboxFilter):
372 def initialize(self, mbox_file):
373 self.mbox_file = '%s
.catchall
' % mbox_file
374 self.mbox = mailbox.mbox(self.mbox_file)
377 def process(self, message):
378 self.mbox.add(message)
382 if len(self.mbox) > 0:
383 print('%d messages reached the catchall
.' % len(self.mbox))
384 print('They were saved
in %s
.' % self.mbox_file)
385 print('You must process the contents of this mbox manually
.')
388 print('No messages reached the catchall
. Nice
.')
390 os.unlink(self.mbox_file)
392 #----------------------------------------------------------------------------#
394 if __name__ == '__main__
':
396 if len(sys.argv) != 2:
397 print('Usage
: %s mbox
' % sys.argv[0])
400 if not os.path.exists(sys.argv[1]):
401 print('No such
file: %s
' % sys.argv[1])
404 processor = MboxProcessor(sys.argv[1])