NL bounces: improve Out-of-Office detection
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
6208fd26 1#!/usr/bin/env python
58e64caf
AA
2# -*- coding: utf-8 -*-
3#***************************************************************************
ba6ae046 4#* Copyright (C) 2003-2013 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
6208fd26
NI
35import email
36import mailbox
37import os
38import re
39import sys
40import time
58e64caf
AA
41
42#----------------------------------------------------------------------------#
43
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    For every message the filters are tried in list order; the first filter
    whose process() returns a true value claims the message and the
    remaining filters are not consulted for it.
    """

    def __init__(self, mbox):
        """Open the mbox located at path `mbox` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: each message stops at the first filter accepting it,
        # and CatchAllFilter accepts everything that reaches it.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    @staticmethod
    def _now():
        """Return a timestamp, in seconds, suitable for measuring durations.

        time.clock() was removed in Python 3.8.  time.perf_counter() is used
        when the interpreter provides it, and time.time() otherwise (e.g.
        Python 2), so the script runs on both major versions.
        """
        return getattr(time, 'perf_counter', time.time)()

    def initialize_filters(self):
        """Initialize every filter and record when processing started."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._now()

    def apply_filters(self, message):
        """Run `message` through the chain; return True if a filter took it."""
        # any() short-circuits, so filters after the accepting one are skipped.
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the elapsed time, then let each filter report and clean up."""
        duration = self._now() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release and close the source mailbox, even on failure.
            self.mbox.unlock()
            self.mbox.close()
86
87#----------------------------------------------------------------------------#
88
class MboxFilter:
    """Defines an interface for filters.

    Subclasses override the hooks they need; the defaults do nothing and
    never claim a message.
    """

    def initialize(self, mbox_file):
        """Called by the processor before processing starts.

        This is the place to open descriptors required during processing.
        `mbox_file` is the path of the mbox being processed."""
        pass

    def process(self, message):
        """Called by the processor for each message that reaches this step.

        Return true to stop processing, and false to go to the next filter.
        The default implementation never claims the message."""
        # Return an explicit boolean, as documented, rather than None.
        return False

    def finalize(self):
        """Called by the processor after processing ends.

        This is the place to display the results and close all descriptors."""
        pass
109
110#----------------------------------------------------------------------------#
111
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header.  RFC 2047 encoded
    words are decoded; parts without a declared charset are treated as UTF-8.
    If the header cannot be decoded (unknown charset, invalid bytes, broken
    encoded-word), the raw header text is returned instead of raising, so
    that one malformed message does not abort the whole run.
    """
    # Import explicitly: "import email" alone does not guarantee that the
    # email.header submodule has been loaded.
    from email.errors import HeaderParseError
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # Be Python 2 & 3 compatible
    to_text = unicode if sys.version_info < (3,) else str

    try:
        # decode_header returns a list of (decoded_string, charset) pairs
        decoded_seq = [(subj, enc or 'utf-8') for subj, enc in decode_header(raw_subject)]
        return to_text(make_header(decoded_seq))
    except (HeaderParseError, LookupError, UnicodeError):
        # Best effort: hand back the undecoded header text.
        if isinstance(raw_subject, bytes):
            return raw_subject.decode('utf-8', 'replace')
        return to_text(raw_subject)
123
58e64caf
AA
124
# Matches the value of a Final-Recipient field ("rfc822; user@example.org")
# and captures the address part.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    The address is returned only when the message is a multipart/report
    whose second part is a message/delivery-status, whose second header
    block names an rfc822 Final-Recipient, reports the action "failed" and
    either carries an X-Postfix diagnostic code or a status starting with 5.
    Missing or malformed fields make the function print a diagnostic and
    return None instead of raising.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A message may claim to be multipart without having been parsed as
    # such; its payload is then not a list of parts, so count it as empty.
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(status.get_payload()) if status.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()
    # Check the action field
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    # Compare the leading character textually so that a missing or
    # non-numeric status is rejected instead of raising.
    permanent_failure = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent_failure:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
174
175#----------------------------------------------------------------------------#
176
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces that carry no X-Spam-Flag header.

    Messages for which findAddressInBounce() yields an address are stored
    in "<mbox>.bounced" and their address is remembered for the final
    report.
    """

    def initialize(self, mbox_file):
        """Reset the counters and start from an empty "<mbox>.bounced"."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` if it is a direct bounce with a usable address.

        Returns True when the message was stored (or deliberately dropped),
        and False to let the following filters look at it.
        """
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'newsletter-externes-bounces@polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False

        # "address" rather than "email", to avoid shadowing the email module.
        address = findAddressInBounce(message)
        if address is None:
            print('! => No email found in direct bounce, this is really bad.')
            self.bad_problems += 1
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the statistics and the collected addresses, then close."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Backslashes are escaped explicitly: a lone "\ " is an invalid
            # escape sequence.  The printed text is unchanged.
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
227
228#----------------------------------------------------------------------------#
229
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to "<mbox>.spam"."""

    def initialize(self, mbox_file):
        """Start from an empty "<mbox>.spam" mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` when its X-Spam-Flag header says "Yes"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the number of spams found and close the mailbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
249
250#----------------------------------------------------------------------------#
251
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter is unsure about to "<mbox>.unsure"."""

    def initialize(self, mbox_file):
        """Start from an empty "<mbox>.unsure" mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` when its X-Spam-Flag header says "Unsure"."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the number of unclassified messages and close the mailbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
271
272#----------------------------------------------------------------------------#
273
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is not a bogofilter "No".

    This filter never claims a message; it only gathers a statistic.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count `message` if it lacks a "No" verdict; always return False."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any message had an unexpected X-Spam-Flag."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf
AA
291
292#----------------------------------------------------------------------------#
293
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies to "<mbox>.ooo".

    Detection is heuristic: it relies on the subject and, for replies whose
    subject starts with "Re: ", on the Delivered-To headers and the first
    line of a text/plain body.
    """

    def initialize(self, mbox_file):
        """Start from an empty "<mbox>.ooo" and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^(AUTO: )?Out of (the )?office',
            r'^Auto( ?): ',
            r'^Automatic reply: ',
            r'automatique d\'absence',
            r'AutoReply',
            r'(est|is) absent',
            r'I am out of town',
            r'I am currently away',
            r'(am|is) out of (the )?office',
            r'Notification d\'absence',
            r'R.{1,2}ponse automatique( :)?',  # There may be encoding error of e acute
        ]
        self.subject_regexes = [re.compile(sre, re.I | re.U) for sre in subject_re]

    def process(self, message):
        """Claim `message` when it looks like an out-of-office auto-reply."""
        subject = findSubject(message)
        if subject is None:
            return False

        if any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True

        # Some systems reply with "Re: ". Be smart here!
        if subject.startswith('Re: '):
            # Delivered-To: Autoresponder
            # get_all() returns None when the header is absent, hence the
            # explicit empty-list default.
            if 'Autoresponder' in message.get_all('Delivered-To', []):
                self.mbox.add(message)
                return True
            # Parse content if it is simple enough
            if message.get_content_type() == 'text/plain':
                # An empty body has no first line; treat it as an empty string.
                body_lines = (message.get_payload() or '').splitlines()
                firstline = body_lines[0].lower() if body_lines else ''
                if (' absent du bureau ' in firstline
                        or ' away from my office ' in firstline):
                    self.mbox.add(message)
                    return True

        return False

    def finalize(self):
        """Report the number of auto-replies found and close the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
343
344#----------------------------------------------------------------------------#
345
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts multipart/report messages into two mailboxes.

    Reports for which findAddressInBounce() yields an address go to
    "<mbox>.dsn"; every other multipart/report goes to "<mbox>.dsn-temp".
    """

    def initialize(self, mbox_file):
        """Start from empty "<mbox>.dsn" and "<mbox>.dsn-temp" mailboxes."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.mbox_temp_file = '%s.dsn-temp' % mbox_file
        self.mbox_temp = mailbox.mbox(self.mbox_temp_file)
        self.mbox_temp.clear()

    def process(self, message):
        """Claim every multipart/report message; ignore anything else."""
        if message.get_content_type() != 'multipart/report':
            return False

        address = findAddressInBounce(message)
        if address is None:
            print("! => Moved to temporary DSN mailbox")
            self.mbox_temp.add(message)
        else:
            self.emails.append(address)
            self.mbox.add(message)
        return True

    def finalize(self):
        """Print both counts and the collected addresses, then close."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf
AA
383
384#----------------------------------------------------------------------------#
385
class CatchAllFilter(MboxFilter):
    """Stores every message that reaches it in "<mbox>.catchall"."""

    def initialize(self, mbox_file):
        """Start from an empty "<mbox>.catchall" mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Unconditionally store `message` and stop the filter chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the leftovers; delete the mailbox file if there are none."""
        leftover = len(self.mbox)
        if leftover <= 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing was caught, so do not leave an empty file behind.
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % leftover)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
407
408#----------------------------------------------------------------------------#
409
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the path of the mbox.
    arguments = sys.argv

    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()