e3972d6f610626d176c23dee742e20a0e4f4d74c
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2008 polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 # Copyright (c) 2008 Aymeric Augustin
24
25 """
26 Process bounces from the newsletter as automatically as possible.
27
28 The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
30
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
33
34 All emails are saved in different mailboxes to make human post-processing easier.
35 """
36
37 import email, mailbox, os, re, sys, time
38
39 #----------------------------------------------------------------------------#
40
class MboxProcessor:
    """Runs every message of a mbox through an ordered chain of filters."""

    def __init__(self, mbox):
        # `mbox` is the path of the mailbox; the same path is later handed
        # to every filter's initialize().
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: apply_filters() stops at the first filter whose
        # process() returns a true value.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initializes each filter, then records the start time."""
        for current in self.filters:
            current.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Offers the message to each filter in turn.

        Returns True as soon as one filter accepts it, False if none does."""
        for current in self.filters:
            if current.process(message):
                return True
        return False

    def finalize_filters(self):
        """Prints the global summary, then lets each filter report."""
        elapsed = time.clock() - self.start_time
        rule = '-' * 80
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for current in self.filters:
            current.finalize()
            print(rule)

    def run(self):
        """Locks the mbox, processes all its messages and releases it."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the mailbox, even if a filter raised
            self.mbox.unlock()
            self.mbox.close()
83
84 #----------------------------------------------------------------------------#
85
class MboxFilter:
    """Base interface shared by all filters; every hook is a no-op here."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor before the first message.

        Subclasses open here whatever descriptors they need while processing."""
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for that message; a false one
        hands the message over to the next filter."""
        return None

    def finalize(self):
        """Hook invoked by the processor once all messages have been seen.

        Subclasses display their results and close their descriptors here."""
        return None
106
107 #----------------------------------------------------------------------------#
108
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    The Subject header is decoded with email.header (RFC 2047 encoded words
    are turned into text). Returns None when the message has no Subject
    header or when the header cannot be decoded."""
    # Imported explicitly: a plain `import email` does not by itself load
    # the email.header submodule.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    # `unicode` only exists on Python 2; fall back to `str` elsewhere.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    try:
        decoded = email.header.decode_header(raw_subject)
        return text_type(email.header.make_header(decoded))
    except Exception:
        # Best effort: an undecodable subject is treated as missing, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None
117
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)

def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    The message must be a multipart/report whose second part is a
    message/delivery-status. Only the first group of per-recipient fields
    (the second block of that part) is examined. The address is returned
    when the action is "failed" and either the status class is 5 or the
    diagnostic code starts with X-Postfix.

    Returns None or the email address. Every rejection prints a line
    starting with '!' explaining why; malformed or missing fields are
    rejected rather than raising."""
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A payload that is not a list of parts counts as zero parts.
    parts = bounce.get_payload() if bounce.is_multipart() else []
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status = parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = status.get_payload() if status.is_multipart() else []
    if len(blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(blocks))
        return None
    fields = blocks[1]
    if fields.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % fields.get_content_type())
        return None
    # Extract the faulty email address; the header may be absent altogether
    final_recipient = fields['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field; its value is compared case-insensitively
    action = fields['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = fields['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = fields['Status']
    # A missing Status header is treated like a non-permanent failure
    permanent = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
163
164 #----------------------------------------------------------------------------#
165
class DirectBouncesFilter(MboxFilter):
    """Catches bounces among the messages that carry no X-Spam-Flag header.

    A message is kept when it comes from the polytechnique.org MAILER-DAEMON
    with the expected subject and an address can be extracted from it."""

    def initialize(self, mbox_file):
        self.seen = 0
        self.emails = []
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        if message['X-Spam-Flag'] is not None:
            return False
        # Counted so that finalize() can verify that every such message was kept
        self.seen += 1
        # Additional checks, just to be sure
        expected_sender = message['From'] == 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
        expected_subject = message['Subject'] == 'Undelivered Mail Returned to Sender'
        if not (expected_sender and expected_subject):
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        kept = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % kept)
        if self.seen != kept:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
203
204 #----------------------------------------------------------------------------#
205
class SpamFilter(MboxFilter):
    """Moves messages flagged as spam by bogofilter to the '.spam' mailbox."""

    def initialize(self, mbox_file):
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Keeps the message if its X-Spam-Flag header marks it as spam.

        Returns True when the message was saved, False otherwise."""
        flag = message['X-Spam-Flag']
        # The header can be absent: DirectBouncesFilter lets through the
        # header-less messages it cannot confirm as bounces.
        if flag is not None and flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
224
225 #----------------------------------------------------------------------------#
226
class UnsureFilter(MboxFilter):
    """Moves messages bogofilter could not classify to the '.unsure' mailbox."""

    def initialize(self, mbox_file):
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Keeps the message if its X-Spam-Flag header marks it as unsure.

        Returns True when the message was saved, False otherwise."""
        flag = message['X-Spam-Flag']
        # The header can be absent: DirectBouncesFilter lets through the
        # header-less messages it cannot confirm as bounces.
        if flag is not None and flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
245
246 #----------------------------------------------------------------------------#
247
class CheckNonSpamFilter(MboxFilter):
    """Sanity check: counts messages whose X-Spam-Flag is not a non-spam verdict.

    This filter never stops the chain; it only reports in finalize()."""

    def initialize(self, mbox_file):
        # Number of messages reaching this step without a 'No' verdict
        self.seen = 0

    def process(self, message):
        """Counts the message if it is not flagged as non-spam.

        Always returns False so that the next filter gets the message."""
        flag = message['X-Spam-Flag']
        # A missing header is counted as well instead of raising
        if flag is None or not flag.startswith('No, tests=bogofilter'):
            self.seen += 1
        return False

    def finalize(self):
        if self.seen > 0:
            # Report the counter maintained by process()
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
264
265 #----------------------------------------------------------------------------#
266
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies, recognized by subject, to the '.ooo' mailbox."""

    def initialize(self, mbox_file):
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        subject_re = [
            r'^Absen(t|ce)',
            r'^Out of office',
            r'est absent',
            r'is out of (the )?office',
            # unicode! The accented letter is written as an escape so that
            # the pattern does not depend on the encoding of this file.
            u'^R\xe9ponse automatique d\'absence du bureau',
        ]
        # Compiled once; matching is case-insensitive and unicode-aware
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Keeps the message if its decoded subject matches a known pattern.

        Returns True when the message was saved, False otherwise."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
294
295 #----------------------------------------------------------------------------#
296
class DeliveryStatusNotificationFilter(MboxFilter):
    """Moves delivery failure notices, recognized by subject, to the '.dsn' mailbox.

    A message is only kept when an address can be extracted from it."""

    def initialize(self, mbox_file):
        self.emails = []
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        flags = re.I | re.U
        subject_re = [
            r'^DELIVERY FAILURE: ',
            r'^Delivery Notification: Delivery has failed$',
            r'^Delivery Status Notification ?\(Failure\)$',
            r'^Mail delivery failed',
            r'^(Mail revenu en erreur / )?Undelivered Mail Returned to Sender$',
            r'^Returned mail: see transcript for details$',
            r'^Undeliverable( mail)?:',
            r'^Undelivered Mail Returned to Sender$',
        ]
        self.subject_regexes = [re.compile(pattern, flags) for pattern in subject_re]

    def process(self, message):
        subject = findSubject(message)
        if subject is None:
            return False
        if not any(regex.search(subject) for regex in self.subject_regexes):
            return False
        # The subject looks like a failure notice; keep it only if the
        # faulty address can be extracted.
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
336
337 #----------------------------------------------------------------------------#
338
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: saves every message it receives to the '.catchall' mailbox."""

    def initialize(self, mbox_file):
        # Output mailbox, emptied of any previous content
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        # Unconditional: whatever reaches this filter is kept
        self.mbox.add(message)
        return True

    def finalize(self):
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught == 0:
            # Nothing was saved: remove the empty mailbox file
            os.unlink(self.mbox_file)
360
361 #----------------------------------------------------------------------------#
362
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = sys.argv[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()
374 processor.run()