Fixes inconsistencies with register_ext links.
[platal.git] / bin / newsletter.bounces.processor.py
CommitLineData
58e64caf
AA
1#!/usr/bin/env python2.5
2# -*- coding: utf-8 -*-
3#***************************************************************************
5e1513f6 4#* Copyright (C) 2003-2011 Polytechnique.org *
58e64caf
AA
5#* http://opensource.polytechnique.org/ *
6#* *
7#* This program is free software; you can redistribute it and/or modify *
8#* it under the terms of the GNU General Public License as published by *
9#* the Free Software Foundation; either version 2 of the License, or *
10#* (at your option) any later version. *
11#* *
12#* This program is distributed in the hope that it will be useful, *
13#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15#* GNU General Public License for more details. *
16#* *
17#* You should have received a copy of the GNU General Public License *
18#* along with this program; if not, write to the Free Software *
19#* Foundation, Inc., *
20#* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21#***************************************************************************
22
58e64caf
AA
23"""
24Process as automatically as possible bounces from the newsletter
25
26The goal is to extract the email addresses that actually bounced.
27Bounces conforming to RFC 1894 will be automatically processed.
28
29This script uses the X-Spam-Flag header to remove spam and heuristics
30to detect out-of-office auto-replies and delivery status notifications.
31
32All emails are saved in different mailboxes to make human post-processing easier.
33"""
34
35import email, mailbox, os, re, sys, time
36
37#----------------------------------------------------------------------------#
38
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in order for every message; the first filter whose
    process() returns a true value claims the message and the remaining
    filters are not consulted for it.
    """

    def __init__(self, mbox):
        """Opens the mbox located at the path `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: CatchAllFilter accepts every message, so it must
        # stay last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    def initialize_filters(self):
        """Lets every filter prepare itself, then starts the timer."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Runs `message` through the chain.

        Returns True if some filter claimed the message, False otherwise."""
        # any() short-circuits, so filters after the first match never see
        # the message.
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the global summary, then lets every filter report."""
        duration = time.clock() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Locks the mbox, processes every message and releases the mbox.

        Any exception raised while locking or processing is propagated after
        the mbox has been unlocked and closed."""
        try:
            # Locking happens inside the try block so that the mbox is closed
            # even when the lock cannot be acquired.
            self.mbox.lock()
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            self.mbox.unlock()
            self.mbox.close()
81
82#----------------------------------------------------------------------------#
83
class MboxFilter:
    """Base class describing the three hooks every filter provides.

    The default implementations do nothing and return None."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor once, before any message is seen.

        `mbox_file` is the path of the mbox being processed. Subclasses open
        here whatever descriptors they need during processing."""

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        hands the message over to the next filter."""

    def finalize(self):
        """Hook invoked by the processor once, after the last message.

        Subclasses display their results and close their descriptors here."""
104
105#----------------------------------------------------------------------------#
106
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    Returns None when the message has no Subject header or when the header
    cannot be decoded."""
    # Import the submodule explicitly instead of relying on another module
    # having loaded email.header as a side effect.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    try:
        decoded_parts = email.header.decode_header(raw_subject)
        return unicode(email.header.make_header(decoded_parts))
    except Exception:
        # Best effort: an undecodable subject is treated like a missing one.
        # Only Exception is caught so that KeyboardInterrupt and SystemExit
        # still interrupt the run.
        return None
115
# Matches the value of a Final-Recipient field and captures the address.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)

def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    Returns None or the email address.

    The bounce must be a multipart/report whose second part is a
    message/delivery-status. The second header block of that part (the first
    per-recipient block) must carry a Final-Recipient field, a 'failed'
    action and either a status starting with 5 or an X-Postfix diagnostic.
    A one-line explanation is printed whenever None is returned."""
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A message may claim to be multipart while carrying a plain string
    # payload; treat that as having no parts at all.
    parts = bounce.get_payload() if bounce.is_multipart() else []
    num_payloads = len(parts)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = status.get_payload() if status.is_multipart() else []
    num_payloads = len(blocks)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address; a missing field is searched as an
    # empty string so that it is reported instead of raising.
    recipient_match = _recipient_re.search(content['Final-Recipient'] or '')
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    if content['Action'] != 'failed':
        print('! Not a failed action (%s).' % content['Action'])
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    # A missing or malformed Status field is handled as "not permanent".
    status_code = (content['Status'] or '').strip()
    if not postfix_error and not status_code.startswith('5'):
        print('! Not a permanent failure status (%s).' % content['Status'])
        return None
    return address
163
164#----------------------------------------------------------------------------#
165
class DirectBouncesFilter(MboxFilter):
    """Claims messages without X-Spam-Flag header that are confirmed bounces.

    Confirmed bounces are stored in '<mbox>.bounced' and their faulty
    addresses are listed during finalization."""

    def initialize(self, mbox_file):
        """Resets the counters and empties the output mbox."""
        self.seen = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Returns True for confirmed bounces and dropped mailman notifications."""
        if message['X-Spam-Flag'] is not None:
            return False
        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded; they are dropped without being counted.
        if sender == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            return True
        # During finalization, we will verify that all counted messages
        # ended up in the output mbox.
        self.seen += 1
        subject = message['Subject']
        # Additional checks, just to be sure
        usual_bounce = sender == 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
            and subject == 'Undelivered Mail Returned to Sender'
        if not usual_bounce:
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, subject))
            return False
        address = findAddressInBounce(message)
        if address is None:
            print('! No email found in direct bounce, this is really bad.')
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the collected addresses, then closes the mbox."""
        saved = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % saved)
        if self.seen != saved:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
212
213#----------------------------------------------------------------------------#
214
class SpamFilter(MboxFilter):
    """Claims messages flagged as spam by bogofilter and stores them in '<mbox>.spam'."""

    def initialize(self, mbox_file):
        """Empties the output mbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Returns True when the X-Spam-Flag header reports a spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of spams found and closes the mbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
234
235#----------------------------------------------------------------------------#
236
class UnsureFilter(MboxFilter):
    """Claims messages bogofilter could not classify and stores them in '<mbox>.unsure'."""

    def initialize(self, mbox_file):
        """Empties the output mbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Returns True when the X-Spam-Flag header reports an unsure result."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of unclassified messages and closes the mbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
256
257#----------------------------------------------------------------------------#
258
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is absent or is not a bogofilter 'No'.

    This filter never claims a message: it only gathers a sanity statistic."""

    def initialize(self, mbox_file):
        """Resets the counter."""
        self.seen = 0

    def process(self, message):
        """Counts unexpected messages; always returns False."""
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether unexpected messages were encountered."""
        if self.seen > 0:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
            return
        print('All messages were either spam, or unsure, or non-spams. Good.')
276
277#----------------------------------------------------------------------------#
278
class OutOfOfficeFilter(MboxFilter):
    """Claims auto-replies recognized by their subject and stores them in '<mbox>.ooo'."""

    def initialize(self, mbox_file):
        """Empties the output mbox and compiles the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # Every entry needs its trailing comma: adjacent string literals are
        # silently concatenated into a single pattern otherwise.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r'automatique d\'absence',
            r'Notification d\'absence',
            u'R\xe9ponse automatique :',  # unicode! (\xe9 is e with acute accent)
            r'AutoReply',
        ]
        # Patterns are matched case-insensitively against the decoded subject.
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Returns True when the decoded subject matches one of the patterns."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of auto-replies found and closes the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
310
311#----------------------------------------------------------------------------#
312
class DeliveryStatusNotificationFilter(MboxFilter):
    """Claims multipart/report messages yielding a faulty address; stores them in '<mbox>.dsn'."""

    def initialize(self, mbox_file):
        """Resets the address list and empties the output mbox."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Returns True when a faulty address could be extracted from the report."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the collected addresses, then closes the mbox."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
340
341#----------------------------------------------------------------------------#
342
class CatchAllFilter(MboxFilter):
    """Claims every message that reaches it and stores it in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Empties the output mbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Stores the message unconditionally and returns True."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the leftovers; removes the output file when there are none."""
        leftovers = len(self.mbox)
        if leftovers == 0:
            print('No messages reached the catchall. Nice.')
            self.mbox.close()
            # Nothing was stored, so the empty file is deleted.
            os.unlink(self.mbox_file)
            return
        print('%d messages reached the catchall.' % leftovers)
        print('They were saved in %s.' % self.mbox_file)
        print('You must process the contents of this mbox manually.')
        self.mbox.close()
364
365#----------------------------------------------------------------------------#
366
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()