Merge branch 'platal-0.10.2'
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2003-2010 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 """
24 Process as automatically as possible bounces from the newsletter
25
26 The goal is to extract the email addresses that actually bounced.
27 Bounces conforming to RFC 1894 will be automatically processed.
28
29 This script uses the X-Spam-Flag header to remove spam and heuristics
30 to detect out-of-office auto-replies and delivery status notifications.
31
32 All emails are saved in different mailboxes to make human post-processing easier.
33 """
34
35 import email, mailbox, os, re, sys, time
36
37 #----------------------------------------------------------------------------#
38
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    Filters are tried in list order for every message. The first filter whose
    process() returns a true value claims the message, and the remaining
    filters are not consulted for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox stored at path `mbox` and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: apply_filters() stops at the first filter that
        # claims a message, so CatchAllFilter (which claims everything)
        # has to stay last.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initializes every filter and records when processing started."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        # time.time() is wall-clock time on every platform. time.clock() was
        # CPU time on Unix (so I/O was not counted) and no longer exists in
        # recent Python versions.
        self.start_time = time.time()

    def apply_filters(self, message):
        """Runs `message` through the filters until one of them claims it.

        Returns True if a filter claimed the message, False otherwise."""
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Prints the global summary, then lets each filter report and clean up."""
        duration = time.time() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Processes the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock, even if a filter raised
            self.mbox.unlock()
            self.mbox.close()
81
82 #----------------------------------------------------------------------------#
83
class MboxFilter:
    """Base interface shared by all filters; every hook is a no-op here."""

    def initialize(self, mbox_file):
        """Hook run by the processor once, before any message is handled.

        `mbox_file` is the path of the mbox being processed.
        Subclasses open the descriptors they need during processing here."""
        return None

    def process(self, message):
        """Hook run by the processor for each message reaching this filter.

        Return a true value to stop processing the message, or a false
        value to hand it over to the next filter."""
        return None

    def finalize(self):
        """Hook run by the processor once, after the last message.

        Subclasses display their results and close their descriptors here."""
        return None
104
105 #----------------------------------------------------------------------------#
106
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    RFC 2047 encoded words are decoded. Returns None when the message has
    no Subject header or when the header cannot be decoded."""
    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    # The text type is `unicode` on Python 2; fall back to `str` where the
    # `unicode` builtin does not exist.
    try:
        to_text = unicode
    except NameError:
        to_text = str
    try:
        return to_text(email.header.make_header(email.header.decode_header(raw_subject)))
    except Exception:
        # Best effort: an undecodable subject is treated as a missing one.
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate.
        return None
115
# Matches the value of a Final-Recipient field and captures the address
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _subparts(message):
    """Returns the list of sub-messages of `message`, or [] if it has none."""
    payload = message.get_payload()
    # A malformed multipart exposes its payload as a string, not as a list
    if isinstance(payload, list):
        return payload
    return []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    `bounce` is an email.Message. It is expected to be a multipart/report
    whose second part is a message/delivery-status made of a per-message
    block followed by at least one per-recipient block; only the first
    per-recipient block is examined.

    See RFC 1894 for more information.
    Returns the email address, or None (after printing the reason) when the
    message is not a well-formed report of a permanent delivery failure."""
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    parts = _subparts(bounce)
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status = parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    blocks = _subparts(status)
    if len(blocks) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(blocks))
        return None
    content = blocks[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address; the field itself may be absent
    recipient_match = _recipient_re.search(content['Final-Recipient'] or '')
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()
    # Check the action field; its value is case-insensitive
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    if not postfix_error:
        status_code = content['Status']
        # A missing or malformed status is reported, not raised
        if status_code is None or not status_code.strip().startswith('5'):
            print('! Not a permanent failure status (%s).' % status_code)
            return None
    return address
163
164 #----------------------------------------------------------------------------#
165
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header.

    Those recognized as bounces are saved in '<mbox>.bounced' and their
    faulty addresses are listed during finalization."""

    def initialize(self, mbox_file):
        """Resets the counters and empties the '<mbox>.bounced' mailbox."""
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []
        self.seen = 0

    def process(self, message):
        """Claims confirmed direct bounces and dropped mailman notifications.

        Returns True when the message was handled here, False otherwise."""
        # Messages that went through the antispam are not direct bounces
        if message['X-Spam-Flag'] is not None:
            return False

        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']

        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True

        # Additional checks, just to be sure
        is_usual_bounce = (sender == 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
                           and message['Subject'] == 'Undelivered Mail Returned to Sender')
        if not is_usual_bounce:
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False

        address = findAddressInBounce(message)
        if address is None:
            print('! No email found in direct bounce, this is really bad.')
            return False

        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the bounced addresses, then closes the mailbox."""
        confirmed = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % confirmed)
        if confirmed != self.seen:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
212
213 #----------------------------------------------------------------------------#
214
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to '<mbox>.spam'."""

    def initialize(self, mbox_file):
        """Opens and empties the '<mbox>.spam' mailbox."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header marks it as spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of spams found and closes the mailbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
234
235 #----------------------------------------------------------------------------#
236
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter could not classify to '<mbox>.unsure'."""

    def initialize(self, mbox_file):
        """Opens and empties the '<mbox>.unsure' mailbox."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claims the message when its X-Spam-Flag header marks it as unsure."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the number of unclassified messages and closes the mailbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
256
257 #----------------------------------------------------------------------------#
258
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages reaching it that are not flagged as non-spam.

    This filter never claims a message; it only keeps a counter."""

    def initialize(self, mbox_file):
        """Resets the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Counts the message unless bogofilter flagged it as non-spam.

        Always returns False so that the next filters see the message."""
        flag = message['X-Spam-Flag']
        flagged_as_ham = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_ham:
            self.seen += 1
        return False

    def finalize(self):
        """Reports whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
276
277 #----------------------------------------------------------------------------#
278
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies to '<mbox>.ooo'.

    Detection is a heuristic based on the decoded subject of the message."""

    def initialize(self, mbox_file):
        """Opens and empties the '<mbox>.ooo' mailbox and compiles the patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # Every entry needs its own trailing comma: adjacent string literals
        # are silently concatenated into a single pattern otherwise.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r'automatique d\'absence',
            r'Notification d\'absence',
            # unicode! Escaped so that the pattern does not depend on the
            # encoding this file is stored or displayed in.
            u'R\xe9ponse automatique :',
            r'AutoReply',
        ]
        # A real list (not a lazy iterator) because it is scanned for every message
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Claims the message when its subject matches one of the patterns."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Prints the number of auto-replies found and closes the mailbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
310
311 #----------------------------------------------------------------------------#
312
class DeliveryStatusNotificationFilter(MboxFilter):
    """Moves multipart/report bounces with a usable address to '<mbox>.dsn'."""

    def initialize(self, mbox_file):
        """Resets the address list and empties the '<mbox>.dsn' mailbox."""
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []

    def process(self, message):
        """Claims the message when a faulty address can be extracted from it."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Prints the statistics and the bounced addresses, then closes the mailbox."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
340
341 #----------------------------------------------------------------------------#
342
class CatchAllFilter(MboxFilter):
    """Saves every message that reaches it in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Opens and empties the '<mbox>.catchall' mailbox."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Stores the message unconditionally and always returns True."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Reports the leftovers; removes the mailbox file when it is empty."""
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught == 0:
            # Nothing was stored: do not leave an empty file behind
            os.unlink(self.mbox_file)
364
365 #----------------------------------------------------------------------------#
366
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    if len(sys.argv) != 2:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = sys.argv[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()