Merge branch 'platal-0.10.0'
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2009 Polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 # Copyright (c) 2008 Aymeric Augustin
24
25 """
26 Process as automatically as possible bounces from the newsletter
27
28 The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
30
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
33
34 All emails are saved in different mailboxes to make human post-processing easier.
35 """
36
37 import email, mailbox, os, re, sys, time
38
39 #----------------------------------------------------------------------------#
40
class MboxProcessor:
    """Runs every message of a mbox through an ordered chain of filters.

    Filters are tried in list order; the first one whose process() returns
    a true value ends the chain for that message.
    """

    def __init__(self, mbox):
        """Open the mbox located at path `mbox` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(mbox)
        chain = []
        chain.append(DirectBouncesFilter())
        chain.append(SpamFilter())
        chain.append(UnsureFilter())
        chain.append(CheckNonSpamFilter())
        chain.append(OutOfOfficeFilter())
        chain.append(DeliveryStatusNotificationFilter())
        # Must stay last: it accepts every message it is given.
        chain.append(CatchAllFilter())
        self.filters = chain

    def initialize_filters(self):
        """Initialize each filter, then record the processing start time."""
        for current in self.filters:
            current.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Offer `message` to each filter in turn.

        Returns True as soon as one filter handles the message, and False
        when none of them did.
        """
        for current in self.filters:
            if current.process(message):
                return True
        return False

    def finalize_filters(self):
        """Print the global summary, then let each filter report and clean up."""
        elapsed = time.clock() - self.start_time
        rule = '-' * 80
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for current in self.filters:
            current.finalize()
            print(rule)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Release and close the source mbox even if a filter failed.
            self.mbox.unlock()
            self.mbox.close()
83
84 #----------------------------------------------------------------------------#
85
class MboxFilter:
    """Interface implemented by every filter of the processing chain.

    The default implementations do nothing and return None.
    """

    def initialize(self, mbox_file):
        """Hook called by the processor before any message is processed.

        `mbox_file` is the path of the mbox being processed. Subclasses
        open here whatever descriptors they need during processing.
        """
        return None

    def process(self, message):
        """Hook called by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        hands the message over to the next filter.
        """
        return None

    def finalize(self):
        """Hook called by the processor once all messages were processed.

        Subclasses display their results and close their descriptors here.
        """
        return None
106
107 #----------------------------------------------------------------------------#
108
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    RFC 2047 encoded words in the header are decoded. Returns None when
    the message has no Subject header or when the header cannot be decoded
    (decoding is best-effort: a broken subject must not abort processing).
    """
    # Import explicitly instead of relying on another module having loaded
    # the email.header submodule as a side effect.
    from email.header import decode_header, make_header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    # Resolve the text type outside the try block so that a missing
    # `unicode` builtin is not mistaken for an undecodable subject.
    try:
        to_text = unicode
    except NameError:
        to_text = str
    try:
        return to_text(make_header(decode_header(raw_subject)))
    except Exception:
        # Malformed encoded words, unknown charsets, undecodable bytes...
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        return None
117
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _bounce_parts(message):
    """Returns the sub-parts of a message as a list, or [] if it has none."""
    if message.is_multipart():
        return message.get_payload()
    return []


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    `bounce` is an email.Message. The address is taken from the
    Final-Recipient field of the first per-recipient block of the
    message/delivery-status part. It is only returned when the Action is
    "failed" and either the Status class is 5 (permanent failure) or the
    Diagnostic-Code starts with "X-Postfix".

    Returns None or the email address. A diagnostic line is printed
    whenever None is returned; malformed or incomplete bounces never raise.
    """
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    parts = _bounce_parts(bounce)
    if len(parts) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(parts))
        return None
    status = parts[1]
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    fields = _bounce_parts(status)
    if len(fields) < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % len(fields))
        return None
    content = fields[1]
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field (its value is case-insensitive)
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    permanent_failure = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent_failure:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
165
166 #----------------------------------------------------------------------------#
167
class DirectBouncesFilter(MboxFilter):
    """Handles direct bounces, i.e. messages carrying no X-Spam-Flag header.

    Confirmed bounces are saved in '<mbox>.bounced' and the faulty
    addresses are listed during finalization.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open an emptied '<mbox_file>.bounced' mbox."""
        self.seen = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save `message` if it is a confirmed direct bounce.

        Returns True when the message was saved or deliberately dropped
        (mailman notification), False to pass it on to the next filter.
        """
        if message['X-Spam-Flag'] is not None:
            return False
        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True
        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (sender, message['Subject']))
            return False
        address = findAddressInBounce(message)
        if address is None:
            print('! No email found in direct bounce, this is really bad.')
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the statistics and the bounced addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        if self.seen != len(self.mbox):
            # Backslashes are escaped explicitly: '\ ' is not a valid escape.
            print(' /!\\ These numbers should be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
214
215 #----------------------------------------------------------------------------#
216
class SpamFilter(MboxFilter):
    """Saves in '<mbox>.spam' the messages whose X-Spam-Flag header starts
    with 'Yes, tests=bogofilter'."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.spam' mbox."""
        spam_path = '%s.spam' % mbox_file
        self.mbox_file = spam_path
        self.mbox = mailbox.mbox(spam_path)
        self.mbox.clear()

    def process(self, message):
        """Save `message` and return True if it is flagged as spam."""
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the number of spams found, then close the mbox."""
        report = (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        )
        for line in report:
            print(line)
        self.mbox.close()
236
237 #----------------------------------------------------------------------------#
238
class UnsureFilter(MboxFilter):
    """Saves in '<mbox>.unsure' the messages whose X-Spam-Flag header starts
    with 'Unsure, tests=bogofilter'."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.unsure' mbox."""
        unsure_path = '%s.unsure' % mbox_file
        self.mbox_file = unsure_path
        self.mbox = mailbox.mbox(unsure_path)
        self.mbox.clear()

    def process(self, message):
        """Save `message` and return True if the antispam was unsure about it."""
        flag = message['X-Spam-Flag']
        unsure = flag is not None and flag.startswith('Unsure, tests=bogofilter')
        if unsure:
            self.mbox.add(message)
        return unsure

    def finalize(self):
        """Print the number of unclassified messages, then close the mbox."""
        report = (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        )
        for line in report:
            print(line)
        self.mbox.close()
258
259 #----------------------------------------------------------------------------#
260
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag header is missing or does not
    start with 'No, tests=bogofilter'. It never stops the chain."""

    def initialize(self, mbox_file):
        """Reset the counter; `mbox_file` is not used by this filter."""
        self.seen = 0

    def process(self, message):
        """Count `message` if it is not flagged as non-spam; always return False."""
        flag = message['X-Spam-Flag']
        flagged_as_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not flagged_as_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Report whether any message had an unexpected spam classification."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
278
279 #----------------------------------------------------------------------------#
280
class OutOfOfficeFilter(MboxFilter):
    """Saves in '<mbox>.ooo' the messages whose subject looks like an
    out-of-office auto-reply."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.ooo' mbox and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # One pattern per item: every item needs its trailing comma, since
        # adjacent string literals are silently concatenated into one pattern.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r"automatique d'absence",
            r"Notification d'absence",
            u'R\xe9ponse automatique :',  # unicode! (\xe9 is e-acute)
            r'AutoReply',
        ]
        # A real list (not a lazy map object) because it is scanned again
        # for every message.
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Save `message` and return True if its subject matches a pattern."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print the number of auto-replies found, then close the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
312
313 #----------------------------------------------------------------------------#
314
class DeliveryStatusNotificationFilter(MboxFilter):
    """Saves in '<mbox>.dsn' the multipart/report messages from which a
    bounced address could be extracted."""

    def initialize(self, mbox_file):
        """Reset the address list and open an emptied '<mbox_file>.dsn' mbox."""
        self.emails = []
        dsn_path = '%s.dsn' % mbox_file
        self.mbox_file = dsn_path
        self.mbox = mailbox.mbox(dsn_path)
        self.mbox.clear()

    def process(self, message):
        """Save `message` and return True if it is a report naming a faulty address."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the statistics and the bounced addresses, then close the mbox."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
342
343 #----------------------------------------------------------------------------#
344
class CatchAllFilter(MboxFilter):
    """Saves in '<mbox>.catchall' every message that reaches it."""

    def initialize(self, mbox_file):
        """Open an emptied '<mbox_file>.catchall' mbox."""
        catchall_path = '%s.catchall' % mbox_file
        self.mbox_file = catchall_path
        self.mbox = mailbox.mbox(catchall_path)
        self.mbox.clear()

    def process(self, message):
        """Save `message` unconditionally and stop the chain."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report the leftovers and close the mbox; delete its file if empty."""
        leftovers = len(self.mbox)
        if leftovers > 0:
            print('%d messages reached the catchall.' % leftovers)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if not leftovers > 0:
            # Nothing was caught: do not leave an empty file behind.
            os.unlink(self.mbox_file)
366
367 #----------------------------------------------------------------------------#
368
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()