Merge commit 'origin/master' into fusionax
[platal.git] / bin / newsletter.bounces.processor.py
1 #!/usr/bin/env python2.5
2 # -*- coding: utf-8 -*-
3 #***************************************************************************
4 #* Copyright (C) 2004-2008 polytechnique.org *
5 #* http://opensource.polytechnique.org/ *
6 #* *
7 #* This program is free software; you can redistribute it and/or modify *
8 #* it under the terms of the GNU General Public License as published by *
9 #* the Free Software Foundation; either version 2 of the License, or *
10 #* (at your option) any later version. *
11 #* *
12 #* This program is distributed in the hope that it will be useful, *
13 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 #* GNU General Public License for more details. *
16 #* *
17 #* You should have received a copy of the GNU General Public License *
18 #* along with this program; if not, write to the Free Software *
19 #* Foundation, Inc., *
20 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
21 #***************************************************************************
22
23 # Copyright (c) 2008 Aymeric Augustin
24
25 """
26 Process as automatically as possible bounces from the newsletter
27
28 The goal is to extract the email addresses that actually bounced.
29 Bounces conforming to RFC 1894 will be automatically processed.
30
31 This script uses the X-Spam-Flag header to remove spam and heuristics
32 to detect out-of-office auto-replies and delivery status notifications.
33
34 All emails are saved in different mailboxes to make human post-processing easier.
35 """
36
37 import email, mailbox, os, re, sys, time
38
39 #----------------------------------------------------------------------------#
40
class MboxProcessor:
    """Runs every message of a mbox through an ordered chain of filters."""

    def __init__(self, mbox):
        """Open the mbox located at path `mbox` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        # Order matters: apply_filters() stops at the first filter whose
        # process() returns a true value.
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Initialize each filter with the mbox path, then record the start time."""
        source = self.mbox_file
        for current in self.filters:
            current.initialize(source)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Offer `message` to each filter in turn.

        Returns True as soon as one filter's process() returns a true value
        (the remaining filters are not called), and False if none does."""
        for current in self.filters:
            if current.process(message):
                return True
        return False

    def finalize_filters(self):
        """Print the processing summary, then let each filter print its report."""
        elapsed = time.clock() - self.start_time
        rule = 80 * '-'
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for current in self.filters:
            current.finalize()
            print(rule)

    def run(self):
        """Lock the mbox, process all its messages, then unlock and close it."""
        box = self.mbox
        box.lock()
        try:
            self.initialize_filters()
            for message in box:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Release the mbox even if a filter raised.
            box.unlock()
            box.close()
83
84 #----------------------------------------------------------------------------#
85
class MboxFilter:
    """Base interface implemented by every filter of the processing chain.

    All three hooks do nothing here; concrete filters override them."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor once, before any message is processed.

        `mbox_file` is the path of the mbox being processed. Filters open
        here whatever descriptors they need during processing."""
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the chain for this message; a false one
        hands the message over to the next filter."""
        return None

    def finalize(self):
        """Hook invoked by the processor once, after the last message.

        Filters display their results and close their descriptors here."""
        return None
106
107 #----------------------------------------------------------------------------#
108
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    The Subject header is decoded with email.header (RFC 2047 encoded words).
    Returns None when the message has no Subject header or when the header
    cannot be decoded: decoding is deliberately best-effort so that one
    malformed subject does not abort the processing of the whole mbox."""
    # Make sure the submodule is loaded instead of relying on another module
    # having imported it as a side effect.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    # `unicode` is the text type of Python 2; fall back to `str` where it
    # does not exist.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    try:
        decoded = email.header.decode_header(raw_subject)
        return text_type(email.header.make_header(decoded))
    except Exception:
        # Unlike a bare `except:`, this lets KeyboardInterrupt and SystemExit
        # propagate while still ignoring undecodable subjects.
        return None
117
# Matches the value of a Final-Recipient field of address type rfc822 and
# captures the address itself.
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)

def _countParts(message):
    """Returns the number of sub-parts of a message, or 0 if it is not multipart."""
    if message.is_multipart():
        return len(message.get_payload())
    return 0

def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.

    `bounce` is an email.Message. It is accepted only if it is a
    multipart/report whose second part is a message/delivery-status holding
    at least two field groups, the second of which (the first per-recipient
    group) reports a failed action with either a permanent (5.x.x) status or
    an X-Postfix diagnostic code.

    Returns the email address, or None when the message is rejected; in that
    case a line starting with '!' explaining the reason is printed."""
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = _countParts(bounce)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = _countParts(status)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address; the field may be absent altogether
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1).strip()
    # Check the action field (its value may be spelled in any letter case)
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    # A missing or malformed Status field is treated as "not permanent"
    permanent = status_code is not None and status_code.strip().startswith('5')
    if not postfix_error and not permanent:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
165
166 #----------------------------------------------------------------------------#
167
class DirectBouncesFilter(MboxFilter):
    """Collects the bounces that carry no X-Spam-Flag header.

    Confirmed bounces are copied to '<mbox>.bounced' and their faulty
    addresses are listed during finalization."""

    def initialize(self, mbox_file):
        """Reset the counters and start from an empty '<mbox>.bounced'."""
        self.seen = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` if it has no X-Spam-Flag and is a confirmed bounce.

        Returns True when the message was saved, False otherwise."""
        if message['X-Spam-Flag'] is not None:
            return False
        # Counted so that finalize() can verify every such message was saved.
        self.seen += 1
        # Additional checks, just to be sure
        expected_from = 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)'
        expected_subject = 'Undelivered Mail Returned to Sender'
        if not (message['From'] == expected_from and message['Subject'] == expected_subject):
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the counts and the list of bounced addresses, then close the mbox."""
        saved = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % saved)
        if self.seen != saved:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
205
206 #----------------------------------------------------------------------------#
207
class SpamFilter(MboxFilter):
    """Saves in '<mbox>.spam' the messages that bogofilter flagged as spam."""

    def initialize(self, mbox_file):
        """Start from an empty '<mbox>.spam'."""
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` if its X-Spam-Flag starts with 'Yes, tests=bogofilter'.

        Returns True when the message was saved, False otherwise."""
        flag = message['X-Spam-Flag']
        # The header may be missing: a message without X-Spam-Flag that is not
        # a confirmed bounce still reaches this filter. Such a message is not
        # spam for our purposes and must not crash the processing.
        if flag is not None and flag.startswith('Yes, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print the number of spams found, then close the mbox."""
        print('Found %d spams. This is reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
226
227 #----------------------------------------------------------------------------#
228
class UnsureFilter(MboxFilter):
    """Saves in '<mbox>.unsure' the messages that bogofilter could not classify."""

    def initialize(self, mbox_file):
        """Start from an empty '<mbox>.unsure'."""
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` if its X-Spam-Flag starts with 'Unsure, tests=bogofilter'.

        Returns True when the message was saved, False otherwise."""
        flag = message['X-Spam-Flag']
        # The header may be missing altogether; treat that as "not unsure"
        # instead of failing on None.
        if flag is not None and flag.startswith('Unsure, tests=bogofilter'):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print the number of unclassified messages, then close the mbox."""
        print('Found %d unclassified messages. Most of them should be spams.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
247
248 #----------------------------------------------------------------------------#
249
class CheckNonSpamFilter(MboxFilter):
    """Sanity check: counts messages whose X-Spam-Flag is not a 'No' verdict.

    This filter never claims a message; it only reports, during
    finalization, how many messages reached it without an X-Spam-Flag
    starting with 'No, tests=bogofilter'."""

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count `message` if it is not flagged as non-spam. Always returns False."""
        flag = message['X-Spam-Flag']
        # A missing header is unexpected too: count it rather than fail on None.
        if flag is None or not flag.startswith('No, tests=bogofilter'):
            self.seen += 1
        return False

    def finalize(self):
        """Print whether any unexpected message was encountered."""
        if self.seen > 0:
            # The counter is `self.seen`; there is no `self.counter` attribute.
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
        else:
            print('All messages were either spam, or unsure, or non-spams. Good.')
266
267 #----------------------------------------------------------------------------#
268
class OutOfOfficeFilter(MboxFilter):
    """Saves in '<mbox>.ooo' the messages whose subject looks like an auto-reply."""

    def initialize(self, mbox_file):
        """Start from an empty '<mbox>.ooo' and compile the subject patterns."""
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # One pattern per entry: every entry needs its trailing comma,
        # otherwise adjacent string literals are silently concatenated into a
        # single pattern that matches neither subject.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r'automatique d\'absence',
            r'Notification d\'absence',
            # unicode! Written with an escape for e-acute so that the pattern
            # does not depend on the encoding of this file.
            u'R\xe9ponse automatique :',
            r'AutoReply',
        ]
        # Built as a list: the patterns are scanned again for every message.
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        """Claim `message` if its decoded subject matches one of the patterns.

        Returns True when the message was saved, False otherwise (including
        when the subject is missing or cannot be decoded)."""
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        """Print the number of auto-replies found, then close the mbox."""
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
300
301 #----------------------------------------------------------------------------#
302
class DeliveryStatusNotificationFilter(MboxFilter):
    """Collects the multipart/report messages that yield a bounced address.

    Matching messages are copied to '<mbox>.dsn' and their faulty addresses
    are listed during finalization."""

    def initialize(self, mbox_file):
        """Reset the address list and start from an empty '<mbox>.dsn'."""
        self.emails = []
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Claim `message` if it is a multipart/report with an extractable address.

        Returns True when the message was saved, False otherwise."""
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the count and the list of bounced addresses, then close the mbox."""
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
330
331 #----------------------------------------------------------------------------#
332
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: saves every message it receives in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Start from an empty '<mbox>.catchall'."""
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Save `message` unconditionally. Always returns True."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Report what was caught, close the mbox, and delete its file if empty."""
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if not caught:
            # Nothing was saved: do not leave an empty file behind.
            os.unlink(self.mbox_file)
354
355 #----------------------------------------------------------------------------#
356
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process.
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = arguments[0]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    MboxProcessor(mbox_path).run()