Commit | Line | Data |
---|---|---|
58e64caf AA |
1 | #!/usr/bin/env python2.5 |
2 | # -*- coding: utf-8 -*- | |
3 | #*************************************************************************** | |
9f5bd98e | 4 | #* Copyright (C) 2003-2010 Polytechnique.org * |
58e64caf AA |
5 | #* http://opensource.polytechnique.org/ * |
6 | #* * | |
7 | #* This program is free software; you can redistribute it and/or modify * | |
8 | #* it under the terms of the GNU General Public License as published by * | |
9 | #* the Free Software Foundation; either version 2 of the License, or * | |
10 | #* (at your option) any later version. * | |
11 | #* * | |
12 | #* This program is distributed in the hope that it will be useful, * | |
13 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * | |
14 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | |
15 | #* GNU General Public License for more details. * | |
16 | #* * | |
17 | #* You should have received a copy of the GNU General Public License * | |
18 | #* along with this program; if not, write to the Free Software * | |
19 | #* Foundation, Inc., * | |
20 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * | |
21 | #*************************************************************************** | |
22 | ||
58e64caf AA |
23 | """ |
24 | Process as automatically as possible bounces from the newsletter | |
25 | ||
26 | The goal is to extract the email addresses that actually bounced. | |
27 | Bounces conforming to RFC 1894 will be automatically processed. | |
28 | ||
29 | This script uses the X-Spam-Flag header to remove spam and heuristics | |
30 | to detect out-of-office auto-replies and delivery status notifications. | |
31 | ||
32 | All emails are saved in different mailboxes to make human post-processing easier. | |
33 | """ | |
34 | ||
35 | import email, mailbox, os, re, sys, time | |
36 | ||
37 | #----------------------------------------------------------------------------# | |
38 | ||
class MboxProcessor:
    """Runs every message of a mbox through an ordered chain of filters.

    The filters are tried in list order; the first one whose process()
    returns a true value ends the chain for that message.
    """

    def __init__(self, mbox):
        """Opens the mbox located at the given path and builds the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter(),
        ]

    def initialize_filters(self):
        """Prepares each filter, then records the start time of the run."""
        for current in self.filters:
            current.initialize(self.mbox_file)
        self.start_time = time.clock()

    def apply_filters(self, message):
        """Returns True as soon as one filter accepts the message, False otherwise."""
        for current in self.filters:
            if current.process(message):
                return True
        return False

    def finalize_filters(self):
        """Prints the global summary, then lets each filter print its own report."""
        elapsed = time.clock() - self.start_time
        rule = 80 * '-'
        print(rule)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, elapsed))
        print(rule)
        for current in self.filters:
            current.finalize()
            print(rule)

    def run(self):
        """Processes the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock and close the mbox, even on error
            self.mbox.unlock()
            self.mbox.close()
81 | ||
82 | #----------------------------------------------------------------------------# | |
83 | ||
class MboxFilter:
    """Interface shared by all the filters of the processing chain.

    Every hook is a no-op returning None; subclasses override what they need.
    """

    def initialize(self, mbox_file):
        """Hook invoked by the processor before processing starts.

        Descriptors required during processing should be opened here."""

    def process(self, message):
        """Hook invoked by the processor for each message reaching this step.

        A true return value stops the processing of the message, a false one
        hands it over to the next filter."""

    def finalize(self):
        """Hook invoked by the processor after processing ends.

        Results should be displayed and all descriptors closed here."""
104 | ||
105 | #----------------------------------------------------------------------------# | |
106 | ||
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    The Subject header is decoded with email.header (RFC 2047 encoded words).
    Returns None when the message has no Subject header or when the header
    cannot be decoded."""
    # Imported explicitly: a plain 'import email' does not by itself
    # guarantee that the email.header submodule is loaded.
    import email.header
    raw_subject = message['Subject']
    if raw_subject is None:
        return None
    try:
        return unicode(email.header.make_header(email.header.decode_header(raw_subject)))
    except Exception:
        # Best effort: an undecodable subject is treated as a missing one.
        # Unlike a bare 'except:', this lets KeyboardInterrupt and SystemExit
        # propagate.
        return None
115 | ||
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)

def _countParts(message):
    """Returns the number of sub-parts of a message, 0 if it has none."""
    # A malformed message can declare a multipart type and still carry a
    # plain string payload; indexing such a payload would raise.
    if not message.is_multipart():
        return 0
    return len(message.get_payload())

def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.
    The bounce must be a multipart/report whose second part is a
    message/delivery-status, whose second block holds the per-recipient
    fields. The address is returned only when the action is 'failed' and
    either the diagnostic code starts with X-Postfix or the status class is 5.
    Each rejection prints a line starting with '!' explaining the reason.
    Returns None or the email address."""
    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report
    num_payloads = _countParts(bounce)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    status = bounce.get_payload(1)
    if status.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % status.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = _countParts(status)
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = status.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address; the field may be absent altogether
    final_recipient = content['Final-Recipient']
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field (its value may use any combination of cases)
    action = content['Action']
    if action is None or action.strip().lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = content['Diagnostic-Code']
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = content['Status']
    if not postfix_error and (status_code is None or not status_code.startswith('5')):
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
163 | ||
164 | #----------------------------------------------------------------------------# | |
165 | ||
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header.

    Confirmed bounces are stored in '<mbox>.bounced' and their faulty
    addresses are listed during finalization."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []
        self.seen = 0

    def process(self, message):
        # Only messages without any X-Spam-Flag header are handled here
        if message['X-Spam-Flag'] is not None:
            return False
        # During finalization, we will verify that all messages were processed
        self.seen += 1
        sender = message['From']
        # Special case: ignore mailman notifications for the mailing-list
        # on which the NL is forwarded
        if sender == 'polytechnique.org_newsletter-externes-bounces@listes.polytechnique.org':
            print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
            self.seen -= 1
            return True
        # Additional checks, just to be sure
        if sender != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
                or message['Subject'] != 'Undelivered Mail Returned to Sender':
            print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            return False
        address = findAddressInBounce(message)
        if address is None:
            print('! No email found in direct bounce, this is really bad.')
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        confirmed = len(self.mbox)
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % confirmed)
        if confirmed != self.seen:
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
212 | ||
213 | #----------------------------------------------------------------------------# | |
214 | ||
class SpamFilter(MboxFilter):
    """Moves the messages flagged as spam by bogofilter to '<mbox>.spam'."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.spam' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Yes, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        spam_count = len(self.mbox)
        print('Found %d spams. This is reliable.' % spam_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You might check the contents of this mbox.')
        self.mbox.close()
234 | ||
235 | #----------------------------------------------------------------------------# | |
236 | ||
class UnsureFilter(MboxFilter):
    """Moves the messages bogofilter could not classify to '<mbox>.unsure'."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.unsure' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        flag = message['X-Spam-Flag']
        if flag is None or not flag.startswith('Unsure, tests=bogofilter'):
            return False
        self.mbox.add(message)
        return True

    def finalize(self):
        unsure_count = len(self.mbox)
        print('Found %d unclassified messages. Most of them should be spams.' % unsure_count)
        print('They were saved in %s.' % self.mbox_file)
        print('You must check the contents of this mbox and feed the antispam.')
        self.mbox.close()
256 | ||
257 | #----------------------------------------------------------------------------# | |
258 | ||
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag is missing or not a bogofilter 'No'.

    This filter never stops the chain: it only gathers a sanity statistic."""

    def initialize(self, mbox_file):
        self.seen = 0

    def process(self, message):
        flag = message['X-Spam-Flag']
        is_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not is_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
        else:
            print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
            print('Please investigate.')
276 | ||
277 | #----------------------------------------------------------------------------# | |
278 | ||
class OutOfOfficeFilter(MboxFilter):
    """Moves out-of-office auto-replies to '<mbox>.ooo'.

    A message is taken when its decoded subject matches one of the
    case-insensitive patterns compiled in initialize()."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.ooo' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        # One pattern per list item. Every item needs its trailing comma:
        # adjacent string literals are silently concatenated otherwise,
        # which merges two patterns into one that matches neither subject.
        subject_re = [
            r'^Absen(t|ce)',
            r'(est|is) absent',
            r'^Out of (the )?office',
            r'is out of (the )?office',
            r'I am out of town',
            r'automatique d\'absence',
            r'Notification d\'absence',
            u'R\xe9ponse automatique :', # unicode! (\xe9 is e with acute accent)
            r'AutoReply',
        ]
        self.subject_regexes = [re.compile(pattern, re.I | re.U) for pattern in subject_re]

    def process(self, message):
        subject = findSubject(message)
        if subject is not None and any(regex.search(subject) for regex in self.subject_regexes):
            self.mbox.add(message)
            return True
        return False

    def finalize(self):
        print('Found %d "out of office". This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('You may check the contents of this mbox.')
        self.mbox.close()
310 | ||
311 | #----------------------------------------------------------------------------# | |
312 | ||
class DeliveryStatusNotificationFilter(MboxFilter):
    """Moves multipart/report bounces with a usable address to '<mbox>.dsn'.

    The faulty addresses are collected and listed during finalization."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.dsn' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()
        self.emails = []

    def process(self, message):
        if message.get_content_type() != 'multipart/report':
            return False
        address = findAddressInBounce(message)
        if address is None:
            return False
        self.emails.append(address)
        self.mbox.add(message)
        return True

    def finalize(self):
        print('Found %d delivery status notifications. This is generally reliable.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
340 | ||
341 | #----------------------------------------------------------------------------# | |
342 | ||
class CatchAllFilter(MboxFilter):
    """Stores every message that reaches it in '<mbox>.catchall'.

    The catchall mbox is deleted during finalization when it is empty."""

    def initialize(self, mbox_file):
        self.mbox_file = '%s.catchall' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        # Unconditionally takes the message
        self.mbox.add(message)
        return True

    def finalize(self):
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught <= 0:
            # Nothing was caught: do not leave an empty mbox behind
            os.unlink(self.mbox_file)
364 | ||
365 | #----------------------------------------------------------------------------# | |
366 | ||
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print('Usage: %s mbox' % sys.argv[0])
        sys.exit(1)

    mbox_path = arguments[0]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()