Commit | Line | Data |
---|---|---|
6208fd26 | 1 | #!/usr/bin/env python |
58e64caf AA |
2 | # -*- coding: utf-8 -*- |
3 | #*************************************************************************** | |
ba6ae046 | 4 | #* Copyright (C) 2003-2013 Polytechnique.org * |
58e64caf AA |
5 | #* http://opensource.polytechnique.org/ * |
6 | #* * | |
7 | #* This program is free software; you can redistribute it and/or modify * | |
8 | #* it under the terms of the GNU General Public License as published by * | |
9 | #* the Free Software Foundation; either version 2 of the License, or * | |
10 | #* (at your option) any later version. * | |
11 | #* * | |
12 | #* This program is distributed in the hope that it will be useful, * | |
13 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * | |
14 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | |
15 | #* GNU General Public License for more details. * | |
16 | #* * | |
17 | #* You should have received a copy of the GNU General Public License * | |
18 | #* along with this program; if not, write to the Free Software * | |
19 | #* Foundation, Inc., * | |
20 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * | |
21 | #*************************************************************************** | |
22 | ||
58e64caf AA |
23 | """ |
24 | Process as automatically as possible bounces from the newsletter | |
25 | ||
26 | The goal is to extract the email addresses that actually bounced. | |
27 | Bounces conforming to RFC 1894 will be automatically processed. | |
28 | ||
29 | This script uses the X-Spam-Flag header to remove spam and heuristics | |
30 | to detect out-of-office auto-replies and delivery status notifications. | |
31 | ||
32 | All emails are saved in different mailboxes to make human post-processing easier. | |
33 | """ | |
34 | ||
6208fd26 NI |
35 | import email |
36 | import mailbox | |
37 | import os | |
38 | import re | |
39 | import sys | |
40 | import time | |
58e64caf AA |
41 | |
42 | #----------------------------------------------------------------------------# | |
43 | ||
class MboxProcessor:
    """Applies a series of filters to each message in a mbox.

    The filters are tried in the order of ``self.filters``; for a given
    message the chain stops at the first filter whose ``process`` method
    returns a true value.
    """

    def __init__(self, mbox):
        """Open the mbox stored at path `mbox` and build the filter chain."""
        self.mbox_file = mbox
        self.mbox = mailbox.mbox(self.mbox_file)
        self.filters = [
            DirectBouncesFilter(),
            SpamFilter(),
            UnsureFilter(),
            CheckNonSpamFilter(),
            OutOfOfficeFilter(),
            DeliveryStatusNotificationFilter(),
            CatchAllFilter()
        ]

    @staticmethod
    def _timer():
        """Return the current value of a clock suitable for measuring durations.

        time.clock() was removed in Python 3.8; time.perf_counter() replaces
        it where available, and time.clock() is only used as a fallback on
        interpreters that lack perf_counter (Python 2).
        """
        clock = getattr(time, 'perf_counter', None) or time.clock
        return clock()

    def initialize_filters(self):
        """Initialize every filter, then record the processing start time."""
        for f in self.filters:
            f.initialize(self.mbox_file)
        self.start_time = self._timer()

    def apply_filters(self, message):
        """Run `message` through the chain.

        Returns True as soon as one filter accepts the message (later filters
        are not called), False if no filter did.
        """
        return any(f.process(message) for f in self.filters)

    def finalize_filters(self):
        """Print the elapsed time, then let each filter print its report."""
        duration = self._timer() - self.start_time
        separator = '-' * 80
        print(separator)
        print('Processed the %d messages of %s in %.2fs' % (len(self.mbox), self.mbox_file, duration))
        print(separator)
        for f in self.filters:
            f.finalize()
            print(separator)

    def run(self):
        """Process the whole mbox while holding its lock."""
        self.mbox.lock()
        try:
            self.initialize_filters()
            for message in self.mbox:
                self.apply_filters(message)
            self.finalize_filters()
        finally:
            # Always release the lock and close the mbox, even on error
            self.mbox.unlock()
            self.mbox.close()
86 | ||
87 | #----------------------------------------------------------------------------# | |
88 | ||
class MboxFilter:
    """Interface shared by all filters; every hook is a no-op by default."""

    def initialize(self, mbox_file):
        """Hook invoked by the processor once, before any message is handled.

        Filters open here the descriptors they need during processing.
        """
        return None

    def process(self, message):
        """Hook invoked by the processor for each message reaching this filter.

        A true return value stops the processing of the message; a false one
        passes the message on to the next filter.
        """
        return None

    def finalize(self):
        """Hook invoked by the processor once all messages have been handled.

        Filters display their results and close their descriptors here.
        """
        return None
109 | ||
110 | #----------------------------------------------------------------------------# | |
111 | ||
def findSubject(message):
    """Returns the subject of an email.Message as an unicode string.

    :param message: object supporting ``message['Subject']`` lookup, such as
        an ``email.message.Message``.
    :returns: None when there is no Subject header; otherwise the decoded
        subject as ``unicode`` on Python 2 or ``str`` on Python 3.
    """
    # "import email" alone does not guarantee that the email.header submodule
    # is loaded; import it explicitly rather than relying on another import
    # having pulled it in as a side effect.
    import email.header

    raw_subject = message['Subject']
    if raw_subject is None:
        return None

    # decode_header returns a list of (decoded_string, charset) pairs;
    # parts without a declared charset are handled as UTF-8.
    decoded_seq = [(part, charset or 'utf-8')
                   for part, charset in email.header.decode_header(raw_subject)]
    header = email.header.make_header(decoded_seq)
    # Be Python 2 & 3 compatible
    return unicode(header) if sys.version_info < (3,) else str(header)
123 | ||
58e64caf AA |
124 | |
_recipient_re = re.compile(r'^rfc822; ?(.+)$', re.I | re.U)


def _headerText(part, name):
    """Return header `name` of `part` as a stripped string, or None if absent."""
    value = part[name]
    if value is None:
        return None
    return str(value).strip()


def findAddressInBounce(bounce):
    """Finds the faulty email address in a bounced email.

    See RFC 1894 for more information.  The bounce must be a multipart/report
    whose second part is a message/delivery-status holding at least two
    sub-parts; only the second sub-part (the first block of per-recipient
    fields) is examined.

    Each rejection prints a line starting with "! " that explains the reason.
    Missing or malformed fields lead to a rejection, never to an exception.

    :param bounce: the parsed bounce, an ``email.message.Message``.
    :returns: the address found in the Final-Recipient field when the Action
        field is "failed" and either the Status field starts with 5 or the
        Diagnostic-Code field starts with "X-Postfix"; None otherwise.
    """

    # Check that it is a bounce - a few MTA fail to set this correctly :(
    if bounce.get_content_type() != 'multipart/report':
        print('! Not a valid bounce (expected multipart/report, found %s).' % bounce.get_content_type())
        return None
    # Extract the second component of the multipart/report.
    # A message can declare a multipart type and still carry a plain string
    # payload (e.g. missing boundary): count that as having no parts at all.
    num_payloads = len(bounce.get_payload()) if bounce.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    report = bounce.get_payload(1)
    if report.get_content_type() != 'message/delivery-status':
        print('! Not a valid bounce (expected message/delivery-status, found %s).' % report.get_content_type())
        return None
    # The per-message-fields don't matter here, get only the per-recipient-fields
    num_payloads = len(report.get_payload()) if report.is_multipart() else 0
    if num_payloads < 2:
        print('! Not a valid bounce (expected at least 2 parts, found %d).' % num_payloads)
        return None
    content = report.get_payload(1)
    if content.get_content_type() != 'text/plain':
        print('! Not a valid bounce (expected text/plain, found %s).' % content.get_content_type())
        return None
    # Extract the faulty email address
    final_recipient = _headerText(content, 'Final-Recipient')
    recipient_match = None
    if final_recipient is not None:
        recipient_match = _recipient_re.search(final_recipient)
    if recipient_match is None:
        print('! Missing final recipient.')
        return None
    address = recipient_match.group(1)
    # Check the action field
    action = _headerText(content, 'Action')
    if action is None or action.lower() != 'failed':
        print('! Not a failed action (%s).' % action)
        return None
    # Mail forwarding loops, DNS errors and connection timeouts cause X-Postfix errors
    # Otherwise, the first sub-field should indicate a permanent failure
    diagnostic = _headerText(content, 'Diagnostic-Code')
    postfix_error = diagnostic is not None and diagnostic.startswith('X-Postfix')
    status_code = _headerText(content, 'Status')
    permanent_failure = status_code is not None and status_code.startswith('5')
    if not postfix_error and not permanent_failure:
        print('! Not a permanent failure status (%s).' % status_code)
        return None
    return address
174 | ||
175 | #----------------------------------------------------------------------------# | |
176 | ||
class DirectBouncesFilter(MboxFilter):
    """Handles the messages that carry no X-Spam-Flag header at all.

    Confirmed bounces are saved in '<mbox>.bounced' and their faulty
    addresses are collected for the final report.
    """

    def initialize(self, mbox_file):
        """Reset the counters and open the emptied '<mbox_file>.bounced' mbox."""
        self.seen = 0
        self.bad_problems = 0
        self.emails = []
        self.mbox_file = '%s.bounced' % mbox_file
        self.mbox = mailbox.mbox(self.mbox_file)
        self.mbox.clear()

    def process(self, message):
        """Try to handle `message` as a direct bounce.

        Returns True when the message is a dropped mailman notification or a
        bounce whose faulty address could be extracted; False in every other
        case, including all messages that have an X-Spam-Flag header.
        """
        if message['X-Spam-Flag'] is None:
            # During finalization, we will verify that all messages were processed
            self.seen += 1
            # Special case: ignore mailman notifications for the mailing-list
            # on which the NL is forwarded
            if message['From'] == 'newsletter-externes-bounces@polytechnique.org':
                print('! Dropping a notification from mailman for newsletter-externes@polytechnique.org, this should be OK.')
                self.seen -= 1
                return True
            # Additional checks, just to be sure
            elif message['From'] != 'MAILER-DAEMON@polytechnique.org (Mail Delivery System)' \
            or message['Subject'] != 'Undelivered Mail Returned to Sender':
                print('! Not an usual direct bounce (From="%s", Subject="%s").' % (message['From'], message['Subject']))
            else:
                # Named "address" so that the email module is not shadowed
                address = findAddressInBounce(message)
                if address is not None:
                    self.emails.append(address)
                    self.mbox.add(message)
                    return True
                else:
                    print('! => No email found in direct bounce, this is really bad.')
                    self.bad_problems += 1
        return False

    def finalize(self):
        """Print the statistics and the collected addresses, then close the mbox."""
        print('Found %d messages with no X-Spam-Flag header.' % self.seen)
        print('Found %d of them that are confirmed bounces.' % len(self.mbox))
        print('They were saved in %s.' % self.mbox_file)
        if self.bad_problems:
            print('Found %d of them that are invalid.' % self.bad_problems)
        if self.seen != len(self.mbox) + self.bad_problems:
            # Both backslashes are escaped: "\ " is an invalid escape sequence
            print(' /!\\ These numbers shoud be equal! We have a problem! /!\\')
        print('')
        print('Here is the list of email adresses for these bounces:')
        print('')
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()
227 | ||
228 | #----------------------------------------------------------------------------# | |
229 | ||
class SpamFilter(MboxFilter):
    """Saves in '<mbox>.spam' the messages flagged as spam by bogofilter."""

    def initialize(self, mbox_file):
        """Open the destination mbox and empty it."""
        destination = '%s.spam' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Keep `message` and return True when its X-Spam-Flag says "Yes"."""
        flag = message['X-Spam-Flag']
        is_spam = flag is not None and flag.startswith('Yes, tests=bogofilter')
        if is_spam:
            self.mbox.add(message)
        return is_spam

    def finalize(self):
        """Print the report and close the destination mbox."""
        report = (
            'Found %d spams. This is reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You might check the contents of this mbox.',
        )
        for line in report:
            print(line)
        self.mbox.close()
249 | ||
250 | #----------------------------------------------------------------------------# | |
251 | ||
class UnsureFilter(MboxFilter):
    """Saves in '<mbox>.unsure' the messages bogofilter could not classify."""

    def initialize(self, mbox_file):
        """Open the destination mbox and empty it."""
        destination = '%s.unsure' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Keep `message` and return True when its X-Spam-Flag says "Unsure"."""
        flag = message['X-Spam-Flag']
        is_unsure = flag is not None and flag.startswith('Unsure, tests=bogofilter')
        if is_unsure:
            self.mbox.add(message)
        return is_unsure

    def finalize(self):
        """Print the report and close the destination mbox."""
        report = (
            'Found %d unclassified messages. Most of them should be spams.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You must check the contents of this mbox and feed the antispam.',
        )
        for line in report:
            print(line)
        self.mbox.close()
271 | ||
272 | #----------------------------------------------------------------------------# | |
273 | ||
class CheckNonSpamFilter(MboxFilter):
    """Counts the messages whose X-Spam-Flag does not say "No".

    This filter never stops the chain: it only gathers a sanity statistic.
    """

    def initialize(self, mbox_file):
        """Reset the counter of unexpected messages."""
        self.seen = 0

    def process(self, message):
        """Count `message` unless bogofilter marked it as non-spam.

        Always returns False so that the next filter gets the message.
        """
        flag = message['X-Spam-Flag']
        marked_non_spam = flag is not None and flag.startswith('No, tests=bogofilter')
        if not marked_non_spam:
            self.seen += 1
        return False

    def finalize(self):
        """Print whether any unexpected message was encountered."""
        if self.seen <= 0:
            print('All messages were either spam, or unsure, or non-spams. Good.')
            return
        print('Encountered %d messages that were neither spam, nor unsure, nor non-spams.' % self.seen)
        print('Please investigate.')
58e64caf AA |
291 | |
292 | #----------------------------------------------------------------------------# | |
293 | ||
class OutOfOfficeFilter(MboxFilter):
    """Saves in '<mbox>.ooo' the messages whose subject looks like an auto-reply."""

    def initialize(self, mbox_file):
        """Open and empty the destination mbox, then compile the subject patterns."""
        destination = '%s.ooo' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()
        flags = re.I | re.U
        self.subject_regexes = [
            re.compile(r'^Absen(t|ce)', flags),
            re.compile(r'^(AUTO: )?Out of (the )?office', flags),
            re.compile(r'^Automatic reply: ', flags),
            re.compile(r'automatique d\'absence', flags),
            re.compile(r'AutoReply', flags),
            re.compile(r'(est|is) absent', flags),
            re.compile(r'I am out of town', flags),
            re.compile(r'I am currently away', flags),
            re.compile(r'is out of (the )?office', flags),
            re.compile(r'Notification d\'absence', flags),
            # There may be encoding error of e acute
            re.compile(r'R.{1,2}ponse automatique( :)?', flags),
        ]

    def process(self, message):
        """Keep `message` and return True when its subject matches a pattern."""
        subject = findSubject(message)
        if subject is None:
            return False
        for regex in self.subject_regexes:
            if regex.search(subject):
                self.mbox.add(message)
                return True
        return False

    def finalize(self):
        """Print the report and close the destination mbox."""
        report = (
            'Found %d "out of office". This is generally reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            'You may check the contents of this mbox.',
        )
        for line in report:
            print(line)
        self.mbox.close()
327 | ||
328 | #----------------------------------------------------------------------------# | |
329 | ||
class DeliveryStatusNotificationFilter(MboxFilter):
    """Sorts the multipart/report messages into two mailboxes.

    Reports from which a faulty address could be extracted go to
    '<mbox>.dsn'; all other reports go to '<mbox>.dsn-temp'.
    """

    def initialize(self, mbox_file):
        """Reset the address list, then open and empty both destination mboxes."""
        self.emails = []

        dsn_path = '%s.dsn' % mbox_file
        self.mbox_file = dsn_path
        self.mbox = mailbox.mbox(dsn_path)
        self.mbox.clear()

        temp_path = '%s.dsn-temp' % mbox_file
        self.mbox_temp_file = temp_path
        self.mbox_temp = mailbox.mbox(temp_path)
        self.mbox_temp.clear()

    def process(self, message):
        """Store `message` and return True when it is a multipart/report.

        Messages of any other content type are left alone (False is returned).
        """
        if message.get_content_type() != 'multipart/report':
            return False

        address = findAddressInBounce(message)
        if address is None:
            print("! => Moved to temporary DSN mailbox")
            self.mbox_temp.add(message)
        else:
            self.emails.append(address)
            self.mbox.add(message)
        return True

    def finalize(self):
        """Print the report for both mailboxes and close them."""
        header = (
            'Found %d delivery status notifications. This is generally reliable.' % len(self.mbox),
            'They were saved in %s.' % self.mbox_file,
            '',
            'Here is the list of email adresses for these bounces:',
            '',
        )
        for line in header:
            print(line)
        for address in self.emails:
            print(address)
        print('')
        self.mbox.close()

        print('Found %d temporary and invalid delivery status notifications.' % len(self.mbox_temp))
        print('They were saved in %s.' % self.mbox_temp_file)
        self.mbox_temp.close()
58e64caf AA |
367 | |
368 | #----------------------------------------------------------------------------# | |
369 | ||
class CatchAllFilter(MboxFilter):
    """Last filter of the chain: saves whatever reaches it in '<mbox>.catchall'."""

    def initialize(self, mbox_file):
        """Open the destination mbox and empty it."""
        destination = '%s.catchall' % mbox_file
        self.mbox_file = destination
        self.mbox = mailbox.mbox(destination)
        self.mbox.clear()

    def process(self, message):
        """Store `message` unconditionally and return True."""
        self.mbox.add(message)
        return True

    def finalize(self):
        """Print the report, close the mbox and delete its file if it is empty."""
        caught = len(self.mbox)
        if caught > 0:
            print('%d messages reached the catchall.' % caught)
            print('They were saved in %s.' % self.mbox_file)
            print('You must process the contents of this mbox manually.')
        else:
            print('No messages reached the catchall. Nice.')
        self.mbox.close()
        if caught <= 0:
            # Nothing was caught: do not leave an empty mbox file behind
            os.unlink(self.mbox_file)
391 | ||
392 | #----------------------------------------------------------------------------# | |
393 | ||
if __name__ == '__main__':

    # Exactly one argument is expected: the path of the mbox to process
    arguments = sys.argv
    if len(arguments) != 2:
        print('Usage: %s mbox' % arguments[0])
        sys.exit(1)

    mbox_path = arguments[1]
    if not os.path.exists(mbox_path):
        print('No such file: %s' % mbox_path)
        sys.exit(1)

    processor = MboxProcessor(mbox_path)
    processor.run()