Moving to GitHub.
[platal.git] / bin / poisonous_email_generator.py
1 #!/usr/bin/env python
2 #***************************************************************************
3 #* Copyright (C) 2003-2014 Polytechnique.org *
4 #* http://opensource.polytechnique.org/ *
5 #* *
6 #* This program is free software; you can redistribute it and/or modify *
7 #* it under the terms of the GNU General Public License as published by *
8 #* the Free Software Foundation; either version 2 of the License, or *
9 #* (at your option) any later version. *
10 #* *
11 #* This program is distributed in the hope that it will be useful, *
12 #* but WITHOUT ANY WARRANTY; without even the implied warranty of *
13 #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14 #* GNU General Public License for more details. *
15 #* *
16 #* You should have received a copy of the GNU General Public License *
17 #* along with this program; if not, write to the Free Software *
18 #* Foundation, Inc., *
19 #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
20 #***************************************************************************
21
22 import sys, random, re
23
24 ########################################################################
25
26 # A random word generator using Markov chains
27
class WordGenerator(object):
    """Random word generator based on a character-level Markov chain.

    Every corpus word is framed with `order` leading copies of the `special`
    marker and one trailing copy, so the chain learns how words start and
    how they end.

    Attributes:
        order: number of preceding characters used to pick the next one.
        special: word-boundary marker.  It is compared with individual
            characters of the framed words, so it should be one character.
        markov: maps each `order`-character prefix seen in the corpus to the
            list of characters that followed it.  Duplicates are kept, so a
            frequent successor is drawn more often.
    """

    def __init__(self, order=3, special=u'\n'):
        self.order = order
        self.special = special
        self.markov = {}

    def load(self, corpus):
        """Feed the words of `corpus` (an iterable of strings) into the chain.

        Surrounding whitespace is stripped from each word.  Calling this
        method several times accumulates transitions.
        """
        for word in corpus:
            word = self.special * self.order + word.strip() + self.special
            for pos in range(len(word) - self.order):
                prefix = word[pos:pos + self.order]
                suffix = word[pos + self.order]
                # setdefault replaces the Python-2-only dict.has_key() test.
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word drawn from the chain.

        Raises:
            KeyError: if no word has been loaded yet.
        """
        word = self.special * self.order
        while True:
            c = random.choice(self.markov[word[-self.order:]])
            if c == self.special:
                # End-of-word marker drawn: drop the leading markers.
                return word[self.order:]
            word += c
53
54 ########################################################################
55
def parse_aliases(file):
    """Read aliases of the form `firstname.lastname.promo` from a file.

    Args:
        file: path of a text file holding one alias per line.

    Returns:
        A `(firstnames, lastnames, promos)` tuple of three parallel lists of
        strings, in the order of the sorted raw alias lines.  Lines that do
        not parse are reported on standard output and skipped.
    """
    firstnames = []
    lastnames = []
    promos = []
    # The context manager closes the file even if reading fails.
    with open(file, 'r') as handle:  # aliases are ASCII only
        aliases = handle.readlines()
    aliases.sort()
    # The separators are literal dots; unescaped they would match any
    # character and let malformed aliases be split at arbitrary letters.
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            print("Warning: could not parse alias '%s'" % alias)
        else:
            firstnames.append(match.group(1))
            lastnames.append(match.group(2))
            promos.append(match.group(3))
    return firstnames, lastnames, promos
76
def find_next(value, array, pmin=0, pmax=-1):
    """Return the index of the first item of `array` strictly greater than `value`.

    Args:
        value: the value to locate.
        array: a list sorted in ascending order.
        pmin: first index of the slice to search.
        pmax: index one past the end of the slice to search; -1 means
            `len(array)`.

    Returns:
        The smallest index `i` with `pmin <= i <= pmax` such that every item
        of `array[pmin:i]` is `<= value` and every item of `array[i:pmax]`
        is `> value`.  This is `pmin` when all items of the slice are
        greater than `value` and `pmax` when none is (so 0 for an empty
        list).
    """
    # Function-scope import so the block does not depend on the file header.
    from bisect import bisect_right
    if pmax == -1:
        pmax = len(array)
    return bisect_right(array, value, pmin, pmax)
87
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias whose last name sorts between two given last names.

    Args:
        firstname: first name to put in the alias.
        pred_lastname: one of the two bounding last names.
        succ_lastname: the other bounding last name (the two may be given in
            either order).
        rand_lastnames: sorted, non-empty list of generated last names.

    Returns:
        A string `firstname.lastname.promo` where `promo` is a random
        three-digit number.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred
    if i_pred == i_succ:
        # Hack in edge case: no generated name lies between the two bounds,
        # so extend the first bound with a random generated name.
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # randrange excludes its upper bound: i_succ may be equal to
        # len(rand_lastnames), which is not a valid index, and the item at
        # i_succ is no longer below the upper bounding name anyway.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    # NOTE(review): three digits presumably keep generated aliases distinct
    # from real ones, which carry a four-digit promo -- confirm.
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
100
101 ########################################################################
102
if __name__ == '__main__':
    # Script entry point: read the existing aliases from the file named by
    # the first argument and write one generated alias per parsed alias to
    # the file named by the second argument.

    # Check arguments
    if len(sys.argv) != 3:
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("")
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        print("^D")
        sys.exit(1)

    # Parse the list of existing aliases and sort it
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames and sort the list
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = sorted(generator.generate()
                            for i in range(100 * len(lastnames)))

    # For each original, create a new alias
    # alphabetically between this one and the next one
    lastnames.append('zzzzzzzz')  # hack to avoid off-by-one
    # The context manager closes the output file even if alias creation fails.
    with open(sys.argv[2], 'w') as handle:
        # Pair every last name with its successor; the sentinel appended
        # above provides the successor of the final real name.
        for firstname, pred, succ in zip(firstnames, lastnames, lastnames[1:]):
            handle.write(create_alias(firstname, pred, succ, rand_lastnames))
            handle.write('\n')
132
133
134