Commit | Line | Data |
---|---|---|
e5e7a2cb | 1 | #!/usr/bin/env python |
9f5bd98e | 2 | #*************************************************************************** |
c441aabe | 3 | #* Copyright (C) 2003-2014 Polytechnique.org * |
9f5bd98e SJ |
4 | #* http://opensource.polytechnique.org/ * |
5 | #* * | |
6 | #* This program is free software; you can redistribute it and/or modify * | |
7 | #* it under the terms of the GNU General Public License as published by * | |
8 | #* the Free Software Foundation; either version 2 of the License, or * | |
9 | #* (at your option) any later version. * | |
10 | #* * | |
11 | #* This program is distributed in the hope that it will be useful, * | |
12 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * | |
13 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * | |
14 | #* GNU General Public License for more details. * | |
15 | #* * | |
16 | #* You should have received a copy of the GNU General Public License * | |
17 | #* along with this program; if not, write to the Free Software * | |
18 | #* Foundation, Inc., * | |
19 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * | |
20 | #*************************************************************************** | |
e5e7a2cb FB |
21 | |
22 | import sys, random, re | |
23 | ||
24 | ######################################################################## | |
25 | ||
26 | # A random word generator using Markov chains | |
27 | ||
class WordGenerator:
    """Random word generator backed by a character-level Markov chain.

    The model maps every `order`-character prefix seen in the corpus to the
    list of characters that followed it (duplicates are kept, so frequent
    successors are drawn more often).

    `special` pads the start of each word and marks its end.  It is expected
    to be a single character: successors are stored one character at a time
    and compared against `special` to detect the end of a word.
    """

    def __init__(self, order=3, special=u'\n'):
        """Create an empty model using `order` characters of context."""
        self.order = order
        self.special = special
        self.markov = {}

    def load(self, corpus):
        """Add every word of the iterable `corpus` to the model.

        Each word is stripped of surrounding whitespace, prefixed with
        `order` copies of `special` and terminated by one `special`.
        """
        for word in corpus:
            word = self.special * self.order + word.strip() + self.special
            for pos in range(len(word) - self.order):
                prefix = word[pos:pos + self.order]
                suffix = word[pos + self.order]
                # setdefault replaces the Python-2-only dict.has_key() test.
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word drawn from the model.

        Raises KeyError if the model has not been loaded (the start prefix
        is unknown) and IndexError is never expected since every stored
        successor list is non-empty.
        """
        word = self.special * self.order
        while True:
            # An explicit start index is used instead of word[-order:],
            # which would return the whole word when order == 0.
            context = word[len(word) - self.order:]
            c = random.choice(self.markov[context])
            if c == self.special:
                # Drop the leading padding before returning.
                return word[self.order:]
            word += c
53 | ||
54 | ######################################################################## | |
55 | ||
def parse_aliases(file):
    """Read aliases of the form `firstname.lastname.promo` from a text file.

    `file` is the path of a file holding one alias per line.  The lines are
    sorted before parsing, so the three returned lists follow the sorted
    alias order and are index-aligned with each other.

    Lines that do not start with `<letters/dashes>.<letters/dashes>.<4 digits>`
    are reported on standard output and skipped.

    Returns a tuple `(firstnames, lastnames, promos)` of lists of strings.
    """
    # Aliases are ASCII only, so plain text mode is sufficient.
    with open(file, 'r') as handle:
        aliases = handle.readlines()
    aliases.sort()

    # The separators are literal dots; unescaped they would match any
    # character and mis-split inputs such as "jean-dupont-2003".
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')

    firstnames = []
    lastnames = []
    promos = []
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Warning: could not parse alias '%s'" % alias)
            continue
        firstnames.append(match.group(1))
        lastnames.append(match.group(2))
        promos.append(match.group(3))
    return firstnames, lastnames, promos
76 | ||
77 | # Returns the index of the first value of `array' strictly greater than `value' | |
def find_next(value, array, pmin=0, pmax=-1):
    """Return the index of the first element of `array` strictly greater
    than `value`.

    `array` must be sorted in ascending order.  The search is restricted to
    the slice `array[pmin:pmax]`; `pmax == -1` (the default) means
    `len(array)`.  When no element in that slice is greater than `value`
    the upper bound is returned, so the result may equal `len(array)` and
    is not always a valid index.  An empty array yields 0.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import line.
    from bisect import bisect_right

    if pmax == -1:
        pmax = len(array)
    # bisect_right returns the insertion point after any elements equal to
    # `value`, i.e. the index of the first strictly greater element.  It
    # also covers the cases the hand-written recursion got wrong: an empty
    # array and a value smaller than array[pmin].
    return bisect_right(array, value, pmin, pmax)
87 | ||
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias `firstname.lastname.promo`.

    The last name is picked at random among the entries of the sorted list
    `rand_lastnames` lying between the positions that `find_next` reports
    for `pred_lastname` and `succ_lastname`.  When no generated name lies
    between them, `pred_lastname` is extended with a dash and a random
    generated name instead.

    Raises IndexError from `random.choice` if that fallback is needed while
    `rand_lastnames` is empty.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred
    if i_pred == i_succ:
        # Hack in edge case: nothing between the two names
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # find_next may return len(rand_lastnames), so the upper bound must
        # be exclusive: randint(i_pred, i_succ) could index past the end.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    # NOTE(review): the promo is three digits whereas parsed promos are four
    # digits -- presumably so generated aliases never match a real one;
    # confirm with the consumers of the output file.
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
100 | ||
101 | ######################################################################## | |
102 | ||
if __name__ == '__main__':

    # Check arguments
    if len(sys.argv) != 3:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("")
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        print("^D")
        sys.exit(1)

    # Parse the list of existing aliases and sort it
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames and sort the list
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = [generator.generate() for i in range(100 * len(lastnames))]
    rand_lastnames.sort()

    # For each original, create a new alias
    # alphabetically between this one and the next one
    lastnames.append('zzzzzzzz')  # sentinel successor for the last alias
    # The with-block closes the output file even if alias creation fails.
    with open(sys.argv[2], 'w') as handle:
        for i, firstname in enumerate(firstnames):
            handle.write(create_alias(firstname, lastnames[i], lastnames[i + 1], rand_lastnames))
            handle.write('\n')
132 | ||
133 | ||
134 |