Commit | Line | Data |
---|---|---|
e5e7a2cb FB |
1 | #!/usr/bin/env python |
2 | ||
3 | # Copyright (C) 2008 Aymeric Augustin | |
4 | # Released under the GPL | |
5 | ||
6 | import sys, random, re | |
7 | ||
8 | ######################################################################## | |
9 | ||
10 | # A random word generator using Markov chains | |
11 | ||
class WordGenerator(object):
    """Random word generator backed by a character-level Markov chain.

    Every word of the corpus is padded with `order` copies of `special` at
    the start and one copy at the end, so the chain learns both how words
    begin and where they stop.
    """

    def __init__(self, order=3, special=u'\n'):
        """Create an empty generator.

        order   -- number of preceding characters used to pick the next one
        special -- padding/terminator character; it must not appear inside
                   the corpus words (surrounding whitespace is stripped)
        """
        self.order = order
        self.special = special
        # Maps each `order`-character prefix to the list of characters seen
        # right after it.  Duplicates are kept on purpose: they weight the
        # random choice made in generate().
        self.markov = {}

    def load(self, corpus):
        """Feed every word of the iterable `corpus` into the chain."""
        for word in corpus:
            padded = self.special * self.order + word.strip() + self.special
            for pos in range(len(padded) - self.order):
                prefix = padded[pos:pos + self.order]
                suffix = padded[pos + self.order]
                # setdefault replaces the deprecated dict.has_key() test
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word drawn from the chain.

        Raises KeyError if nothing has been loaded yet, since the initial
        all-`special` prefix is then unknown.
        """
        word = self.special * self.order
        while True:
            c = random.choice(self.markov[word[-self.order:]])
            if c == self.special:
                # Terminator reached: drop the leading padding
                return word[self.order:]
            word += c
37 | ||
38 | ######################################################################## | |
39 | ||
def parse_aliases(file):
    """Read the aliases listed one per line in `file` and split them.

    Lines are sorted before parsing.  Each alias is expected to look like
    `firstname.lastname.promo`, where both names are made of lowercase
    letters and dashes and the promo is four digits.  Lines that do not
    match are reported on standard output and skipped.

    Returns three parallel lists (firstnames, lastnames, promos), all of
    strings, in sorted-alias order.
    """
    firstnames = []
    lastnames = []
    promos = []
    # aliases are ASCII only; `with` closes the file even if reading fails
    with open(file, 'r') as handle:
        aliases = sorted(handle)
    # The separators are literal dots: left unescaped, `.` matches any
    # character and strings such as 'abcde1234' are accepted as aliases.
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            # Single-argument print() behaves the same on Python 2 and 3
            print("Warning: could not parse alias '%s'" % alias)
            continue
        firstnames.append(match.group(1))
        lastnames.append(match.group(2))
        promos.append(match.group(3))
    return firstnames, lastnames, promos
60 | ||
def find_next(value, array, pmin=0, pmax=-1):
    """Return the index of the first item of `array` strictly greater than
    `value`.

    `array` must be sorted in ascending order.  Only the slice
    array[pmin:pmax] is searched; pmax == -1 (the default) means "up to the
    end of the list".  When no item of the slice is greater than `value`
    the result is `pmax`; when all of them are, it is `pmin`.  An empty
    list therefore yields 0.
    """
    # Function-scope import keeps the module's import line untouched.
    import bisect

    if pmax == -1:
        pmax = len(array)
    # bisect_right places the insertion point after any item equal to
    # `value`, which is exactly "first item strictly greater".
    return bisect.bisect_right(array, value, pmin, pmax)
71 | ||
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias whose last name sorts between two real ones.

    firstname      -- first name reused verbatim in the new alias
    pred_lastname  -- last name of the real alias the new one follows
    succ_lastname  -- last name of the next real alias
    rand_lastnames -- sorted list of generated last names to pick from

    Returns a string 'firstname.lastname.promo' where promo is a random
    three-digit number.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred
    if i_pred == i_succ:
        # Edge case: no generated name lies between the two bounds, so
        # derive one from the predecessor and a random generated name.
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # randrange excludes the upper bound.  randint would also pick
        # index i_succ, which lies outside the interval and equals
        # len(rand_lastnames) when the upper name sorts after every entry.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
84 | ||
85 | ######################################################################## | |
86 | ||
if __name__ == '__main__':

    # Check arguments: the aliases dump to read and the output file to write.
    # Single-argument print() calls behave the same on Python 2 and 3.
    if len(sys.argv) != 3:
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("")
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        print("^D")
        sys.exit(1)

    # Parse the list of existing aliases (parse_aliases sorts them)
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames and sort the list
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = sorted(generator.generate()
                            for _ in range(100 * len(lastnames)))

    # For each original, create a new alias
    # alphabetically between this one and the next one.
    # The last alias has no successor, so a sentinel name stands in for it.
    successors = lastnames[1:] + ['zzzzzzzz']
    # `with` closes the output file even if create_alias raises
    with open(sys.argv[2], 'w') as handle:
        for firstname, lastname, successor in zip(firstnames, lastnames,
                                                  successors):
            handle.write(create_alias(firstname, lastname, successor,
                                      rand_lastnames))
            handle.write('\n')
116 | ||
117 | ||
118 |