| 1 | #!/usr/bin/env python |
| 2 | |
| 3 | # Copyright (C) 2008 Aymeric Augustin |
| 4 | # Released under the GPL |
| 5 | |
import bisect
import random
import re
import sys
| 7 | |
| 8 | ######################################################################## |
| 9 | |
| 10 | # A random word generator using Markov chains |
| 11 | |
class WordGenerator:
    """Random word generator based on a character-level Markov chain.

    Each word of the corpus is padded with `order` copies of `special` at
    the start and one copy at the end. The chain maps every run of `order`
    consecutive characters (the prefix) to the list of characters seen
    right after it; a character seen several times appears several times
    in the list, so `random.choice` picks it proportionally more often.
    """

    def __init__(self, order=3, special=u'\n'):
        """Create an empty generator.

        order   -- number of preceding characters used to pick the next one
        special -- single-character marker for word start/end; words are
                   stripped of surrounding whitespace before being loaded
        """
        self.order = order
        self.special = special
        self.markov = {}

    def load(self, corpus):
        """Add every word of the iterable `corpus` to the chain."""
        for word in corpus:
            word = self.special * self.order + word.strip() + self.special
            for pos in range(len(word) - self.order):
                prefix = word[pos:pos + self.order]
                suffix = word[pos + self.order]
                # setdefault replaces dict.has_key(), which no longer
                # exists in Python 3.
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word drawn from the chain.

        Raises KeyError if nothing has been loaded yet, because the
        all-`special` starting prefix is then missing from the chain.
        """
        word = self.special * self.order
        while True:
            # Explicit start index instead of word[-self.order:], which
            # would return the whole word when order == 0.
            state = word[len(word) - self.order:]
            c = random.choice(self.markov[state])
            if c == self.special:
                # Drop the leading padding before returning.
                return word[self.order:]
            word += c
| 37 | |
| 38 | ######################################################################## |
| 39 | |
def parse_aliases(file):
    """Read an alias file and split every alias into its three parts.

    `file` is the path of a text file holding one alias per line, in the
    form ``firstname.lastname.promo`` where both names are made of
    lowercase letters and dashes and the promo is four digits.

    The lines are sorted before parsing. Lines that do not match the
    expected form are reported with a warning on standard output and
    skipped.

    Returns three parallel lists: (firstnames, lastnames, promos).
    """
    # `with` closes the file even if reading raises.
    with open(file, 'r') as handle:  # aliases are ASCII only
        aliases = handle.readlines()
    aliases.sort()

    # The separators are literal dots. Left unescaped, `.` matches any
    # character and malformed lines get split at arbitrary positions
    # (e.g. "jean-pierre-dupont.2008" -> "jean-pierre-dupo" / "t").
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')

    firstnames = []
    lastnames = []
    promos = []
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Warning: could not parse alias '%s'" % alias)
            continue
        firstnames.append(match.group(1))
        lastnames.append(match.group(2))
        promos.append(match.group(3))
    return firstnames, lastnames, promos
| 60 | |
def find_next(value, array, pmin=0, pmax=-1):
    """Return the index of the first item of `array` strictly greater than `value`.

    `array` must be sorted in ascending order. Only the slice
    array[pmin:pmax] is searched; pmax == -1 (the default) means
    len(array). The result lies between pmin and pmax inclusive: pmax is
    returned when no item of the slice is greater than `value`, and 0 is
    returned for an empty array.
    """
    if pmax == -1:
        pmax = len(array)
    # bisect_right implements exactly this contract. The previous
    # hand-written recursion could never return index 0, stopped on items
    # equal to `value`, failed on an empty array, and relied on `/` being
    # integer division.
    return bisect.bisect_right(array, value, pmin, pmax)
| 71 | |
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias of the form ``firstname.lastname.promo``.

    The last name is drawn from `rand_lastnames` (a sorted list of
    generated names), among the entries located between the positions of
    `pred_lastname` and `succ_lastname` in that list. When there is no
    such entry, the last name is `pred_lastname`, a dash, and a random
    generated name. The promo is a random three-digit number.

    Raises IndexError if `rand_lastnames` is empty.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred
    if i_pred == i_succ:
        # Hack in edge case: nothing lies between the two names, so
        # derive a compound name from the predecessor instead.
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # randrange excludes its upper bound. randint included it, but
        # i_succ can be len(rand_lastnames), which is not a valid index,
        # and otherwise points at an entry past the successor.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
| 84 | |
| 85 | ######################################################################## |
| 86 | |
if __name__ == '__main__':

    # Check arguments
    if len(sys.argv) != 3:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("")
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        print("^D")
        sys.exit(1)

    # Parse the list of existing aliases and sort it
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames and sort the list
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = sorted(
        generator.generate() for _ in range(100 * len(lastnames)))

    # For each original, create a new alias
    # alphabetically between this one and the next one.
    # The last original has no next one, so it is paired with a
    # 'zzzzzzzz' sentinel.
    successors = lastnames[1:] + ['zzzzzzzz']
    # `with` closes the output file even if alias creation raises.
    with open(sys.argv[2], 'w') as handle:
        for firstname, lastname, successor in zip(
                firstnames, lastnames, successors):
            handle.write(
                create_alias(firstname, lastname, successor, rand_lastnames))
            handle.write('\n')
| 116 | |
| 117 | |
| 118 | |