| 1 | #!/usr/bin/env python |
| 2 | #*************************************************************************** |
| 3 | #* Copyright (C) 2003-2011 Polytechnique.org * |
| 4 | #* http://opensource.polytechnique.org/ * |
| 5 | #* * |
| 6 | #* This program is free software; you can redistribute it and/or modify * |
| 7 | #* it under the terms of the GNU General Public License as published by * |
| 8 | #* the Free Software Foundation; either version 2 of the License, or * |
| 9 | #* (at your option) any later version. * |
| 10 | #* * |
| 11 | #* This program is distributed in the hope that it will be useful, * |
| 12 | #* but WITHOUT ANY WARRANTY; without even the implied warranty of * |
| 13 | #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
| 14 | #* GNU General Public License for more details. * |
| 15 | #* * |
| 16 | #* You should have received a copy of the GNU General Public License * |
| 17 | #* along with this program; if not, write to the Free Software * |
| 18 | #* Foundation, Inc., * |
| 19 | #* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * |
| 20 | #*************************************************************************** |
| 21 | |
| 22 | import sys, random, re |
| 23 | |
| 24 | ######################################################################## |
| 25 | |
| 26 | # A random word generator using Markov chains |
| 27 | |
class WordGenerator(object):
    """Random word generator backed by a character-level Markov chain.

    The model maps every run of `order` consecutive characters seen in the
    training words to the list of characters that followed it.  Followers are
    stored with repetition, so a uniform draw from a list follows the observed
    frequencies.  `special` is a marker used both to pad the beginning of each
    word and to signal its end.
    """

    def __init__(self, order=3, special=u'\n'):
        """Create an empty model.

        order   -- number of preceding characters that select the next one
        special -- single-character start/end marker; if it occurred inside a
                   training word it would be learned as an end of word
        """
        self.order = order
        self.special = special
        self.markov = {}

    def load(self, corpus):
        """Add every word of the iterable `corpus` to the model.

        Surrounding whitespace is stripped from each word.  The method may be
        called several times; observations accumulate.
        """
        for word in corpus:
            # Pad with `order' markers so that the first real character has a
            # full-length prefix, and append one marker so that word endings
            # are learned like any other transition.
            word = self.special * self.order + word.strip() + self.special
            for pos in range(len(word) - self.order):
                prefix = word[pos:pos + self.order]
                suffix = word[pos + self.order]
                # setdefault replaces dict.has_key(), which no longer exists
                # in Python 3, and avoids a second lookup.
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word drawn from the model.

        Raises KeyError if nothing has been loaded, since the all-marker
        starting prefix is then unknown.
        """
        word = self.special * self.order
        while True:
            # The last `order' characters select the candidate followers.
            c = random.choice(self.markov[word[-self.order:]])
            if c == self.special:
                # End marker drawn: drop the leading padding and stop.
                return word[self.order:]
            word += c
| 53 | |
| 54 | ######################################################################## |
| 55 | |
def parse_aliases(file):
    """Read `firstname.lastname.promo' aliases from a text file.

    file -- path of a file holding one alias per line (ASCII only)

    The lines are sorted before being parsed.  Returns three parallel lists
    (firstnames, lastnames, promos) in that sorted order; promos are kept as
    4-digit strings.  Lines that do not start with a well-formed alias are
    reported on standard output and skipped.
    """
    firstnames = []
    lastnames = []
    promos = []
    # The context manager closes the file even when reading fails.
    with open(file, 'r') as handle:  # aliases are ASCII only
        aliases = handle.readlines()
    aliases.sort()
    # The separators are literal dots and must be escaped: an unescaped `.'
    # matches any character, which lets malformed lines such as `abc.1990'
    # be split into bogus first and last names instead of being rejected.
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            # Parenthesised single argument: prints the same text under
            # Python 2 and Python 3.
            print("Warning: could not parse alias '%s'" % alias)
        else:
            firstname, lastname, promo = match.groups()
            firstnames.append(firstname)
            lastnames.append(lastname)
            promos.append(promo)
    return firstnames, lastnames, promos
| 76 | |
def find_next(value, array, pmin=0, pmax=-1):
    """Return the index of the first item of `array' not less than `value'.

    value -- the value to locate
    array -- a sequence sorted in ascending order
    pmin  -- first index of the slice to search (default: 0)
    pmax  -- index just past the slice to search; -1 means len(array)

    The result is the leftmost insertion point of `value' within
    array[pmin:pmax].  It is `pmin' when no item of the slice is smaller than
    `value' (including an empty slice) and `pmax' when every item is smaller.
    """
    # Local import so that the function does not depend on the module header.
    from bisect import bisect_left
    if pmax == -1:
        pmax = len(array)
    # The standard binary search replaces a hand-written recursion that
    # indexed with a float under Python 3, failed on empty input and could
    # never return index 0.
    return bisect_left(array, value, pmin, pmax)
| 87 | |
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias `firstname.lastname.number'.

    firstname      -- first name used verbatim in the alias
    pred_lastname  -- one bound of the alphabetical range for the last name
    succ_lastname  -- the other bound (the two may come in either order)
    rand_lastnames -- sorted list of candidate last names

    The last name is drawn at random among the candidates lying between the
    two bounds.  When there is none, it is made of `pred_lastname', a dash and
    a candidate picked anywhere in the list.  The trailing number has three
    digits, so the result never matches the 4-digit form that parse_aliases()
    accepts.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred
    if i_pred == i_succ:
        # Hack in edge case: no candidate falls between the two bounds.
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # find_next() yields insertion points, so the candidates within the
        # bounds are those at indices i_pred .. i_succ - 1.  randrange()
        # excludes i_succ, whereas randint() would also pick it: a name at or
        # past the upper bound, or an index equal to len(rand_lastnames)
        # (IndexError) when the bound is greater than every candidate.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
| 100 | |
| 101 | ######################################################################## |
| 102 | |
if __name__ == '__main__':

    # Check arguments: the input aliases file and the output file
    if len(sys.argv) != 3:
        # Each print() call takes one parenthesised argument so that the
        # output is the same under Python 2 and Python 3.
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("")
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        print("^D")
        sys.exit(1)

    # Parse the list of existing aliases and sort it
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames (100 per parsed alias) and sort the
    # list so that it can be searched by bisection
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = [generator.generate() for _ in range(100 * len(lastnames))]
    rand_lastnames.sort()

    # For each original, create a new alias
    # alphabetically between this one and the next one
    lastnames.append('zzzzzzzz')  # sentinel: gives the last alias a successor
    # The context manager closes the output file even if generation fails.
    with open(sys.argv[2], 'w') as handle:
        # zip() stops after the last first name, pairing every last name with
        # the one that follows it (the sentinel for the final alias).
        for firstname, pred, succ in zip(firstnames, lastnames, lastnames[1:]):
            handle.write(create_alias(firstname, pred, succ, rand_lastnames) + '\n')
| 132 | |
| 133 | |
| 134 | |