3 # Copyright (C) 2008 Aymeric Augustin
4 # Released under the GPL
8 ########################################################################
10 # A random word generator using Markov chains
class WordGenerator(object):
    """Random word generator driven by a character-level Markov chain.

    Words passed to `load` are cut into overlapping windows of `order`
    characters, and for every window the character that follows it is
    recorded.  `generate` walks those records, drawing each next character
    at random among the ones seen after the current window.

    NOTE(review): the reviewed source had lost the class header, the
    `generate` signature and several statements.  They are reconstructed
    here from the surviving fragments -- confirm against the upstream file.
    """

    def __init__(self, order=3, special=u'\n'):
        """Create an empty generator.

        order   -- number of preceding characters that select the next one.
        special -- marker used to pad the beginning of every word and to
                   flag its end.  It is compared against single characters,
                   so it has to be exactly one character long.
        """
        self.order = order
        self.special = special
        # Maps a window of `order` characters to the list of characters
        # observed right after it.
        self.markov = {}

    def load(self, corpus):
        """Record the transitions of every word in `corpus`.

        corpus -- iterable of strings; surrounding whitespace is stripped
                  from each word before it is processed.

        Calling `load` several times accumulates transitions.
        """
        for raw in corpus:
            padded = self.special * self.order + raw.strip() + self.special
            for pos in range(len(padded) - self.order):
                prefix = padded[pos:pos + self.order]
                suffix = padded[pos + self.order]
                # Duplicates are kept, so random.choice() in generate()
                # picks frequent successors more often.
                self.markov.setdefault(prefix, []).append(suffix)

    def generate(self):
        """Return one random word built from the recorded transitions.

        The returned word contains neither the leading padding nor the
        terminating `special` character.

        Raises KeyError when no transition is known for the current window,
        e.g. when `load` was never given any word.
        """
        word = self.special * self.order
        while True:
            # Slice from len(word) - order rather than -order: with
            # order == 0 the latter would select the whole word instead of
            # an empty window.
            window = word[len(word) - self.order:]
            c = random.choice(self.markov[window])
            if c == self.special:
                return word[self.order:]
            word += c
38 ########################################################################
def parse_aliases(file):
    """Read a file of `firstname.lastname.promo` aliases, one per line.

    file -- path of the text file to read.

    Returns three parallel lists `(firstnames, lastnames, promos)`, one
    entry per alias that could be parsed; promos are kept as 4-digit
    strings.  A warning is printed for every line that does not parse and
    that line is skipped.

    NOTE(review): the list initialisation, the loop header and the
    match test were missing from the reviewed source and are reconstructed.
    The sort is inferred from the caller's comment ("Parse the list of
    existing aliases and sort it") -- confirm against the upstream file.
    """
    # The dots are escaped so that only a literal '.' separates the three
    # parts; an unescaped '.' would accept any character there.
    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')

    with open(file, 'r') as handle:  # aliases are ASCII only
        aliases = handle.readlines()
    aliases.sort()

    firstnames, lastnames, promos = [], [], []
    for alias in aliases:
        alias = alias.rstrip()
        match = alias_re.match(alias)
        if match is None:
            # Single-argument call form: behaves the same on Python 2 and 3.
            print("Warning: could not parse alias '%s'" % alias)
            continue
        firstnames.append(match.group(1))
        lastnames.append(match.group(2))
        promos.append(match.group(3))

    return firstnames, lastnames, promos
def find_next(value, array, pmin=0, pmax=-1):
    """Binary-search the sorted sequence `array` for `value`.

    value -- the value to locate.
    array -- sequence sorted in ascending order.
    pmin  -- lower bound of the search; this index itself is never returned
             because array[pmin] is assumed (not checked) to be < value.
    pmax  -- upper bound of the search; -1 means len(array).

    Returns the smallest index i with pmin < i < pmax for which
    array[i] >= value, or pmax when there is none.  With the default
    bounds the result therefore lies between 1 and len(array), except for
    an empty array, which yields 0.
    """
    if pmax == -1:
        pmax = len(array)
    # At every step, array[pmin] < value <= array[pmax] (with array[pmax]
    # treated as +infinity when pmax == len(array)).  The `> 1` test also
    # ends the search at once on an empty range instead of indexing past
    # the end of the array.
    while pmax - pmin > 1:
        # Floor division keeps the midpoint an integer on Python 2 and 3.
        pint = (pmin + pmax) // 2
        if array[pint] < value:
            pmin = pint
        else:
            pmax = pint
    return pmax
def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
    """Build a fake alias of the form `firstname.lastname.promo`.

    firstname      -- first name used verbatim in the alias.
    pred_lastname  -- real last name the generated one should follow.
    succ_lastname  -- real last name the generated one should precede.
    rand_lastnames -- non-empty list of generated last names; expected to
                      be sorted, since it is searched with find_next().

    The last name is drawn from the slice of `rand_lastnames` located
    between the two real names.  When that slice is empty, the name is
    `pred_lastname` joined by a dash to a name drawn from the whole list.
    The promo is a random three-digit number (100-999).

    NOTE(review): the `if`/`else` headers were missing from the reviewed
    source; the `i_pred == i_succ` test is reconstructed -- confirm against
    the upstream file.
    """
    i_pred = find_next(pred_lastname, rand_lastnames)
    i_succ = find_next(succ_lastname, rand_lastnames)
    # We don't know the order of the names
    if i_pred > i_succ:
        i_pred, i_succ = i_succ, i_pred

    if i_pred == i_succ:
        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
    else:
        # randrange excludes i_succ, which may be len(rand_lastnames) and
        # in any case points at a name that is not below the successor.
        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
    promo = random.randint(100, 999)
    return "%s.%s.%d" % (firstname, lastname, promo)
85 ########################################################################
if __name__ == '__main__':
    # Command line: <script> aliases poisonous
    #   aliases   -- input file with the existing aliases, one per line
    #   poisonous -- output file receiving one generated alias per line
    #
    # NOTE(review): some lines of the usage text and whatever followed it
    # were missing from the reviewed source.  Only the surviving messages
    # are reproduced, and the exit status is a reconstruction -- confirm
    # against the upstream file.
    if len(sys.argv) != 3:
        print("Usage: %s aliases poisonous" % sys.argv[0])
        print("Generate the aliases file with:")
        print("$ mysql x4dat > aliases.txt")
        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
        sys.exit(1)

    # Parse the list of existing aliases
    firstnames, lastnames, promos = parse_aliases(sys.argv[1])

    # Generate many virtual lastnames and sort the list
    generator = WordGenerator()
    generator.load(lastnames)
    rand_lastnames = sorted(generator.generate()
                            for _ in range(100 * len(lastnames)))

    # For each original, create a new alias
    # alphabetically between this one and the next one.
    # The last name has no successor, so a sentinel that sorts after any
    # realistic name stands in for it.
    successors = lastnames[1:] + ['zzzzzzzz']
    with open(sys.argv[2], 'w') as handle:
        for firstname, lastname, successor in zip(firstnames, lastnames,
                                                  successors):
            alias = create_alias(firstname, lastname, successor,
                                 rand_lastnames)
            # One alias per line: the visible write call emitted no
            # separator between consecutive aliases.
            handle.write(alias + '\n')