Add mYk's poisonous email generator.
author Florent Bruneau <florent.bruneau@polytechnique.org>
Sun, 19 Oct 2008 12:40:43 +0000 (14:40 +0200)
committer Florent Bruneau <florent.bruneau@polytechnique.org>
Sun, 19 Oct 2008 12:40:43 +0000 (14:40 +0200)
Signed-off-by: Florent Bruneau <florent.bruneau@polytechnique.org>
bin/poisonous_email_generator.py [new file with mode: 0755]

diff --git a/bin/poisonous_email_generator.py b/bin/poisonous_email_generator.py
new file mode 100755 (executable)
index 0000000..fb6e8eb
--- /dev/null
+++ b/bin/poisonous_email_generator.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2008 Aymeric Augustin
+# Released under the GPL
+
+import sys, random, re
+
+########################################################################
+
+# A random word generator using Markov chains
+
+class WordGenerator:
+    """Random word generator backed by a character-level Markov chain.
+
+    Each run of `order' consecutive characters (a prefix) is mapped to the
+    list of characters that followed it in the loaded words.  Followers are
+    stored with repetition, so a uniform random pick among them reproduces
+    the frequencies seen in the corpus.
+    """
+
+    def __init__(self, order=3, special=u'\n'):
+        """Create an empty model.
+
+        order   -- number of characters of context used to pick the next one
+        special -- marker used to pad the start of every word and to flag
+                   its end; it is compared against single characters, so it
+                   is expected to be one character long
+        """
+        self.order = order
+        self.special = special
+        # Maps a prefix to the list of characters observed right after it
+        self.markov = {}
+
+    def load(self, corpus):
+        """Add every word of the iterable `corpus' to the model."""
+        for word in corpus:
+            # Pad the start so that the first real characters have a prefix,
+            # and append the marker so that the model learns where words end
+            word = self.special * self.order + word.strip() + self.special
+            for pos in range(len(word) - self.order):
+                prefix = word[pos:pos + self.order]
+                suffix = word[pos + self.order]
+                # setdefault() instead of dict.has_key(), which is
+                # deprecated and no longer exists in Python 3
+                self.markov.setdefault(prefix, []).append(suffix)
+
+    def generate(self):
+        """Return one random word drawn from the model.
+
+        Raises KeyError when no word has been loaded.
+        """
+        word = self.special * self.order
+        while True:
+            c = random.choice(self.markov[word[-self.order:]])
+            if c == self.special:
+                # End marker drawn: drop the leading padding and stop
+                return word[self.order:]
+            word += c
+
+########################################################################
+
+def parse_aliases(file):
+    """Read a list of aliases and split each one into its components.
+
+    `file' is the path of a text file holding one alias per line, of the
+    form firstname.lastname.promo where promo is made of four digits.
+    Lines are processed in sorted order; a line that cannot be parsed is
+    reported with a warning on standard output and skipped.
+
+    Returns three parallel lists of strings: firstnames, lastnames, promos.
+    """
+    handle = open(file, 'r') # aliases are ASCII only
+    try:
+        aliases = handle.readlines()
+    finally:
+        # Closed exactly once, even when reading fails
+        handle.close()
+    aliases.sort()
+    # The separators are escaped: an unescaped `.' matches any character,
+    # which let an alias with a missing dot be split at an arbitrary letter
+    alias_re = re.compile(r'([a-z\-]+)\.([a-z\-]+)\.([0-9]{4})')
+    firstnames = []
+    lastnames = []
+    promos = []
+    for alias in aliases:
+        alias = alias.rstrip()
+        match = alias_re.match(alias)
+        if match is None:
+            # A single-argument print() gives the same output with the
+            # Python 2 statement and the Python 3 function
+            print("Warning: could not parse alias '%s'" % alias)
+            continue
+        firstnames.append(match.group(1))
+        lastnames.append(match.group(2))
+        promos.append(match.group(3))
+    return firstnames, lastnames, promos
+
+def find_next(value, array, pmin=0, pmax=-1):
+    """Return the index of the first item of `array' strictly greater than `value'.
+
+    `array' must be sorted in ascending order.  Only array[pmin:pmax] is
+    searched; the default pmax of -1 stands for len(array).  When no item
+    in that range is greater than `value', pmax is returned, which is
+    len(array) for a default call and therefore not a valid index.
+    """
+    # Function-scope import keeps this definition self-contained
+    import bisect
+    if pmax == -1: pmax = len(array)
+    # bisect_right implements the documented contract directly.  The former
+    # hand-written recursion returned the index of an item equal to `value',
+    # never returned 0 when `value' came before the first item, failed on an
+    # empty array and computed its midpoint with `/', which yields a float
+    # (unusable as an index) under Python 3.
+    return bisect.bisect_right(array, value, pmin, pmax)
+def create_alias(firstname, pred_lastname, succ_lastname, rand_lastnames):
+    """Build a fake alias for `firstname' with a randomly chosen lastname.
+
+    firstname      -- first component of the alias, used as is
+    pred_lastname  -- lastname bounding the choice on one side
+    succ_lastname  -- lastname bounding the choice on the other side
+    rand_lastnames -- sorted list of candidate lastnames
+
+    The lastname is taken from `rand_lastnames' at a position between the
+    ones find_next() reports for the two bounds.  Returns a string of the
+    form firstname.lastname.promo where promo is a random 3-digit number.
+    """
+    i_pred = find_next(pred_lastname, rand_lastnames)
+    i_succ = find_next(succ_lastname, rand_lastnames)
+    # We don't know the order of the names
+    if i_pred > i_succ: i_pred, i_succ = i_succ, i_pred
+    if i_pred == i_succ:
+        # Hack in edge case: no candidate lies between the two bounds, so
+        # extend the first bound with a random candidate instead
+        lastname = "%s-%s" % (pred_lastname, random.choice(rand_lastnames))
+    else:
+        # randrange() excludes its upper bound.  randint() includes it, and
+        # i_succ can be len(rand_lastnames), which raised IndexError.
+        lastname = rand_lastnames[random.randrange(i_pred, i_succ)]
+    promo = random.randint(100, 999)
+    return "%s.%s.%d" % (firstname, lastname, promo)
+
+########################################################################
+
+if __name__ == '__main__':
+
+    # Check arguments: the aliases file to read and the output file to write
+    if len(sys.argv) != 3:
+        # Single-argument print() calls give the same output with the
+        # Python 2 statement and the Python 3 function
+        print("Usage: %s aliases poisonous" % sys.argv[0])
+        print("")
+        print("Generate the aliases file with:")
+        print("$ mysql x4dat > aliases.txt")
+        print("SELECT alias FROM aliases WHERE type = 'a_vie';")
+        print("^D")
+        sys.exit(1)
+
+    # Parse the list of existing aliases and sort it
+    firstnames, lastnames, promos = parse_aliases(sys.argv[1])
+
+    # Generate many virtual lastnames and sort the list
+    generator = WordGenerator()
+    generator.load(lastnames)
+    rand_lastnames = [generator.generate() for i in range(100 * len(lastnames))]
+    rand_lastnames.sort()
+
+    # Pair every lastname with the one that follows it; the last one is
+    # paired with a sentinel so that it has a successor too
+    successors = lastnames[1:] + ['zzzzzzzz']
+
+    # For each original, create a new alias
+    # alphabetically between this one and the next one
+    handle = open(sys.argv[2], 'w')
+    try:
+        for firstname, lastname, successor in zip(firstnames, lastnames, successors):
+            handle.write(create_alias(firstname, lastname, successor, rand_lastnames))
+            handle.write('\n')
+    finally:
+        # Close the output even when alias creation raises
+        handle.close()
+
+
+