X-Git-Url: http://git.polytechnique.org/?a=blobdiff_plain;ds=sidebyside;f=ekit%2Fcom%2Fswabunga%2Fspell%2Fengine%2FSpellDictionary.java;fp=ekit%2Fcom%2Fswabunga%2Fspell%2Fengine%2FSpellDictionary.java;h=19b35321a630a1123d6169db1df193864b9d5f89;hb=6dd702802493556cff5e55eb4982d23e79a30832;hp=0000000000000000000000000000000000000000;hpb=c47fa64d719ab7f5062a87fbf5cc55a7916c5c53;p=old-projects.git diff --git a/ekit/com/swabunga/spell/engine/SpellDictionary.java b/ekit/com/swabunga/spell/engine/SpellDictionary.java new file mode 100644 index 0000000..19b3532 --- /dev/null +++ b/ekit/com/swabunga/spell/engine/SpellDictionary.java @@ -0,0 +1,288 @@ +/* + * put your module comment here + * formatted with JxBeauty (c) johann.langhofer@nextra.at + */ + +package com.swabunga.spell.engine; + +import java.io.*; +import java.util.*; + +/** + * The SpellDictionary class holds the instance of the dictionary. + *

+ * This class is thread safe. Derived classes should ensure that this preserved. + *

+ *

+ * There are many open source dictionary files. For just a few see: + * http://wordlist.sourceforge.net/ + *

+ *

+ * This dictionary class reads words one per line. Make sure that your word list + * is formatted in this way (most are). + *

+ */ +public class SpellDictionary { + + + /** The replace list is used in the getSuggestions method*/ + private static final char[] replacelist = + { + 'A', + 'B', + 'X', + 'S', + 'K', + 'J', + 'T', + 'F', + 'H', + 'L', + 'M', + 'N', + 'P', + 'R', + '0' }; + + /** A field indicating the initial hash map capacity (16KB) for the main + * dictionary hash map. Interested to see what the performance of a + * smaller initial capacity is like. + */ + private final static int INITIAL_CAPACITY = 16 * 1024; + /** + * The hashmap that contains the word dictionary. The map is hashed on the doublemeta + * code. The map entry contains a LinkedList of words that have the same double meta code. + */ + protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY); + /**The reference to a Transformator, used to transform a word into it's. + * phonetic code. + */ + private Transformator tf = null; + + + /** Holds the dictionary file for appending*/ + private File dictFile = null; + + /** + * Dictionary Constructor. + */ + public SpellDictionary(Reader wordList) throws IOException { + tf = new DoubleMeta(); + createDictionary(new BufferedReader(wordList)); + } + + /** + * Dictionary Constructor for JAR files + * @author Howard Kistler + */ + public SpellDictionary(String wordListResource) throws IOException + { + tf = new DoubleMeta(); + InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource); + createDictionary(new BufferedReader(new InputStreamReader(is))); + } + + /** + * Dictionary Convienence Constructor. + */ + public SpellDictionary(File wordList) + throws FileNotFoundException, IOException { + this(new FileReader(wordList)); + dictFile = wordList; + } + + /** + * Dictionary constructor that uses an aspell phonetic file to + * build the transformation table. + */ + public SpellDictionary(File wordList, File phonetic) + throws FileNotFoundException, IOException { + tf = new GenericTransformator(phonetic); + dictFile = wordList; + createDictionary(new BufferedReader(new FileReader(wordList))); + } + + /** + * Add a word permanantly to the dictionary (and the dictionary file). + *

This needs to be made thread safe (synchronized)

+ */ + public void addWord(String word) { + putWord(word); + if (dictFile == null) + return; + try { + FileWriter w = new FileWriter(dictFile.toString(), true); + // Open with append. + w.write(word); + w.write("\n"); + w.close(); + } catch (IOException ex) { + System.out.println("Error writing to dictionary file"); + } + } + + /** + * Constructs the dictionary from a word list file. + *

+ * Each word in the reader should be on a seperate line. + *

+ * This is a very slow function. On my machine it takes quite a while to + * load the data in. I suspect that we could speed this up quite alot. + */ + protected void createDictionary(BufferedReader in) throws IOException { + String line = ""; + while (line != null) { + line = in.readLine(); + if (line != null) { + line = new String(line.toCharArray()); + putWord(line); + } + } + } + + /** + * Returns the code representing the word. + */ + public String getCode(String word) { + return tf.transform(word); + } + + /** + * Allocates a word in the dictionary + */ + protected void putWord(String word) { + String code = getCode(word); + LinkedList list = (LinkedList) mainDictionary.get(code); + if (list != null) { + list.add(word); + } else { + list = new LinkedList(); + list.add(word); + mainDictionary.put(code, list); + } + } + + /** + * Returns a list of strings (words) for the code. + */ + public LinkedList getWords(String code) { + //Check the main dictionary. + LinkedList mainDictResult = (LinkedList) mainDictionary.get(code); + if (mainDictResult == null) + return new LinkedList(); + return mainDictResult; + } + + /** + * Returns true if the word is correctly spelled against the current word list. + */ + public boolean isCorrect(String word) { + LinkedList possible = getWords(getCode(word)); + if (possible.contains(word)) + return true; + //JMH should we always try the lowercase version. If I dont then capitalised + //words are always returned as incorrect. + else if (possible.contains(word.toLowerCase())) + return true; + return false; + } + + /** + * Returns a linked list of Word objects that are the suggestions to an + * incorrect word. + *

+ * @param word Suggestions for given mispelt word + * @param threshold The lower boundary of similarity to mispelt word + * @return LinkedList a List of suggestions + */ + public LinkedList getSuggestions(String word, int threshold) { + + HashSet nearmisscodes = new HashSet(); + String code = getCode(word); + + // add all words that have the same codeword + nearmisscodes.add(code); + + // do some tranformations to pick up more results + //interchange + char[] charArray = word.toCharArray(); + for (int i = 0; i < word.length() - 1; i++) { + char a = charArray[i]; + char b = charArray[i + 1]; + charArray[i] = b; + charArray[i + 1] = a; + nearmisscodes.add(getCode(new String(charArray))); + charArray[i] = a; + charArray[i + 1] = b; + } + //change + charArray = word.toCharArray(); + for (int i = 0; i < word.length(); i++) { + char original = charArray[i]; + for (int j = 0; j < replacelist.length; j++) { + charArray[i] = replacelist[j]; + nearmisscodes.add(getCode(new String(charArray))); + } + charArray[i] = original; + } + //add + charArray = (word += " ").toCharArray(); + int iy = charArray.length - 1; + while (true) { + for (int j = 0; j < replacelist.length; j++) { + charArray[iy] = replacelist[j]; + nearmisscodes.add(getCode(new String(charArray))); + } + if (iy == 0) + break; + charArray[iy] = charArray[iy - 1]; + --iy; + } + //delete + word = word.trim(); + charArray = word.toCharArray(); + char[] charArray2 = new char[charArray.length - 1]; + for (int ix = 0; ix < charArray2.length; ix++) { + charArray2[ix] = charArray[ix]; + } + char a, b; + a = charArray[charArray.length - 1]; + int ii = charArray2.length; + while (true) { + nearmisscodes.add(getCode(new String(charArray))); + if (ii == 0) + break; + b = a; + a = charArray2[ii - 1]; + charArray2[ii - 1] = b; + --ii; + } + + LinkedList wordlist = getWordsFromCode(word, nearmisscodes); + // We sort a linkedlist at the end instead of maintaining a + // continously sorted TreeSet because everytime you add a collection + // to a treeset it has to be resorted. It's better to do this operation + // once at the end. + Collections.sort( wordlist, new Word()); + return wordlist; + } + + private LinkedList getWordsFromCode(String word, Collection codes) { + Configuration config = Configuration.getConfiguration(); + LinkedList result = new LinkedList(); + for (Iterator i = codes.iterator(); i.hasNext();) { + String code = (String) i.next(); + LinkedList simwordlist = getWords(code); + for (Iterator j = simwordlist.iterator(); j.hasNext();) { + String similar = (String) j.next(); + int distance = EditDistance.getDistance(word, similar); + if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) { + Word w = new Word(similar, distance); + result.add(w); + } + } + } + return result; + } + +}