--- /dev/null
+/*\r\r
+ * put your module comment here\r\r
+ * formatted with JxBeauty (c) johann.langhofer@nextra.at\r\r
+ */\r\r
+\r\r
+package com.swabunga.spell.engine;\r\r
+\r\r
+import java.io.*;\r\r
+import java.util.*;\r\r
+\r\r
+/**\r\r
+ * The SpellDictionary class holds the instance of the dictionary.\r\r
+ * <p>\r\r
+ * This class is thread safe. Derived classes should ensure that this preserved.\r\r
+ * </p>\r\r
+ * <p>\r\r
+ * There are many open source dictionary files. For just a few see:\r\r
+ * http://wordlist.sourceforge.net/\r\r
+ * </p>\r\r
+ * <p>\r\r
+ * This dictionary class reads words one per line. Make sure that your word list\r\r
+ * is formatted in this way (most are).\r\r
+ * </p>\r\r
+ */\r\r
+public class SpellDictionary {\r\r
+\r\r
+\r\r
+ /** The replace list is used in the getSuggestions method*/\r\r
+ private static final char[] replacelist =\r\r
+ {\r\r
+ 'A',\r\r
+ 'B',\r\r
+ 'X',\r\r
+ 'S',\r\r
+ 'K',\r\r
+ 'J',\r\r
+ 'T',\r\r
+ 'F',\r\r
+ 'H',\r\r
+ 'L',\r\r
+ 'M',\r\r
+ 'N',\r\r
+ 'P',\r\r
+ 'R',\r\r
+ '0' };\r\r
+\r\r
+ /** A field indicating the initial hash map capacity (16KB) for the main\r\r
+ * dictionary hash map. Interested to see what the performance of a\r\r
+ * smaller initial capacity is like.\r\r
+ */\r\r
+ private final static int INITIAL_CAPACITY = 16 * 1024;\r\r
+ /**\r\r
+ * The hashmap that contains the word dictionary. The map is hashed on the doublemeta\r\r
+ * code. The map entry contains a LinkedList of words that have the same double meta code.\r\r
+ */\r\r
+ protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);\r\r
+ /**The reference to a Transformator, used to transform a word into it's.\r\r
+ * phonetic code.\r\r
+ */\r\r
+ private Transformator tf = null;\r\r
+\r\r
+\r\r
+ /** Holds the dictionary file for appending*/\r\r
+ private File dictFile = null;\r\r
+\r\r
+ /**\r\r
+ * Dictionary Constructor.\r\r
+ */\r\r
+ public SpellDictionary(Reader wordList) throws IOException {\r\r
+ tf = new DoubleMeta();\r\r
+ createDictionary(new BufferedReader(wordList));\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Dictionary Constructor for JAR files\r\r
+ * @author Howard Kistler\r\r
+ */\r\r
+ public SpellDictionary(String wordListResource) throws IOException\r\r
+ {\r\r
+ tf = new DoubleMeta();\r\r
+ InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource);\r\r
+ createDictionary(new BufferedReader(new InputStreamReader(is)));\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Dictionary Convienence Constructor.\r\r
+ */\r\r
+ public SpellDictionary(File wordList)\r\r
+ throws FileNotFoundException, IOException {\r\r
+ this(new FileReader(wordList));\r\r
+ dictFile = wordList;\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Dictionary constructor that uses an aspell phonetic file to\r\r
+ * build the transformation table.\r\r
+ */\r\r
+ public SpellDictionary(File wordList, File phonetic)\r\r
+ throws FileNotFoundException, IOException {\r\r
+ tf = new GenericTransformator(phonetic);\r\r
+ dictFile = wordList;\r\r
+ createDictionary(new BufferedReader(new FileReader(wordList)));\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Add a word permanantly to the dictionary (and the dictionary file).\r\r
+ * <p>This needs to be made thread safe (synchronized)</p>\r\r
+ */\r\r
+ public void addWord(String word) {\r\r
+ putWord(word);\r\r
+ if (dictFile == null)\r\r
+ return;\r\r
+ try {\r\r
+ FileWriter w = new FileWriter(dictFile.toString(), true);\r\r
+ // Open with append.\r\r
+ w.write(word);\r\r
+ w.write("\n");\r\r
+ w.close();\r\r
+ } catch (IOException ex) {\r\r
+ System.out.println("Error writing to dictionary file");\r\r
+ }\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Constructs the dictionary from a word list file.\r\r
+ * <p>\r\r
+ * Each word in the reader should be on a seperate line.\r\r
+ * <p>\r\r
+ * This is a very slow function. On my machine it takes quite a while to\r\r
+ * load the data in. I suspect that we could speed this up quite alot.\r\r
+ */\r\r
+ protected void createDictionary(BufferedReader in) throws IOException {\r\r
+ String line = "";\r\r
+ while (line != null) {\r\r
+ line = in.readLine();\r\r
+ if (line != null) {\r\r
+ line = new String(line.toCharArray());\r\r
+ putWord(line);\r\r
+ }\r\r
+ }\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Returns the code representing the word.\r\r
+ */\r\r
+ public String getCode(String word) {\r\r
+ return tf.transform(word);\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Allocates a word in the dictionary\r\r
+ */\r\r
+ protected void putWord(String word) {\r\r
+ String code = getCode(word);\r\r
+ LinkedList list = (LinkedList) mainDictionary.get(code);\r\r
+ if (list != null) {\r\r
+ list.add(word);\r\r
+ } else {\r\r
+ list = new LinkedList();\r\r
+ list.add(word);\r\r
+ mainDictionary.put(code, list);\r\r
+ }\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Returns a list of strings (words) for the code.\r\r
+ */\r\r
+ public LinkedList getWords(String code) {\r\r
+ //Check the main dictionary.\r\r
+ LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);\r\r
+ if (mainDictResult == null)\r\r
+ return new LinkedList();\r\r
+ return mainDictResult;\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Returns true if the word is correctly spelled against the current word list.\r\r
+ */\r\r
+ public boolean isCorrect(String word) {\r\r
+ LinkedList possible = getWords(getCode(word));\r\r
+ if (possible.contains(word))\r\r
+ return true;\r\r
+ //JMH should we always try the lowercase version. If I dont then capitalised\r\r
+ //words are always returned as incorrect.\r\r
+ else if (possible.contains(word.toLowerCase()))\r\r
+ return true;\r\r
+ return false;\r\r
+ }\r\r
+\r\r
+ /**\r\r
+ * Returns a linked list of Word objects that are the suggestions to an\r\r
+ * incorrect word.\r\r
+ * <p>\r\r
+ * @param word Suggestions for given mispelt word\r\r
+ * @param threshold The lower boundary of similarity to mispelt word\r\r
+ * @return LinkedList a List of suggestions\r\r
+ */\r\r
+ public LinkedList getSuggestions(String word, int threshold) {\r\r
+\r\r
+ HashSet nearmisscodes = new HashSet();\r\r
+ String code = getCode(word);\r\r
+\r\r
+ // add all words that have the same codeword\r\r
+ nearmisscodes.add(code);\r\r
+\r\r
+ // do some tranformations to pick up more results\r\r
+ //interchange \r\r
+ char[] charArray = word.toCharArray();\r\r
+ for (int i = 0; i < word.length() - 1; i++) {\r\r
+ char a = charArray[i];\r\r
+ char b = charArray[i + 1];\r\r
+ charArray[i] = b;\r\r
+ charArray[i + 1] = a;\r\r
+ nearmisscodes.add(getCode(new String(charArray)));\r\r
+ charArray[i] = a;\r\r
+ charArray[i + 1] = b;\r\r
+ }\r\r
+ //change\r\r
+ charArray = word.toCharArray();\r\r
+ for (int i = 0; i < word.length(); i++) {\r\r
+ char original = charArray[i];\r\r
+ for (int j = 0; j < replacelist.length; j++) {\r\r
+ charArray[i] = replacelist[j];\r\r
+ nearmisscodes.add(getCode(new String(charArray)));\r\r
+ }\r\r
+ charArray[i] = original;\r\r
+ }\r\r
+ //add\r\r
+ charArray = (word += " ").toCharArray();\r\r
+ int iy = charArray.length - 1;\r\r
+ while (true) {\r\r
+ for (int j = 0; j < replacelist.length; j++) {\r\r
+ charArray[iy] = replacelist[j];\r\r
+ nearmisscodes.add(getCode(new String(charArray)));\r\r
+ }\r\r
+ if (iy == 0)\r\r
+ break;\r\r
+ charArray[iy] = charArray[iy - 1];\r\r
+ --iy;\r\r
+ }\r\r
+ //delete\r\r
+ word = word.trim();\r\r
+ charArray = word.toCharArray();\r\r
+ char[] charArray2 = new char[charArray.length - 1];\r\r
+ for (int ix = 0; ix < charArray2.length; ix++) {\r\r
+ charArray2[ix] = charArray[ix];\r\r
+ }\r\r
+ char a, b;\r\r
+ a = charArray[charArray.length - 1];\r\r
+ int ii = charArray2.length;\r\r
+ while (true) {\r\r
+ nearmisscodes.add(getCode(new String(charArray)));\r\r
+ if (ii == 0)\r\r
+ break;\r\r
+ b = a;\r\r
+ a = charArray2[ii - 1];\r\r
+ charArray2[ii - 1] = b;\r\r
+ --ii;\r\r
+ }\r\r
+\r\r
+ LinkedList wordlist = getWordsFromCode(word, nearmisscodes);\r\r
+ // We sort a linkedlist at the end instead of maintaining a\r\r
+ // continously sorted TreeSet because everytime you add a collection\r\r
+ // to a treeset it has to be resorted. It's better to do this operation\r\r
+ // once at the end.\r\r
+ Collections.sort( wordlist, new Word());\r\r
+ return wordlist;\r\r
+ }\r\r
+\r\r
+ private LinkedList getWordsFromCode(String word, Collection codes) {\r\r
+ Configuration config = Configuration.getConfiguration();\r\r
+ LinkedList result = new LinkedList();\r\r
+ for (Iterator i = codes.iterator(); i.hasNext();) {\r\r
+ String code = (String) i.next();\r\r
+ LinkedList simwordlist = getWords(code);\r\r
+ for (Iterator j = simwordlist.iterator(); j.hasNext();) {\r\r
+ String similar = (String) j.next();\r\r
+ int distance = EditDistance.getDistance(word, similar);\r\r
+ if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) {\r\r
+ Word w = new Word(similar, distance);\r\r
+ result.add(w);\r\r
+ }\r\r
+ }\r\r
+ }\r\r
+ return result;\r\r
+ }\r\r
+\r\r
+}\r\r