| 1 | /*\r\r |
| 2 | * put your module comment here\r\r |
| 3 | * formatted with JxBeauty (c) johann.langhofer@nextra.at\r\r |
| 4 | */\r\r |
| 5 | \r\r |
| 6 | package com.swabunga.spell.engine;\r\r |
| 7 | \r\r |
| 8 | import java.io.*;\r\r |
| 9 | import java.util.*;\r\r |
| 10 | \r\r |
| 11 | /**\r\r |
| 12 | * The SpellDictionary class holds the instance of the dictionary.\r\r |
| 13 | * <p>\r\r |
| 14 | * This class is thread safe. Derived classes should ensure that this preserved.\r\r |
| 15 | * </p>\r\r |
| 16 | * <p>\r\r |
| 17 | * There are many open source dictionary files. For just a few see:\r\r |
| 18 | * http://wordlist.sourceforge.net/\r\r |
| 19 | * </p>\r\r |
| 20 | * <p>\r\r |
| 21 | * This dictionary class reads words one per line. Make sure that your word list\r\r |
| 22 | * is formatted in this way (most are).\r\r |
| 23 | * </p>\r\r |
| 24 | */\r\r |
| 25 | public class SpellDictionary {\r\r |
| 26 | \r\r |
| 27 | \r\r |
| 28 | /** The replace list is used in the getSuggestions method*/\r\r |
| 29 | private static final char[] replacelist =\r\r |
| 30 | {\r\r |
| 31 | 'A',\r\r |
| 32 | 'B',\r\r |
| 33 | 'X',\r\r |
| 34 | 'S',\r\r |
| 35 | 'K',\r\r |
| 36 | 'J',\r\r |
| 37 | 'T',\r\r |
| 38 | 'F',\r\r |
| 39 | 'H',\r\r |
| 40 | 'L',\r\r |
| 41 | 'M',\r\r |
| 42 | 'N',\r\r |
| 43 | 'P',\r\r |
| 44 | 'R',\r\r |
| 45 | '0' };\r\r |
| 46 | \r\r |
| 47 | /** A field indicating the initial hash map capacity (16KB) for the main\r\r |
| 48 | * dictionary hash map. Interested to see what the performance of a\r\r |
| 49 | * smaller initial capacity is like.\r\r |
| 50 | */\r\r |
| 51 | private final static int INITIAL_CAPACITY = 16 * 1024;\r\r |
| 52 | /**\r\r |
| 53 | * The hashmap that contains the word dictionary. The map is hashed on the doublemeta\r\r |
| 54 | * code. The map entry contains a LinkedList of words that have the same double meta code.\r\r |
| 55 | */\r\r |
| 56 | protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);\r\r |
| 57 | /**The reference to a Transformator, used to transform a word into it's.\r\r |
| 58 | * phonetic code.\r\r |
| 59 | */\r\r |
| 60 | private Transformator tf = null;\r\r |
| 61 | \r\r |
| 62 | \r\r |
| 63 | /** Holds the dictionary file for appending*/\r\r |
| 64 | private File dictFile = null;\r\r |
| 65 | \r\r |
| 66 | /**\r\r |
| 67 | * Dictionary Constructor.\r\r |
| 68 | */\r\r |
| 69 | public SpellDictionary(Reader wordList) throws IOException {\r\r |
| 70 | tf = new DoubleMeta();\r\r |
| 71 | createDictionary(new BufferedReader(wordList));\r\r |
| 72 | }\r\r |
| 73 | \r\r |
| 74 | /**\r\r |
| 75 | * Dictionary Constructor for JAR files\r\r |
| 76 | * @author Howard Kistler\r\r |
| 77 | */\r\r |
| 78 | public SpellDictionary(String wordListResource) throws IOException\r\r |
| 79 | {\r\r |
| 80 | tf = new DoubleMeta();\r\r |
| 81 | InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource);\r\r |
| 82 | createDictionary(new BufferedReader(new InputStreamReader(is)));\r\r |
| 83 | }\r\r |
| 84 | \r\r |
| 85 | /**\r\r |
| 86 | * Dictionary Convienence Constructor.\r\r |
| 87 | */\r\r |
| 88 | public SpellDictionary(File wordList)\r\r |
| 89 | throws FileNotFoundException, IOException {\r\r |
| 90 | this(new FileReader(wordList));\r\r |
| 91 | dictFile = wordList;\r\r |
| 92 | }\r\r |
| 93 | \r\r |
| 94 | /**\r\r |
| 95 | * Dictionary constructor that uses an aspell phonetic file to\r\r |
| 96 | * build the transformation table.\r\r |
| 97 | */\r\r |
| 98 | public SpellDictionary(File wordList, File phonetic)\r\r |
| 99 | throws FileNotFoundException, IOException {\r\r |
| 100 | tf = new GenericTransformator(phonetic);\r\r |
| 101 | dictFile = wordList;\r\r |
| 102 | createDictionary(new BufferedReader(new FileReader(wordList)));\r\r |
| 103 | }\r\r |
| 104 | \r\r |
| 105 | /**\r\r |
| 106 | * Add a word permanantly to the dictionary (and the dictionary file).\r\r |
| 107 | * <p>This needs to be made thread safe (synchronized)</p>\r\r |
| 108 | */\r\r |
| 109 | public void addWord(String word) {\r\r |
| 110 | putWord(word);\r\r |
| 111 | if (dictFile == null)\r\r |
| 112 | return;\r\r |
| 113 | try {\r\r |
| 114 | FileWriter w = new FileWriter(dictFile.toString(), true);\r\r |
| 115 | // Open with append.\r\r |
| 116 | w.write(word);\r\r |
| 117 | w.write("\n");\r\r |
| 118 | w.close();\r\r |
| 119 | } catch (IOException ex) {\r\r |
| 120 | System.out.println("Error writing to dictionary file");\r\r |
| 121 | }\r\r |
| 122 | }\r\r |
| 123 | \r\r |
| 124 | /**\r\r |
| 125 | * Constructs the dictionary from a word list file.\r\r |
| 126 | * <p>\r\r |
| 127 | * Each word in the reader should be on a seperate line.\r\r |
| 128 | * <p>\r\r |
| 129 | * This is a very slow function. On my machine it takes quite a while to\r\r |
| 130 | * load the data in. I suspect that we could speed this up quite alot.\r\r |
| 131 | */\r\r |
| 132 | protected void createDictionary(BufferedReader in) throws IOException {\r\r |
| 133 | String line = "";\r\r |
| 134 | while (line != null) {\r\r |
| 135 | line = in.readLine();\r\r |
| 136 | if (line != null) {\r\r |
| 137 | line = new String(line.toCharArray());\r\r |
| 138 | putWord(line);\r\r |
| 139 | }\r\r |
| 140 | }\r\r |
| 141 | }\r\r |
| 142 | \r\r |
| 143 | /**\r\r |
| 144 | * Returns the code representing the word.\r\r |
| 145 | */\r\r |
| 146 | public String getCode(String word) {\r\r |
| 147 | return tf.transform(word);\r\r |
| 148 | }\r\r |
| 149 | \r\r |
| 150 | /**\r\r |
| 151 | * Allocates a word in the dictionary\r\r |
| 152 | */\r\r |
| 153 | protected void putWord(String word) {\r\r |
| 154 | String code = getCode(word);\r\r |
| 155 | LinkedList list = (LinkedList) mainDictionary.get(code);\r\r |
| 156 | if (list != null) {\r\r |
| 157 | list.add(word);\r\r |
| 158 | } else {\r\r |
| 159 | list = new LinkedList();\r\r |
| 160 | list.add(word);\r\r |
| 161 | mainDictionary.put(code, list);\r\r |
| 162 | }\r\r |
| 163 | }\r\r |
| 164 | \r\r |
| 165 | /**\r\r |
| 166 | * Returns a list of strings (words) for the code.\r\r |
| 167 | */\r\r |
| 168 | public LinkedList getWords(String code) {\r\r |
| 169 | //Check the main dictionary.\r\r |
| 170 | LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);\r\r |
| 171 | if (mainDictResult == null)\r\r |
| 172 | return new LinkedList();\r\r |
| 173 | return mainDictResult;\r\r |
| 174 | }\r\r |
| 175 | \r\r |
| 176 | /**\r\r |
| 177 | * Returns true if the word is correctly spelled against the current word list.\r\r |
| 178 | */\r\r |
| 179 | public boolean isCorrect(String word) {\r\r |
| 180 | LinkedList possible = getWords(getCode(word));\r\r |
| 181 | if (possible.contains(word))\r\r |
| 182 | return true;\r\r |
| 183 | //JMH should we always try the lowercase version. If I dont then capitalised\r\r |
| 184 | //words are always returned as incorrect.\r\r |
| 185 | else if (possible.contains(word.toLowerCase()))\r\r |
| 186 | return true;\r\r |
| 187 | return false;\r\r |
| 188 | }\r\r |
| 189 | \r\r |
| 190 | /**\r\r |
| 191 | * Returns a linked list of Word objects that are the suggestions to an\r\r |
| 192 | * incorrect word.\r\r |
| 193 | * <p>\r\r |
| 194 | * @param word Suggestions for given mispelt word\r\r |
| 195 | * @param threshold The lower boundary of similarity to mispelt word\r\r |
| 196 | * @return LinkedList a List of suggestions\r\r |
| 197 | */\r\r |
| 198 | public LinkedList getSuggestions(String word, int threshold) {\r\r |
| 199 | \r\r |
| 200 | HashSet nearmisscodes = new HashSet();\r\r |
| 201 | String code = getCode(word);\r\r |
| 202 | \r\r |
| 203 | // add all words that have the same codeword\r\r |
| 204 | nearmisscodes.add(code);\r\r |
| 205 | \r\r |
| 206 | // do some tranformations to pick up more results\r\r |
| 207 | //interchange \r\r |
| 208 | char[] charArray = word.toCharArray();\r\r |
| 209 | for (int i = 0; i < word.length() - 1; i++) {\r\r |
| 210 | char a = charArray[i];\r\r |
| 211 | char b = charArray[i + 1];\r\r |
| 212 | charArray[i] = b;\r\r |
| 213 | charArray[i + 1] = a;\r\r |
| 214 | nearmisscodes.add(getCode(new String(charArray)));\r\r |
| 215 | charArray[i] = a;\r\r |
| 216 | charArray[i + 1] = b;\r\r |
| 217 | }\r\r |
| 218 | //change\r\r |
| 219 | charArray = word.toCharArray();\r\r |
| 220 | for (int i = 0; i < word.length(); i++) {\r\r |
| 221 | char original = charArray[i];\r\r |
| 222 | for (int j = 0; j < replacelist.length; j++) {\r\r |
| 223 | charArray[i] = replacelist[j];\r\r |
| 224 | nearmisscodes.add(getCode(new String(charArray)));\r\r |
| 225 | }\r\r |
| 226 | charArray[i] = original;\r\r |
| 227 | }\r\r |
| 228 | //add\r\r |
| 229 | charArray = (word += " ").toCharArray();\r\r |
| 230 | int iy = charArray.length - 1;\r\r |
| 231 | while (true) {\r\r |
| 232 | for (int j = 0; j < replacelist.length; j++) {\r\r |
| 233 | charArray[iy] = replacelist[j];\r\r |
| 234 | nearmisscodes.add(getCode(new String(charArray)));\r\r |
| 235 | }\r\r |
| 236 | if (iy == 0)\r\r |
| 237 | break;\r\r |
| 238 | charArray[iy] = charArray[iy - 1];\r\r |
| 239 | --iy;\r\r |
| 240 | }\r\r |
| 241 | //delete\r\r |
| 242 | word = word.trim();\r\r |
| 243 | charArray = word.toCharArray();\r\r |
| 244 | char[] charArray2 = new char[charArray.length - 1];\r\r |
| 245 | for (int ix = 0; ix < charArray2.length; ix++) {\r\r |
| 246 | charArray2[ix] = charArray[ix];\r\r |
| 247 | }\r\r |
| 248 | char a, b;\r\r |
| 249 | a = charArray[charArray.length - 1];\r\r |
| 250 | int ii = charArray2.length;\r\r |
| 251 | while (true) {\r\r |
| 252 | nearmisscodes.add(getCode(new String(charArray)));\r\r |
| 253 | if (ii == 0)\r\r |
| 254 | break;\r\r |
| 255 | b = a;\r\r |
| 256 | a = charArray2[ii - 1];\r\r |
| 257 | charArray2[ii - 1] = b;\r\r |
| 258 | --ii;\r\r |
| 259 | }\r\r |
| 260 | \r\r |
| 261 | LinkedList wordlist = getWordsFromCode(word, nearmisscodes);\r\r |
| 262 | // We sort a linkedlist at the end instead of maintaining a\r\r |
| 263 | // continously sorted TreeSet because everytime you add a collection\r\r |
| 264 | // to a treeset it has to be resorted. It's better to do this operation\r\r |
| 265 | // once at the end.\r\r |
| 266 | Collections.sort( wordlist, new Word());\r\r |
| 267 | return wordlist;\r\r |
| 268 | }\r\r |
| 269 | \r\r |
| 270 | private LinkedList getWordsFromCode(String word, Collection codes) {\r\r |
| 271 | Configuration config = Configuration.getConfiguration();\r\r |
| 272 | LinkedList result = new LinkedList();\r\r |
| 273 | for (Iterator i = codes.iterator(); i.hasNext();) {\r\r |
| 274 | String code = (String) i.next();\r\r |
| 275 | LinkedList simwordlist = getWords(code);\r\r |
| 276 | for (Iterator j = simwordlist.iterator(); j.hasNext();) {\r\r |
| 277 | String similar = (String) j.next();\r\r |
| 278 | int distance = EditDistance.getDistance(word, similar);\r\r |
| 279 | if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) {\r\r |
| 280 | Word w = new Word(similar, distance);\r\r |
| 281 | result.add(w);\r\r |
| 282 | }\r\r |
| 283 | }\r\r |
| 284 | }\r\r |
| 285 | return result;\r\r |
| 286 | }\r\r |
| 287 | \r\r |
| 288 | }\r\r |