Commit | Line | Data |
---|---|---|
6dd70280 JL |
1 | /*\r\r |
2 | * put your module comment here\r\r | |
3 | * formatted with JxBeauty (c) johann.langhofer@nextra.at\r\r | |
4 | */\r\r | |
5 | \r\r | |
6 | package com.swabunga.spell.engine;\r\r | |
7 | \r\r | |
8 | import java.io.*;\r\r | |
9 | import java.util.*;\r\r | |
10 | \r\r | |
11 | /**\r\r | |
12 | * The SpellDictionary class holds the instance of the dictionary.\r\r | |
13 | * <p>\r\r | |
14 | * This class is thread safe. Derived classes should ensure that this preserved.\r\r | |
15 | * </p>\r\r | |
16 | * <p>\r\r | |
17 | * There are many open source dictionary files. For just a few see:\r\r | |
18 | * http://wordlist.sourceforge.net/\r\r | |
19 | * </p>\r\r | |
20 | * <p>\r\r | |
21 | * This dictionary class reads words one per line. Make sure that your word list\r\r | |
22 | * is formatted in this way (most are).\r\r | |
23 | * </p>\r\r | |
24 | */\r\r | |
25 | public class SpellDictionary {\r\r | |
26 | \r\r | |
27 | \r\r | |
28 | /** The replace list is used in the getSuggestions method*/\r\r | |
29 | private static final char[] replacelist =\r\r | |
30 | {\r\r | |
31 | 'A',\r\r | |
32 | 'B',\r\r | |
33 | 'X',\r\r | |
34 | 'S',\r\r | |
35 | 'K',\r\r | |
36 | 'J',\r\r | |
37 | 'T',\r\r | |
38 | 'F',\r\r | |
39 | 'H',\r\r | |
40 | 'L',\r\r | |
41 | 'M',\r\r | |
42 | 'N',\r\r | |
43 | 'P',\r\r | |
44 | 'R',\r\r | |
45 | '0' };\r\r | |
46 | \r\r | |
47 | /** A field indicating the initial hash map capacity (16KB) for the main\r\r | |
48 | * dictionary hash map. Interested to see what the performance of a\r\r | |
49 | * smaller initial capacity is like.\r\r | |
50 | */\r\r | |
51 | private final static int INITIAL_CAPACITY = 16 * 1024;\r\r | |
52 | /**\r\r | |
53 | * The hashmap that contains the word dictionary. The map is hashed on the doublemeta\r\r | |
54 | * code. The map entry contains a LinkedList of words that have the same double meta code.\r\r | |
55 | */\r\r | |
56 | protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);\r\r | |
57 | /**The reference to a Transformator, used to transform a word into it's.\r\r | |
58 | * phonetic code.\r\r | |
59 | */\r\r | |
60 | private Transformator tf = null;\r\r | |
61 | \r\r | |
62 | \r\r | |
63 | /** Holds the dictionary file for appending*/\r\r | |
64 | private File dictFile = null;\r\r | |
65 | \r\r | |
66 | /**\r\r | |
67 | * Dictionary Constructor.\r\r | |
68 | */\r\r | |
69 | public SpellDictionary(Reader wordList) throws IOException {\r\r | |
70 | tf = new DoubleMeta();\r\r | |
71 | createDictionary(new BufferedReader(wordList));\r\r | |
72 | }\r\r | |
73 | \r\r | |
74 | /**\r\r | |
75 | * Dictionary Constructor for JAR files\r\r | |
76 | * @author Howard Kistler\r\r | |
77 | */\r\r | |
78 | public SpellDictionary(String wordListResource) throws IOException\r\r | |
79 | {\r\r | |
80 | tf = new DoubleMeta();\r\r | |
81 | InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource);\r\r | |
82 | createDictionary(new BufferedReader(new InputStreamReader(is)));\r\r | |
83 | }\r\r | |
84 | \r\r | |
85 | /**\r\r | |
86 | * Dictionary Convienence Constructor.\r\r | |
87 | */\r\r | |
88 | public SpellDictionary(File wordList)\r\r | |
89 | throws FileNotFoundException, IOException {\r\r | |
90 | this(new FileReader(wordList));\r\r | |
91 | dictFile = wordList;\r\r | |
92 | }\r\r | |
93 | \r\r | |
94 | /**\r\r | |
95 | * Dictionary constructor that uses an aspell phonetic file to\r\r | |
96 | * build the transformation table.\r\r | |
97 | */\r\r | |
98 | public SpellDictionary(File wordList, File phonetic)\r\r | |
99 | throws FileNotFoundException, IOException {\r\r | |
100 | tf = new GenericTransformator(phonetic);\r\r | |
101 | dictFile = wordList;\r\r | |
102 | createDictionary(new BufferedReader(new FileReader(wordList)));\r\r | |
103 | }\r\r | |
104 | \r\r | |
105 | /**\r\r | |
106 | * Add a word permanantly to the dictionary (and the dictionary file).\r\r | |
107 | * <p>This needs to be made thread safe (synchronized)</p>\r\r | |
108 | */\r\r | |
109 | public void addWord(String word) {\r\r | |
110 | putWord(word);\r\r | |
111 | if (dictFile == null)\r\r | |
112 | return;\r\r | |
113 | try {\r\r | |
114 | FileWriter w = new FileWriter(dictFile.toString(), true);\r\r | |
115 | // Open with append.\r\r | |
116 | w.write(word);\r\r | |
117 | w.write("\n");\r\r | |
118 | w.close();\r\r | |
119 | } catch (IOException ex) {\r\r | |
120 | System.out.println("Error writing to dictionary file");\r\r | |
121 | }\r\r | |
122 | }\r\r | |
123 | \r\r | |
124 | /**\r\r | |
125 | * Constructs the dictionary from a word list file.\r\r | |
126 | * <p>\r\r | |
127 | * Each word in the reader should be on a seperate line.\r\r | |
128 | * <p>\r\r | |
129 | * This is a very slow function. On my machine it takes quite a while to\r\r | |
130 | * load the data in. I suspect that we could speed this up quite alot.\r\r | |
131 | */\r\r | |
132 | protected void createDictionary(BufferedReader in) throws IOException {\r\r | |
133 | String line = "";\r\r | |
134 | while (line != null) {\r\r | |
135 | line = in.readLine();\r\r | |
136 | if (line != null) {\r\r | |
137 | line = new String(line.toCharArray());\r\r | |
138 | putWord(line);\r\r | |
139 | }\r\r | |
140 | }\r\r | |
141 | }\r\r | |
142 | \r\r | |
143 | /**\r\r | |
144 | * Returns the code representing the word.\r\r | |
145 | */\r\r | |
146 | public String getCode(String word) {\r\r | |
147 | return tf.transform(word);\r\r | |
148 | }\r\r | |
149 | \r\r | |
150 | /**\r\r | |
151 | * Allocates a word in the dictionary\r\r | |
152 | */\r\r | |
153 | protected void putWord(String word) {\r\r | |
154 | String code = getCode(word);\r\r | |
155 | LinkedList list = (LinkedList) mainDictionary.get(code);\r\r | |
156 | if (list != null) {\r\r | |
157 | list.add(word);\r\r | |
158 | } else {\r\r | |
159 | list = new LinkedList();\r\r | |
160 | list.add(word);\r\r | |
161 | mainDictionary.put(code, list);\r\r | |
162 | }\r\r | |
163 | }\r\r | |
164 | \r\r | |
165 | /**\r\r | |
166 | * Returns a list of strings (words) for the code.\r\r | |
167 | */\r\r | |
168 | public LinkedList getWords(String code) {\r\r | |
169 | //Check the main dictionary.\r\r | |
170 | LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);\r\r | |
171 | if (mainDictResult == null)\r\r | |
172 | return new LinkedList();\r\r | |
173 | return mainDictResult;\r\r | |
174 | }\r\r | |
175 | \r\r | |
176 | /**\r\r | |
177 | * Returns true if the word is correctly spelled against the current word list.\r\r | |
178 | */\r\r | |
179 | public boolean isCorrect(String word) {\r\r | |
180 | LinkedList possible = getWords(getCode(word));\r\r | |
181 | if (possible.contains(word))\r\r | |
182 | return true;\r\r | |
183 | //JMH should we always try the lowercase version. If I dont then capitalised\r\r | |
184 | //words are always returned as incorrect.\r\r | |
185 | else if (possible.contains(word.toLowerCase()))\r\r | |
186 | return true;\r\r | |
187 | return false;\r\r | |
188 | }\r\r | |
189 | \r\r | |
190 | /**\r\r | |
191 | * Returns a linked list of Word objects that are the suggestions to an\r\r | |
192 | * incorrect word.\r\r | |
193 | * <p>\r\r | |
194 | * @param word Suggestions for given mispelt word\r\r | |
195 | * @param threshold The lower boundary of similarity to mispelt word\r\r | |
196 | * @return LinkedList a List of suggestions\r\r | |
197 | */\r\r | |
198 | public LinkedList getSuggestions(String word, int threshold) {\r\r | |
199 | \r\r | |
200 | HashSet nearmisscodes = new HashSet();\r\r | |
201 | String code = getCode(word);\r\r | |
202 | \r\r | |
203 | // add all words that have the same codeword\r\r | |
204 | nearmisscodes.add(code);\r\r | |
205 | \r\r | |
206 | // do some tranformations to pick up more results\r\r | |
207 | //interchange \r\r | |
208 | char[] charArray = word.toCharArray();\r\r | |
209 | for (int i = 0; i < word.length() - 1; i++) {\r\r | |
210 | char a = charArray[i];\r\r | |
211 | char b = charArray[i + 1];\r\r | |
212 | charArray[i] = b;\r\r | |
213 | charArray[i + 1] = a;\r\r | |
214 | nearmisscodes.add(getCode(new String(charArray)));\r\r | |
215 | charArray[i] = a;\r\r | |
216 | charArray[i + 1] = b;\r\r | |
217 | }\r\r | |
218 | //change\r\r | |
219 | charArray = word.toCharArray();\r\r | |
220 | for (int i = 0; i < word.length(); i++) {\r\r | |
221 | char original = charArray[i];\r\r | |
222 | for (int j = 0; j < replacelist.length; j++) {\r\r | |
223 | charArray[i] = replacelist[j];\r\r | |
224 | nearmisscodes.add(getCode(new String(charArray)));\r\r | |
225 | }\r\r | |
226 | charArray[i] = original;\r\r | |
227 | }\r\r | |
228 | //add\r\r | |
229 | charArray = (word += " ").toCharArray();\r\r | |
230 | int iy = charArray.length - 1;\r\r | |
231 | while (true) {\r\r | |
232 | for (int j = 0; j < replacelist.length; j++) {\r\r | |
233 | charArray[iy] = replacelist[j];\r\r | |
234 | nearmisscodes.add(getCode(new String(charArray)));\r\r | |
235 | }\r\r | |
236 | if (iy == 0)\r\r | |
237 | break;\r\r | |
238 | charArray[iy] = charArray[iy - 1];\r\r | |
239 | --iy;\r\r | |
240 | }\r\r | |
241 | //delete\r\r | |
242 | word = word.trim();\r\r | |
243 | charArray = word.toCharArray();\r\r | |
244 | char[] charArray2 = new char[charArray.length - 1];\r\r | |
245 | for (int ix = 0; ix < charArray2.length; ix++) {\r\r | |
246 | charArray2[ix] = charArray[ix];\r\r | |
247 | }\r\r | |
248 | char a, b;\r\r | |
249 | a = charArray[charArray.length - 1];\r\r | |
250 | int ii = charArray2.length;\r\r | |
251 | while (true) {\r\r | |
252 | nearmisscodes.add(getCode(new String(charArray)));\r\r | |
253 | if (ii == 0)\r\r | |
254 | break;\r\r | |
255 | b = a;\r\r | |
256 | a = charArray2[ii - 1];\r\r | |
257 | charArray2[ii - 1] = b;\r\r | |
258 | --ii;\r\r | |
259 | }\r\r | |
260 | \r\r | |
261 | LinkedList wordlist = getWordsFromCode(word, nearmisscodes);\r\r | |
262 | // We sort a linkedlist at the end instead of maintaining a\r\r | |
263 | // continously sorted TreeSet because everytime you add a collection\r\r | |
264 | // to a treeset it has to be resorted. It's better to do this operation\r\r | |
265 | // once at the end.\r\r | |
266 | Collections.sort( wordlist, new Word());\r\r | |
267 | return wordlist;\r\r | |
268 | }\r\r | |
269 | \r\r | |
270 | private LinkedList getWordsFromCode(String word, Collection codes) {\r\r | |
271 | Configuration config = Configuration.getConfiguration();\r\r | |
272 | LinkedList result = new LinkedList();\r\r | |
273 | for (Iterator i = codes.iterator(); i.hasNext();) {\r\r | |
274 | String code = (String) i.next();\r\r | |
275 | LinkedList simwordlist = getWords(code);\r\r | |
276 | for (Iterator j = simwordlist.iterator(); j.hasNext();) {\r\r | |
277 | String similar = (String) j.next();\r\r | |
278 | int distance = EditDistance.getDistance(word, similar);\r\r | |
279 | if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) {\r\r | |
280 | Word w = new Word(similar, distance);\r\r | |
281 | result.add(w);\r\r | |
282 | }\r\r | |
283 | }\r\r | |
284 | }\r\r | |
285 | return result;\r\r | |
286 | }\r\r | |
287 | \r\r | |
288 | }\r\r |