Initial revision
[old-projects.git] / ekit / com / swabunga / spell / engine / SpellDictionary.java
CommitLineData
6dd70280
JL
1/*\r\r
2 * put your module comment here\r\r
3 * formatted with JxBeauty (c) johann.langhofer@nextra.at\r\r
4 */\r\r
5\r\r
6package com.swabunga.spell.engine;\r\r
7\r\r
8import java.io.*;\r\r
9import java.util.*;\r\r
10\r\r
11/**\r\r
12 * The SpellDictionary class holds the instance of the dictionary.\r\r
13 * <p>\r\r
14 * This class is thread safe. Derived classes should ensure that this preserved.\r\r
15 * </p>\r\r
16 * <p>\r\r
17 * There are many open source dictionary files. For just a few see:\r\r
18 * http://wordlist.sourceforge.net/\r\r
19 * </p>\r\r
20 * <p>\r\r
21 * This dictionary class reads words one per line. Make sure that your word list\r\r
22 * is formatted in this way (most are).\r\r
23 * </p>\r\r
24 */\r\r
25public class SpellDictionary {\r\r
26\r\r
27\r\r
28 /** The replace list is used in the getSuggestions method*/\r\r
29 private static final char[] replacelist =\r\r
30 {\r\r
31 'A',\r\r
32 'B',\r\r
33 'X',\r\r
34 'S',\r\r
35 'K',\r\r
36 'J',\r\r
37 'T',\r\r
38 'F',\r\r
39 'H',\r\r
40 'L',\r\r
41 'M',\r\r
42 'N',\r\r
43 'P',\r\r
44 'R',\r\r
45 '0' };\r\r
46\r\r
47 /** A field indicating the initial hash map capacity (16KB) for the main\r\r
48 * dictionary hash map. Interested to see what the performance of a\r\r
49 * smaller initial capacity is like.\r\r
50 */\r\r
51 private final static int INITIAL_CAPACITY = 16 * 1024;\r\r
52 /**\r\r
53 * The hashmap that contains the word dictionary. The map is hashed on the doublemeta\r\r
54 * code. The map entry contains a LinkedList of words that have the same double meta code.\r\r
55 */\r\r
56 protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);\r\r
57 /**The reference to a Transformator, used to transform a word into it's.\r\r
58 * phonetic code.\r\r
59 */\r\r
60 private Transformator tf = null;\r\r
61\r\r
62\r\r
63 /** Holds the dictionary file for appending*/\r\r
64 private File dictFile = null;\r\r
65\r\r
66 /**\r\r
67 * Dictionary Constructor.\r\r
68 */\r\r
69 public SpellDictionary(Reader wordList) throws IOException {\r\r
70 tf = new DoubleMeta();\r\r
71 createDictionary(new BufferedReader(wordList));\r\r
72 }\r\r
73\r\r
74 /**\r\r
75 * Dictionary Constructor for JAR files\r\r
76 * @author Howard Kistler\r\r
77 */\r\r
78 public SpellDictionary(String wordListResource) throws IOException\r\r
79 {\r\r
80 tf = new DoubleMeta();\r\r
81 InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource);\r\r
82 createDictionary(new BufferedReader(new InputStreamReader(is)));\r\r
83 }\r\r
84\r\r
85 /**\r\r
86 * Dictionary Convienence Constructor.\r\r
87 */\r\r
88 public SpellDictionary(File wordList)\r\r
89 throws FileNotFoundException, IOException {\r\r
90 this(new FileReader(wordList));\r\r
91 dictFile = wordList;\r\r
92 }\r\r
93\r\r
94 /**\r\r
95 * Dictionary constructor that uses an aspell phonetic file to\r\r
96 * build the transformation table.\r\r
97 */\r\r
98 public SpellDictionary(File wordList, File phonetic)\r\r
99 throws FileNotFoundException, IOException {\r\r
100 tf = new GenericTransformator(phonetic);\r\r
101 dictFile = wordList;\r\r
102 createDictionary(new BufferedReader(new FileReader(wordList)));\r\r
103 }\r\r
104\r\r
105 /**\r\r
106 * Add a word permanantly to the dictionary (and the dictionary file).\r\r
107 * <p>This needs to be made thread safe (synchronized)</p>\r\r
108 */\r\r
109 public void addWord(String word) {\r\r
110 putWord(word);\r\r
111 if (dictFile == null)\r\r
112 return;\r\r
113 try {\r\r
114 FileWriter w = new FileWriter(dictFile.toString(), true);\r\r
115 // Open with append.\r\r
116 w.write(word);\r\r
117 w.write("\n");\r\r
118 w.close();\r\r
119 } catch (IOException ex) {\r\r
120 System.out.println("Error writing to dictionary file");\r\r
121 }\r\r
122 }\r\r
123\r\r
124 /**\r\r
125 * Constructs the dictionary from a word list file.\r\r
126 * <p>\r\r
127 * Each word in the reader should be on a seperate line.\r\r
128 * <p>\r\r
129 * This is a very slow function. On my machine it takes quite a while to\r\r
130 * load the data in. I suspect that we could speed this up quite alot.\r\r
131 */\r\r
132 protected void createDictionary(BufferedReader in) throws IOException {\r\r
133 String line = "";\r\r
134 while (line != null) {\r\r
135 line = in.readLine();\r\r
136 if (line != null) {\r\r
137 line = new String(line.toCharArray());\r\r
138 putWord(line);\r\r
139 }\r\r
140 }\r\r
141 }\r\r
142\r\r
143 /**\r\r
144 * Returns the code representing the word.\r\r
145 */\r\r
146 public String getCode(String word) {\r\r
147 return tf.transform(word);\r\r
148 }\r\r
149\r\r
150 /**\r\r
151 * Allocates a word in the dictionary\r\r
152 */\r\r
153 protected void putWord(String word) {\r\r
154 String code = getCode(word);\r\r
155 LinkedList list = (LinkedList) mainDictionary.get(code);\r\r
156 if (list != null) {\r\r
157 list.add(word);\r\r
158 } else {\r\r
159 list = new LinkedList();\r\r
160 list.add(word);\r\r
161 mainDictionary.put(code, list);\r\r
162 }\r\r
163 }\r\r
164\r\r
165 /**\r\r
166 * Returns a list of strings (words) for the code.\r\r
167 */\r\r
168 public LinkedList getWords(String code) {\r\r
169 //Check the main dictionary.\r\r
170 LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);\r\r
171 if (mainDictResult == null)\r\r
172 return new LinkedList();\r\r
173 return mainDictResult;\r\r
174 }\r\r
175\r\r
176 /**\r\r
177 * Returns true if the word is correctly spelled against the current word list.\r\r
178 */\r\r
179 public boolean isCorrect(String word) {\r\r
180 LinkedList possible = getWords(getCode(word));\r\r
181 if (possible.contains(word))\r\r
182 return true;\r\r
183 //JMH should we always try the lowercase version. If I dont then capitalised\r\r
184 //words are always returned as incorrect.\r\r
185 else if (possible.contains(word.toLowerCase()))\r\r
186 return true;\r\r
187 return false;\r\r
188 }\r\r
189\r\r
190 /**\r\r
191 * Returns a linked list of Word objects that are the suggestions to an\r\r
192 * incorrect word.\r\r
193 * <p>\r\r
194 * @param word Suggestions for given mispelt word\r\r
195 * @param threshold The lower boundary of similarity to mispelt word\r\r
196 * @return LinkedList a List of suggestions\r\r
197 */\r\r
198 public LinkedList getSuggestions(String word, int threshold) {\r\r
199\r\r
200 HashSet nearmisscodes = new HashSet();\r\r
201 String code = getCode(word);\r\r
202\r\r
203 // add all words that have the same codeword\r\r
204 nearmisscodes.add(code);\r\r
205\r\r
206 // do some tranformations to pick up more results\r\r
207 //interchange \r\r
208 char[] charArray = word.toCharArray();\r\r
209 for (int i = 0; i < word.length() - 1; i++) {\r\r
210 char a = charArray[i];\r\r
211 char b = charArray[i + 1];\r\r
212 charArray[i] = b;\r\r
213 charArray[i + 1] = a;\r\r
214 nearmisscodes.add(getCode(new String(charArray)));\r\r
215 charArray[i] = a;\r\r
216 charArray[i + 1] = b;\r\r
217 }\r\r
218 //change\r\r
219 charArray = word.toCharArray();\r\r
220 for (int i = 0; i < word.length(); i++) {\r\r
221 char original = charArray[i];\r\r
222 for (int j = 0; j < replacelist.length; j++) {\r\r
223 charArray[i] = replacelist[j];\r\r
224 nearmisscodes.add(getCode(new String(charArray)));\r\r
225 }\r\r
226 charArray[i] = original;\r\r
227 }\r\r
228 //add\r\r
229 charArray = (word += " ").toCharArray();\r\r
230 int iy = charArray.length - 1;\r\r
231 while (true) {\r\r
232 for (int j = 0; j < replacelist.length; j++) {\r\r
233 charArray[iy] = replacelist[j];\r\r
234 nearmisscodes.add(getCode(new String(charArray)));\r\r
235 }\r\r
236 if (iy == 0)\r\r
237 break;\r\r
238 charArray[iy] = charArray[iy - 1];\r\r
239 --iy;\r\r
240 }\r\r
241 //delete\r\r
242 word = word.trim();\r\r
243 charArray = word.toCharArray();\r\r
244 char[] charArray2 = new char[charArray.length - 1];\r\r
245 for (int ix = 0; ix < charArray2.length; ix++) {\r\r
246 charArray2[ix] = charArray[ix];\r\r
247 }\r\r
248 char a, b;\r\r
249 a = charArray[charArray.length - 1];\r\r
250 int ii = charArray2.length;\r\r
251 while (true) {\r\r
252 nearmisscodes.add(getCode(new String(charArray)));\r\r
253 if (ii == 0)\r\r
254 break;\r\r
255 b = a;\r\r
256 a = charArray2[ii - 1];\r\r
257 charArray2[ii - 1] = b;\r\r
258 --ii;\r\r
259 }\r\r
260\r\r
261 LinkedList wordlist = getWordsFromCode(word, nearmisscodes);\r\r
262 // We sort a linkedlist at the end instead of maintaining a\r\r
263 // continously sorted TreeSet because everytime you add a collection\r\r
264 // to a treeset it has to be resorted. It's better to do this operation\r\r
265 // once at the end.\r\r
266 Collections.sort( wordlist, new Word());\r\r
267 return wordlist;\r\r
268 }\r\r
269\r\r
270 private LinkedList getWordsFromCode(String word, Collection codes) {\r\r
271 Configuration config = Configuration.getConfiguration();\r\r
272 LinkedList result = new LinkedList();\r\r
273 for (Iterator i = codes.iterator(); i.hasNext();) {\r\r
274 String code = (String) i.next();\r\r
275 LinkedList simwordlist = getWords(code);\r\r
276 for (Iterator j = simwordlist.iterator(); j.hasNext();) {\r\r
277 String similar = (String) j.next();\r\r
278 int distance = EditDistance.getDistance(word, similar);\r\r
279 if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) {\r\r
280 Word w = new Word(similar, distance);\r\r
281 result.add(w);\r\r
282 }\r\r
283 }\r\r
284 }\r\r
285 return result;\r\r
286 }\r\r
287\r\r
288}\r\r