Initial revision
[old-projects.git] / ekit / com / swabunga / spell / engine / SpellDictionary.java
1 /*
2 * put your module comment here
3 * formatted with JxBeauty (c) johann.langhofer@nextra.at
4 */
5
6 package com.swabunga.spell.engine;
7
8 import java.io.*;
9 import java.util.*;
10
11 /**
12 * The SpellDictionary class holds the instance of the dictionary.
13 * <p>
14 * This class is thread safe. Derived classes should ensure that this preserved.
15 * </p>
16 * <p>
17 * There are many open source dictionary files. For just a few see:
18 * http://wordlist.sourceforge.net/
19 * </p>
20 * <p>
21 * This dictionary class reads words one per line. Make sure that your word list
22 * is formatted in this way (most are).
23 * </p>
24 */
25 public class SpellDictionary {
26
27
28 /** The replace list is used in the getSuggestions method*/
29 private static final char[] replacelist =
30 {
31 'A',
32 'B',
33 'X',
34 'S',
35 'K',
36 'J',
37 'T',
38 'F',
39 'H',
40 'L',
41 'M',
42 'N',
43 'P',
44 'R',
45 '0' };
46
47 /** A field indicating the initial hash map capacity (16KB) for the main
48 * dictionary hash map. Interested to see what the performance of a
49 * smaller initial capacity is like.
50 */
51 private final static int INITIAL_CAPACITY = 16 * 1024;
52 /**
53 * The hashmap that contains the word dictionary. The map is hashed on the doublemeta
54 * code. The map entry contains a LinkedList of words that have the same double meta code.
55 */
56 protected HashMap mainDictionary = new HashMap(INITIAL_CAPACITY);
57 /**The reference to a Transformator, used to transform a word into it's.
58 * phonetic code.
59 */
60 private Transformator tf = null;
61
62
63 /** Holds the dictionary file for appending*/
64 private File dictFile = null;
65
66 /**
67 * Dictionary Constructor.
68 */
69 public SpellDictionary(Reader wordList) throws IOException {
70 tf = new DoubleMeta();
71 createDictionary(new BufferedReader(wordList));
72 }
73
74 /**
75 * Dictionary Constructor for JAR files
76 * @author Howard Kistler
77 */
78 public SpellDictionary(String wordListResource) throws IOException
79 {
80 tf = new DoubleMeta();
81 InputStream is = this.getClass().getResourceAsStream("dictionary/" + wordListResource);
82 createDictionary(new BufferedReader(new InputStreamReader(is)));
83 }
84
85 /**
86 * Dictionary Convienence Constructor.
87 */
88 public SpellDictionary(File wordList)
89 throws FileNotFoundException, IOException {
90 this(new FileReader(wordList));
91 dictFile = wordList;
92 }
93
94 /**
95 * Dictionary constructor that uses an aspell phonetic file to
96 * build the transformation table.
97 */
98 public SpellDictionary(File wordList, File phonetic)
99 throws FileNotFoundException, IOException {
100 tf = new GenericTransformator(phonetic);
101 dictFile = wordList;
102 createDictionary(new BufferedReader(new FileReader(wordList)));
103 }
104
105 /**
106 * Add a word permanantly to the dictionary (and the dictionary file).
107 * <p>This needs to be made thread safe (synchronized)</p>
108 */
109 public void addWord(String word) {
110 putWord(word);
111 if (dictFile == null)
112 return;
113 try {
114 FileWriter w = new FileWriter(dictFile.toString(), true);
115 // Open with append.
116 w.write(word);
117 w.write("\n");
118 w.close();
119 } catch (IOException ex) {
120 System.out.println("Error writing to dictionary file");
121 }
122 }
123
124 /**
125 * Constructs the dictionary from a word list file.
126 * <p>
127 * Each word in the reader should be on a seperate line.
128 * <p>
129 * This is a very slow function. On my machine it takes quite a while to
130 * load the data in. I suspect that we could speed this up quite alot.
131 */
132 protected void createDictionary(BufferedReader in) throws IOException {
133 String line = "";
134 while (line != null) {
135 line = in.readLine();
136 if (line != null) {
137 line = new String(line.toCharArray());
138 putWord(line);
139 }
140 }
141 }
142
143 /**
144 * Returns the code representing the word.
145 */
146 public String getCode(String word) {
147 return tf.transform(word);
148 }
149
150 /**
151 * Allocates a word in the dictionary
152 */
153 protected void putWord(String word) {
154 String code = getCode(word);
155 LinkedList list = (LinkedList) mainDictionary.get(code);
156 if (list != null) {
157 list.add(word);
158 } else {
159 list = new LinkedList();
160 list.add(word);
161 mainDictionary.put(code, list);
162 }
163 }
164
165 /**
166 * Returns a list of strings (words) for the code.
167 */
168 public LinkedList getWords(String code) {
169 //Check the main dictionary.
170 LinkedList mainDictResult = (LinkedList) mainDictionary.get(code);
171 if (mainDictResult == null)
172 return new LinkedList();
173 return mainDictResult;
174 }
175
176 /**
177 * Returns true if the word is correctly spelled against the current word list.
178 */
179 public boolean isCorrect(String word) {
180 LinkedList possible = getWords(getCode(word));
181 if (possible.contains(word))
182 return true;
183 //JMH should we always try the lowercase version. If I dont then capitalised
184 //words are always returned as incorrect.
185 else if (possible.contains(word.toLowerCase()))
186 return true;
187 return false;
188 }
189
190 /**
191 * Returns a linked list of Word objects that are the suggestions to an
192 * incorrect word.
193 * <p>
194 * @param word Suggestions for given mispelt word
195 * @param threshold The lower boundary of similarity to mispelt word
196 * @return LinkedList a List of suggestions
197 */
198 public LinkedList getSuggestions(String word, int threshold) {
199
200 HashSet nearmisscodes = new HashSet();
201 String code = getCode(word);
202
203 // add all words that have the same codeword
204 nearmisscodes.add(code);
205
206 // do some tranformations to pick up more results
207 //interchange
208 char[] charArray = word.toCharArray();
209 for (int i = 0; i < word.length() - 1; i++) {
210 char a = charArray[i];
211 char b = charArray[i + 1];
212 charArray[i] = b;
213 charArray[i + 1] = a;
214 nearmisscodes.add(getCode(new String(charArray)));
215 charArray[i] = a;
216 charArray[i + 1] = b;
217 }
218 //change
219 charArray = word.toCharArray();
220 for (int i = 0; i < word.length(); i++) {
221 char original = charArray[i];
222 for (int j = 0; j < replacelist.length; j++) {
223 charArray[i] = replacelist[j];
224 nearmisscodes.add(getCode(new String(charArray)));
225 }
226 charArray[i] = original;
227 }
228 //add
229 charArray = (word += " ").toCharArray();
230 int iy = charArray.length - 1;
231 while (true) {
232 for (int j = 0; j < replacelist.length; j++) {
233 charArray[iy] = replacelist[j];
234 nearmisscodes.add(getCode(new String(charArray)));
235 }
236 if (iy == 0)
237 break;
238 charArray[iy] = charArray[iy - 1];
239 --iy;
240 }
241 //delete
242 word = word.trim();
243 charArray = word.toCharArray();
244 char[] charArray2 = new char[charArray.length - 1];
245 for (int ix = 0; ix < charArray2.length; ix++) {
246 charArray2[ix] = charArray[ix];
247 }
248 char a, b;
249 a = charArray[charArray.length - 1];
250 int ii = charArray2.length;
251 while (true) {
252 nearmisscodes.add(getCode(new String(charArray)));
253 if (ii == 0)
254 break;
255 b = a;
256 a = charArray2[ii - 1];
257 charArray2[ii - 1] = b;
258 --ii;
259 }
260
261 LinkedList wordlist = getWordsFromCode(word, nearmisscodes);
262 // We sort a linkedlist at the end instead of maintaining a
263 // continously sorted TreeSet because everytime you add a collection
264 // to a treeset it has to be resorted. It's better to do this operation
265 // once at the end.
266 Collections.sort( wordlist, new Word());
267 return wordlist;
268 }
269
270 private LinkedList getWordsFromCode(String word, Collection codes) {
271 Configuration config = Configuration.getConfiguration();
272 LinkedList result = new LinkedList();
273 for (Iterator i = codes.iterator(); i.hasNext();) {
274 String code = (String) i.next();
275 LinkedList simwordlist = getWords(code);
276 for (Iterator j = simwordlist.iterator(); j.hasNext();) {
277 String similar = (String) j.next();
278 int distance = EditDistance.getDistance(word, similar);
279 if (distance < config.getInteger(Configuration.SPELL_THRESHOLD)) {
280 Word w = new Word(similar, distance);
281 result.add(w);
282 }
283 }
284 }
285 return result;
286 }
287
288 }