2 * put your module comment here
3 * formatted with JxBeauty (c) johann.langhofer@nextra.at
6 package com
.swabunga
.spell
.engine
;
12 * The SpellDictionary class holds the instance of the dictionary.
14 * This class is thread safe. Derived classes should ensure that this is preserved.
17 * There are many open source dictionary files. For just a few see:
18 * http://wordlist.sourceforge.net/
21 * This dictionary class reads words one per line. Make sure that your word list
22 * is formatted in this way (most are).
25 public class SpellDictionary
{
28 /** The replace list is used in the getSuggestions method*/
29 private static final char[] replacelist
=
47 /** A field indicating the initial hash map capacity (16KB) for the main
48 * dictionary hash map. Interested to see what the performance of a
49 * smaller initial capacity is like.
51 private final static int INITIAL_CAPACITY
= 16 * 1024;
53 * The hashmap that contains the word dictionary. The map is hashed on the doublemeta
54 * code. The map entry contains a LinkedList of words that have the same double meta code.
56 protected HashMap mainDictionary
= new HashMap(INITIAL_CAPACITY
);
57 /**The reference to a Transformator, used to transform a word into its.
60 private Transformator tf
= null
;
63 /** Holds the dictionary file for appending*/
64 private File dictFile
= null
;
67 * Dictionary Constructor.
69 public SpellDictionary(Reader wordList
) throws IOException
{
70 tf
= new DoubleMeta();
71 createDictionary(new BufferedReader(wordList
));
75 * Dictionary Constructor for JAR files
76 * @author Howard Kistler
78 public SpellDictionary(String wordListResource
) throws IOException
80 tf
= new DoubleMeta();
81 InputStream is
= this.getClass().getResourceAsStream("dictionary/" + wordListResource
);
82 createDictionary(new BufferedReader(new InputStreamReader(is
)));
86 * Dictionary Convenience Constructor.
88 public SpellDictionary(File wordList
)
89 throws FileNotFoundException
, IOException
{
90 this(new FileReader(wordList
));
95 * Dictionary constructor that uses an aspell phonetic file to
96 * build the transformation table.
98 public SpellDictionary(File wordList
, File phonetic
)
99 throws FileNotFoundException
, IOException
{
100 tf
= new GenericTransformator(phonetic
);
102 createDictionary(new BufferedReader(new FileReader(wordList
)));
106 * Add a word permanently to the dictionary (and the dictionary file).
107 * <p>This needs to be made thread safe (synchronized)</p>
109 public void addWord(String word
) {
111 if (dictFile
== null
)
114 FileWriter w
= new FileWriter(dictFile
.toString(), true
);
119 } catch (IOException ex
) {
120 System
.out
.println("Error writing to dictionary file");
125 * Constructs the dictionary from a word list file.
127 * Each word in the reader should be on a separate line.
129 * This is a very slow function. On my machine it takes quite a while to
130 * load the data in. I suspect that we could speed this up quite a lot.
132 protected void createDictionary(BufferedReader
in) throws IOException
{
134 while (line
!= null
) {
135 line
= in.readLine();
137 line
= new String(line
.toCharArray());
144 * Returns the code representing the word.
146 public String
getCode(String word
) {
147 return tf
.transform(word
);
151 * Allocates a word in the dictionary
153 protected void putWord(String word
) {
154 String code
= getCode(word
);
155 LinkedList list
= (LinkedList
) mainDictionary
.get(code
);
159 list
= new LinkedList();
161 mainDictionary
.put(code
, list
);
166 * Returns a list of strings (words) for the code.
168 public LinkedList
getWords(String code
) {
169 //Check the main dictionary.
170 LinkedList mainDictResult
= (LinkedList
) mainDictionary
.get(code
);
171 if (mainDictResult
== null
)
172 return new LinkedList();
173 return mainDictResult
;
177 * Returns true if the word is correctly spelled against the current word list.
179 public boolean isCorrect(String word
) {
180 LinkedList possible
= getWords(getCode(word
));
181 if (possible
.contains(word
))
183 //JMH should we always try the lowercase version. If I don't then capitalised
184 //words are always returned as incorrect.
185 else if (possible
.contains(word
.toLowerCase()))
191 * Returns a linked list of Word objects that are the suggestions to an
194 * @param word Suggestions for given misspelt word
195 * @param threshold The lower boundary of similarity to misspelt word
196 * @return LinkedList a List of suggestions
198 public LinkedList
getSuggestions(String word
, int threshold
) {
200 HashSet nearmisscodes
= new HashSet();
201 String code
= getCode(word
);
203 // add all words that have the same codeword
204 nearmisscodes
.add(code
);
206 // do some transformations to pick up more results
208 char[] charArray
= word
.toCharArray();
209 for (int i
= 0; i
< word
.length() - 1; i
++) {
210 char a
= charArray
[i
];
211 char b
= charArray
[i
+ 1];
213 charArray
[i
+ 1] = a
;
214 nearmisscodes
.add(getCode(new String(charArray
)));
216 charArray
[i
+ 1] = b
;
219 charArray
= word
.toCharArray();
220 for (int i
= 0; i
< word
.length(); i
++) {
221 char original
= charArray
[i
];
222 for (int j
= 0; j
< replacelist
.length
; j
++) {
223 charArray
[i
] = replacelist
[j
];
224 nearmisscodes
.add(getCode(new String(charArray
)));
226 charArray
[i
] = original
;
229 charArray
= (word
+= " ").toCharArray();
230 int iy
= charArray
.length
- 1;
232 for (int j
= 0; j
< replacelist
.length
; j
++) {
233 charArray
[iy
] = replacelist
[j
];
234 nearmisscodes
.add(getCode(new String(charArray
)));
238 charArray
[iy
] = charArray
[iy
- 1];
243 charArray
= word
.toCharArray();
244 char[] charArray2
= new char[charArray
.length
- 1];
245 for (int ix
= 0; ix
< charArray2
.length
; ix
++) {
246 charArray2
[ix
] = charArray
[ix
];
249 a
= charArray
[charArray
.length
- 1];
250 int ii
= charArray2
.length
;
252 nearmisscodes
.add(getCode(new String(charArray
)));
256 a
= charArray2
[ii
- 1];
257 charArray2
[ii
- 1] = b
;
261 LinkedList wordlist
= getWordsFromCode(word
, nearmisscodes
);
262 // We sort a linkedlist at the end instead of maintaining a
263 // continuously sorted TreeSet because every time you add a collection
264 // to a treeset it has to be resorted. It's better to do this operation
266 Collections
.sort( wordlist
, new Word());
270 private LinkedList
getWordsFromCode(String word
, Collection codes
) {
271 Configuration config
= Configuration
.getConfiguration();
272 LinkedList result
= new LinkedList();
273 for (Iterator i
= codes
.iterator(); i
.hasNext();) {
274 String code
= (String
) i
.next();
275 LinkedList simwordlist
= getWords(code
);
276 for (Iterator j
= simwordlist
.iterator(); j
.hasNext();) {
277 String similar
= (String
) j
.next();
278 int distance
= EditDistance
.getDistance(word
, similar
);
279 if (distance
< config
.getInteger(Configuration
.SPELL_THRESHOLD
)) {
280 Word w
= new Word(similar
, distance
);