Initial revision
[old-projects.git] / ekit / com / swabunga / spell / event / StringWordTokenizer.java
1 package com.swabunga.spell.event;
2
3 import java.util.*;
4 import java.text.*;
5
6 /** This class tokenizes a input string.
7 * <p>
8 * It also allows for the string to be mutated. The result after the spell
9 * checking is completed is available to the call to getFinalText</p>
10 *
11 * @author Jason Height (jheight@chariot.net.au)
12 */
13 public class StringWordTokenizer implements WordTokenizer {
14 /** Holds the start character position of the current word*/
15 private int currentWordPos = 0;
16 /** Holds the end character position of the current word*/
17 private int currentWordEnd = 0;
18 /** Holds the start character position of the next word*/
19 private int nextWordPos = -1;
20 /** The actual text that is being tokenized*/
21 private StringBuffer text;
22 /** The cumulative word count that have been processed*/
23 private int wordCount = 0;
24 /** Flag indicating if there are any more tokens (words) left*/
25 private boolean moreTokens = true;
26 /** Is this a special case where the currentWordStart, currntWordEnd and
27 * nextWordPos have already been calculated. (see nextWord)
28 */
29 private boolean first = true;
30
31 private BreakIterator sentanceIterator;
32 private boolean startsSentance = true;
33
34
35 public StringWordTokenizer(String text) {
36 sentanceIterator = BreakIterator.getSentenceInstance();
37 sentanceIterator.setText(text);
38 sentanceIterator.first();
39 //Wrap a string buffer to hopefully make things a bit easier and efficient to
40 //replace words etc.
41 this.text = new StringBuffer(text);
42 currentWordPos = getNextWordStart(this.text, 0);
43 //If the current word pos is -1 then the string was all white space
44 if (currentWordPos != -1) {
45 currentWordEnd = getNextWordEnd(this.text, currentWordPos);
46 nextWordPos = getNextWordStart(this.text, currentWordEnd);
47 } else {
48 moreTokens = false;
49 }
50 }
51
52 /** This helper method will return the start character of the next
53 * word in the buffer from the start position
54 */
55 private static int getNextWordStart(StringBuffer text, int startPos) {
56 int size = text.length();
57 for (int i=startPos;i<size;i++) {
58 if (Character.isLetterOrDigit(text.charAt(i))) {
59 return i;
60 }
61 }
62 return -1;
63 }
64
65 /** This helper method will return the end of the next word in the buffer.
66 *
67 */
68 private static int getNextWordEnd(StringBuffer text, int startPos) {
69 int size = text.length();
70 for (int i=startPos;i<size;i++) {
71 if (!Character.isLetterOrDigit(text.charAt(i))) {
72 return i;
73 }
74 }
75 return size;
76 }
77
78
79 /** Returns true if there are more words that can be processed in the string
80 *
81 */
82 public boolean hasMoreWords() {
83 return moreTokens;
84 }
85
86 /** Returns the current character position in the text
87 *
88 */
89 public int getCurrentWordPosition() {
90 return currentWordPos;
91 }
92
93 /** Returns the current end word position in the text
94 *
95 */
96 public int getCurrentWordEnd() {
97 return currentWordEnd;
98 }
99
100 /** Returns the next word in the text
101 *
102 */
103 public String nextWord() {
104 if (!first) {
105 currentWordPos = nextWordPos;
106 currentWordEnd = getNextWordEnd(text, currentWordPos);
107 nextWordPos = getNextWordStart(text, currentWordEnd+1);
108 int current = sentanceIterator.current();
109 if (current == currentWordPos)
110 startsSentance = true;
111 else {
112 startsSentance = false;
113 if (currentWordEnd > current)
114 sentanceIterator.next();
115 }
116 }
117 //The nextWordPos has already been populated
118 String word = text.substring(currentWordPos, currentWordEnd);
119 wordCount++;
120 first = false;
121 if (nextWordPos == -1)
122 moreTokens = false;
123 return word;
124 }
125
126 /** Returns the current number of words that have been processed
127 *
128 */
129 public int getCurrentWordCount() {
130 return wordCount;
131 }
132
133 /** Replaces the current word token*/
134 public void replaceWord(String newWord) {
135 if (currentWordPos != -1) {
136 text.replace(currentWordPos, currentWordEnd, newWord);
137 //Position after the newly replaced word(s)
138 first = true;
139 currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());
140 if (currentWordPos != -1) {
141 currentWordEnd = getNextWordEnd(text, currentWordPos);
142 nextWordPos = getNextWordStart(text, currentWordEnd);
143 sentanceIterator.setText(text.toString());
144 sentanceIterator.following(currentWordPos);
145 } else moreTokens = false;
146 }
147 }
148
149 /** returns true iif the current word is at the start of a sentance*/
150 public boolean isNewSentance() {
151 return startsSentance;
152 }
153
154 /** Returns the current text that is being tokenized (includes any changes
155 * that have been made)
156 */
157 public String getContext() {
158 return text.toString();
159 }
160
161 /** This method can be used to return the final text after the schecking is complete.*/
162 public String getFinalText() {
163 return getContext();
164 }
165
166
167 public static void main(String args[]) {
168 StringWordTokenizer t = new StringWordTokenizer(" This is a test problem");
169 while(t.hasMoreWords()) {
170 String word = t.nextWord();
171 System.out.println("Word is '"+word+"'");
172 if ("test".equals(word)) t.replaceWord("mightly big");
173 }
174 System.out.println("End text is: '"+t.getFinalText()+"'");
175
176 t = new StringWordTokenizer(" README ");
177 while(t.hasMoreWords()) {
178 String word = t.nextWord();
179 System.out.println("Word is '"+word+"'");
180 }
181 System.out.println("End text is: '"+t.getFinalText()+"'");
182
183 t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");
184 while(t.hasMoreWords()) {
185 String word = t.nextWord();
186 System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());
187 if (word.equals("acronym"))
188 t.replaceWord("test");
189 }
190 }
191 }