Initial revision
[old-projects.git] / ekit / com / swabunga / spell / event / StringWordTokenizer.java
CommitLineData
6dd70280
JL
1package com.swabunga.spell.event;\r\r
2\r\r
3import java.util.*;\r\r
4import java.text.*;\r\r
5\r\r
6/** This class tokenizes a input string.\r\r
7 * <p>\r\r
8 * It also allows for the string to be mutated. The result after the spell\r\r
9 * checking is completed is available to the call to getFinalText</p>\r\r
10 *\r\r
11 * @author Jason Height (jheight@chariot.net.au)\r\r
12 */\r\r
13public class StringWordTokenizer implements WordTokenizer {\r\r
14 /** Holds the start character position of the current word*/\r\r
15 private int currentWordPos = 0;\r\r
16 /** Holds the end character position of the current word*/\r\r
17 private int currentWordEnd = 0;\r\r
18 /** Holds the start character position of the next word*/\r\r
19 private int nextWordPos = -1;\r\r
20 /** The actual text that is being tokenized*/\r\r
21 private StringBuffer text;\r\r
22 /** The cumulative word count that have been processed*/\r\r
23 private int wordCount = 0;\r\r
24 /** Flag indicating if there are any more tokens (words) left*/\r\r
25 private boolean moreTokens = true;\r\r
26 /** Is this a special case where the currentWordStart, currntWordEnd and\r\r
27 * nextWordPos have already been calculated. (see nextWord)\r\r
28 */\r\r
29 private boolean first = true;\r\r
30\r\r
31 private BreakIterator sentanceIterator;\r\r
32 private boolean startsSentance = true;\r\r
33\r\r
34\r\r
35 public StringWordTokenizer(String text) {\r\r
36 sentanceIterator = BreakIterator.getSentenceInstance();\r\r
37 sentanceIterator.setText(text);\r\r
38 sentanceIterator.first();\r\r
39 //Wrap a string buffer to hopefully make things a bit easier and efficient to\r\r
40 //replace words etc.\r\r
41 this.text = new StringBuffer(text);\r\r
42 currentWordPos = getNextWordStart(this.text, 0);\r\r
43 //If the current word pos is -1 then the string was all white space\r\r
44 if (currentWordPos != -1) {\r\r
45 currentWordEnd = getNextWordEnd(this.text, currentWordPos);\r\r
46 nextWordPos = getNextWordStart(this.text, currentWordEnd);\r\r
47 } else {\r\r
48 moreTokens = false;\r\r
49 }\r\r
50 }\r\r
51\r\r
52 /** This helper method will return the start character of the next\r\r
53 * word in the buffer from the start position\r\r
54 */\r\r
55 private static int getNextWordStart(StringBuffer text, int startPos) {\r\r
56 int size = text.length();\r\r
57 for (int i=startPos;i<size;i++) {\r\r
58 if (Character.isLetterOrDigit(text.charAt(i))) {\r\r
59 return i;\r\r
60 }\r\r
61 }\r\r
62 return -1;\r\r
63 }\r\r
64\r\r
65 /** This helper method will return the end of the next word in the buffer.\r\r
66 *\r\r
67 */\r\r
68 private static int getNextWordEnd(StringBuffer text, int startPos) {\r\r
69 int size = text.length();\r\r
70 for (int i=startPos;i<size;i++) {\r\r
71 if (!Character.isLetterOrDigit(text.charAt(i))) {\r\r
72 return i;\r\r
73 }\r\r
74 }\r\r
75 return size;\r\r
76 }\r\r
77\r\r
78\r\r
79 /** Returns true if there are more words that can be processed in the string\r\r
80 *\r\r
81 */\r\r
82 public boolean hasMoreWords() {\r\r
83 return moreTokens;\r\r
84 }\r\r
85\r\r
86 /** Returns the current character position in the text\r\r
87 *\r\r
88 */\r\r
89 public int getCurrentWordPosition() {\r\r
90 return currentWordPos;\r\r
91 }\r\r
92\r\r
93 /** Returns the current end word position in the text\r\r
94 *\r\r
95 */\r\r
96 public int getCurrentWordEnd() {\r\r
97 return currentWordEnd;\r\r
98 }\r\r
99\r\r
100 /** Returns the next word in the text\r\r
101 *\r\r
102 */\r\r
103 public String nextWord() {\r\r
104 if (!first) {\r\r
105 currentWordPos = nextWordPos;\r\r
106 currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
107 nextWordPos = getNextWordStart(text, currentWordEnd+1);\r\r
108 int current = sentanceIterator.current();\r\r
109 if (current == currentWordPos)\r\r
110 startsSentance = true;\r\r
111 else {\r\r
112 startsSentance = false;\r\r
113 if (currentWordEnd > current)\r\r
114 sentanceIterator.next();\r\r
115 }\r\r
116 }\r\r
117 //The nextWordPos has already been populated\r\r
118 String word = text.substring(currentWordPos, currentWordEnd);\r\r
119 wordCount++;\r\r
120 first = false;\r\r
121 if (nextWordPos == -1)\r\r
122 moreTokens = false;\r\r
123 return word;\r\r
124 }\r\r
125\r\r
126 /** Returns the current number of words that have been processed\r\r
127 *\r\r
128 */\r\r
129 public int getCurrentWordCount() {\r\r
130 return wordCount;\r\r
131 }\r\r
132\r\r
133 /** Replaces the current word token*/\r\r
134 public void replaceWord(String newWord) {\r\r
135 if (currentWordPos != -1) {\r\r
136 text.replace(currentWordPos, currentWordEnd, newWord);\r\r
137 //Position after the newly replaced word(s)\r\r
138 first = true;\r\r
139 currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());\r\r
140 if (currentWordPos != -1) {\r\r
141 currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
142 nextWordPos = getNextWordStart(text, currentWordEnd);\r\r
143 sentanceIterator.setText(text.toString());\r\r
144 sentanceIterator.following(currentWordPos);\r\r
145 } else moreTokens = false;\r\r
146 }\r\r
147 }\r\r
148\r\r
149 /** returns true iif the current word is at the start of a sentance*/\r\r
150 public boolean isNewSentance() {\r\r
151 return startsSentance;\r\r
152 }\r\r
153\r\r
154 /** Returns the current text that is being tokenized (includes any changes\r\r
155 * that have been made)\r\r
156 */\r\r
157 public String getContext() {\r\r
158 return text.toString();\r\r
159 }\r\r
160\r\r
161 /** This method can be used to return the final text after the schecking is complete.*/\r\r
162 public String getFinalText() {\r\r
163 return getContext();\r\r
164 }\r\r
165\r\r
166\r\r
167 public static void main(String args[]) {\r\r
168 StringWordTokenizer t = new StringWordTokenizer(" This is a test problem");\r\r
169 while(t.hasMoreWords()) {\r\r
170 String word = t.nextWord();\r\r
171 System.out.println("Word is '"+word+"'");\r\r
172 if ("test".equals(word)) t.replaceWord("mightly big");\r\r
173 }\r\r
174 System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
175\r\r
176 t = new StringWordTokenizer(" README ");\r\r
177 while(t.hasMoreWords()) {\r\r
178 String word = t.nextWord();\r\r
179 System.out.println("Word is '"+word+"'");\r\r
180 }\r\r
181 System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
182\r\r
183 t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");\r\r
184 while(t.hasMoreWords()) {\r\r
185 String word = t.nextWord();\r\r
186 System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());\r\r
187 if (word.equals("acronym"))\r\r
188 t.replaceWord("test");\r\r
189 }\r\r
190 }\r\r
191}