| 1 | package com.swabunga.spell.event;\r\r |
| 2 | \r\r |
| 3 | import java.util.*;\r\r |
| 4 | import java.text.*;\r\r |
| 5 | \r\r |
| 6 | /** This class tokenizes a input string.\r\r |
| 7 | * <p>\r\r |
| 8 | * It also allows for the string to be mutated. The result after the spell\r\r |
| 9 | * checking is completed is available to the call to getFinalText</p>\r\r |
| 10 | *\r\r |
| 11 | * @author Jason Height (jheight@chariot.net.au)\r\r |
| 12 | */\r\r |
| 13 | public class StringWordTokenizer implements WordTokenizer {\r\r |
| 14 | /** Holds the start character position of the current word*/\r\r |
| 15 | private int currentWordPos = 0;\r\r |
| 16 | /** Holds the end character position of the current word*/\r\r |
| 17 | private int currentWordEnd = 0;\r\r |
| 18 | /** Holds the start character position of the next word*/\r\r |
| 19 | private int nextWordPos = -1;\r\r |
| 20 | /** The actual text that is being tokenized*/\r\r |
| 21 | private StringBuffer text;\r\r |
| 22 | /** The cumulative word count that have been processed*/\r\r |
| 23 | private int wordCount = 0;\r\r |
| 24 | /** Flag indicating if there are any more tokens (words) left*/\r\r |
| 25 | private boolean moreTokens = true;\r\r |
| 26 | /** Is this a special case where the currentWordStart, currntWordEnd and\r\r |
| 27 | * nextWordPos have already been calculated. (see nextWord)\r\r |
| 28 | */\r\r |
| 29 | private boolean first = true;\r\r |
| 30 | \r\r |
| 31 | private BreakIterator sentanceIterator;\r\r |
| 32 | private boolean startsSentance = true;\r\r |
| 33 | \r\r |
| 34 | \r\r |
| 35 | public StringWordTokenizer(String text) {\r\r |
| 36 | sentanceIterator = BreakIterator.getSentenceInstance();\r\r |
| 37 | sentanceIterator.setText(text);\r\r |
| 38 | sentanceIterator.first();\r\r |
| 39 | //Wrap a string buffer to hopefully make things a bit easier and efficient to\r\r |
| 40 | //replace words etc.\r\r |
| 41 | this.text = new StringBuffer(text);\r\r |
| 42 | currentWordPos = getNextWordStart(this.text, 0);\r\r |
| 43 | //If the current word pos is -1 then the string was all white space\r\r |
| 44 | if (currentWordPos != -1) {\r\r |
| 45 | currentWordEnd = getNextWordEnd(this.text, currentWordPos);\r\r |
| 46 | nextWordPos = getNextWordStart(this.text, currentWordEnd);\r\r |
| 47 | } else {\r\r |
| 48 | moreTokens = false;\r\r |
| 49 | }\r\r |
| 50 | }\r\r |
| 51 | \r\r |
| 52 | /** This helper method will return the start character of the next\r\r |
| 53 | * word in the buffer from the start position\r\r |
| 54 | */\r\r |
| 55 | private static int getNextWordStart(StringBuffer text, int startPos) {\r\r |
| 56 | int size = text.length();\r\r |
| 57 | for (int i=startPos;i<size;i++) {\r\r |
| 58 | if (Character.isLetterOrDigit(text.charAt(i))) {\r\r |
| 59 | return i;\r\r |
| 60 | }\r\r |
| 61 | }\r\r |
| 62 | return -1;\r\r |
| 63 | }\r\r |
| 64 | \r\r |
| 65 | /** This helper method will return the end of the next word in the buffer.\r\r |
| 66 | *\r\r |
| 67 | */\r\r |
| 68 | private static int getNextWordEnd(StringBuffer text, int startPos) {\r\r |
| 69 | int size = text.length();\r\r |
| 70 | for (int i=startPos;i<size;i++) {\r\r |
| 71 | if (!Character.isLetterOrDigit(text.charAt(i))) {\r\r |
| 72 | return i;\r\r |
| 73 | }\r\r |
| 74 | }\r\r |
| 75 | return size;\r\r |
| 76 | }\r\r |
| 77 | \r\r |
| 78 | \r\r |
| 79 | /** Returns true if there are more words that can be processed in the string\r\r |
| 80 | *\r\r |
| 81 | */\r\r |
| 82 | public boolean hasMoreWords() {\r\r |
| 83 | return moreTokens;\r\r |
| 84 | }\r\r |
| 85 | \r\r |
| 86 | /** Returns the current character position in the text\r\r |
| 87 | *\r\r |
| 88 | */\r\r |
| 89 | public int getCurrentWordPosition() {\r\r |
| 90 | return currentWordPos;\r\r |
| 91 | }\r\r |
| 92 | \r\r |
| 93 | /** Returns the current end word position in the text\r\r |
| 94 | *\r\r |
| 95 | */\r\r |
| 96 | public int getCurrentWordEnd() {\r\r |
| 97 | return currentWordEnd;\r\r |
| 98 | }\r\r |
| 99 | \r\r |
| 100 | /** Returns the next word in the text\r\r |
| 101 | *\r\r |
| 102 | */\r\r |
| 103 | public String nextWord() {\r\r |
| 104 | if (!first) {\r\r |
| 105 | currentWordPos = nextWordPos;\r\r |
| 106 | currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r |
| 107 | nextWordPos = getNextWordStart(text, currentWordEnd+1);\r\r |
| 108 | int current = sentanceIterator.current();\r\r |
| 109 | if (current == currentWordPos)\r\r |
| 110 | startsSentance = true;\r\r |
| 111 | else {\r\r |
| 112 | startsSentance = false;\r\r |
| 113 | if (currentWordEnd > current)\r\r |
| 114 | sentanceIterator.next();\r\r |
| 115 | }\r\r |
| 116 | }\r\r |
| 117 | //The nextWordPos has already been populated\r\r |
| 118 | String word = text.substring(currentWordPos, currentWordEnd);\r\r |
| 119 | wordCount++;\r\r |
| 120 | first = false;\r\r |
| 121 | if (nextWordPos == -1)\r\r |
| 122 | moreTokens = false;\r\r |
| 123 | return word;\r\r |
| 124 | }\r\r |
| 125 | \r\r |
| 126 | /** Returns the current number of words that have been processed\r\r |
| 127 | *\r\r |
| 128 | */\r\r |
| 129 | public int getCurrentWordCount() {\r\r |
| 130 | return wordCount;\r\r |
| 131 | }\r\r |
| 132 | \r\r |
| 133 | /** Replaces the current word token*/\r\r |
| 134 | public void replaceWord(String newWord) {\r\r |
| 135 | if (currentWordPos != -1) {\r\r |
| 136 | text.replace(currentWordPos, currentWordEnd, newWord);\r\r |
| 137 | //Position after the newly replaced word(s)\r\r |
| 138 | first = true;\r\r |
| 139 | currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());\r\r |
| 140 | if (currentWordPos != -1) {\r\r |
| 141 | currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r |
| 142 | nextWordPos = getNextWordStart(text, currentWordEnd);\r\r |
| 143 | sentanceIterator.setText(text.toString());\r\r |
| 144 | sentanceIterator.following(currentWordPos);\r\r |
| 145 | } else moreTokens = false;\r\r |
| 146 | }\r\r |
| 147 | }\r\r |
| 148 | \r\r |
| 149 | /** returns true iif the current word is at the start of a sentance*/\r\r |
| 150 | public boolean isNewSentance() {\r\r |
| 151 | return startsSentance;\r\r |
| 152 | }\r\r |
| 153 | \r\r |
| 154 | /** Returns the current text that is being tokenized (includes any changes\r\r |
| 155 | * that have been made)\r\r |
| 156 | */\r\r |
| 157 | public String getContext() {\r\r |
| 158 | return text.toString();\r\r |
| 159 | }\r\r |
| 160 | \r\r |
| 161 | /** This method can be used to return the final text after the schecking is complete.*/\r\r |
| 162 | public String getFinalText() {\r\r |
| 163 | return getContext();\r\r |
| 164 | }\r\r |
| 165 | \r\r |
| 166 | \r\r |
| 167 | public static void main(String args[]) {\r\r |
| 168 | StringWordTokenizer t = new StringWordTokenizer(" This is a test problem");\r\r |
| 169 | while(t.hasMoreWords()) {\r\r |
| 170 | String word = t.nextWord();\r\r |
| 171 | System.out.println("Word is '"+word+"'");\r\r |
| 172 | if ("test".equals(word)) t.replaceWord("mightly big");\r\r |
| 173 | }\r\r |
| 174 | System.out.println("End text is: '"+t.getFinalText()+"'");\r\r |
| 175 | \r\r |
| 176 | t = new StringWordTokenizer(" README ");\r\r |
| 177 | while(t.hasMoreWords()) {\r\r |
| 178 | String word = t.nextWord();\r\r |
| 179 | System.out.println("Word is '"+word+"'");\r\r |
| 180 | }\r\r |
| 181 | System.out.println("End text is: '"+t.getFinalText()+"'");\r\r |
| 182 | \r\r |
| 183 | t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");\r\r |
| 184 | while(t.hasMoreWords()) {\r\r |
| 185 | String word = t.nextWord();\r\r |
| 186 | System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());\r\r |
| 187 | if (word.equals("acronym"))\r\r |
| 188 | t.replaceWord("test");\r\r |
| 189 | }\r\r |
| 190 | }\r\r |
| 191 | } |