ekit/com/swabunga/spell/event/StringWordTokenizer.java

   1 package com.swabunga.spell.event;
   2
   3 import java.util.*;
   4 import java.text.*;
   5
   6 /** This class tokenizes an input string.
   7  *  <p>
   8  *  It also allows for the string to be mutated. Once spell checking is
   9  *  complete, the resulting text is available by calling getFinalText.</p>
  10  *
  11  * @author Jason Height (jheight@chariot.net.au)
  12  */
  13 public class StringWordTokenizer implements WordTokenizer {
  14   /** Holds the start character position of the current word*/
  15   private int currentWordPos = 0;
  16   /** Holds the end character position of the current word*/
  17   private int currentWordEnd = 0;
  18   /** Holds the start character position of the next word*/
  19   private int nextWordPos = -1;
  20   /** The actual text that is being tokenized*/
  21   private StringBuffer text;
  22   /** The cumulative word count that have been processed*/
  23   private int wordCount = 0;
  24   /** Flag indicating if there are any more tokens (words) left*/
  25   private boolean moreTokens = true;
  26   /** Is this a special case where the currentWordStart, currntWordEnd and
  27    *  nextWordPos have already been calculated. (see nextWord)
  28    */
  29   private boolean first = true;
  30
  31   private BreakIterator sentanceIterator;
  32   private boolean startsSentance = true;
  33
  34
  35   public StringWordTokenizer(String text) {
  36     sentanceIterator = BreakIterator.getSentenceInstance();
  37     sentanceIterator.setText(text);
  38     sentanceIterator.first();
  39     //Wrap a string buffer to hopefully make things a bit easier and efficient to
  40     //replace words etc.
  41     this.text = new StringBuffer(text);
  42     currentWordPos = getNextWordStart(this.text, 0);
  43     //If the current word pos is -1 then the string was all white space
  44     if (currentWordPos != -1) {
  45       currentWordEnd = getNextWordEnd(this.text, currentWordPos);
  46       nextWordPos = getNextWordStart(this.text, currentWordEnd);
  47     } else {
  48       moreTokens = false;
  49     }
  50   }
  51
  52   /** This helper method will return the start character of the next
  53    * word in the buffer from the start position
  54    */
  55   private static int getNextWordStart(StringBuffer text, int startPos) {
  56     int size = text.length();
  57     for (int i=startPos;i<size;i++) {
  58       if (Character.isLetterOrDigit(text.charAt(i))) {
  59         return i;
  60       }
  61     }
  62     return -1;
  63   }
  64
  65   /** This helper method will return the end of the next word in the buffer.
  66    *
  67    */
  68   private static int getNextWordEnd(StringBuffer text, int startPos) {
  69     int size = text.length();
  70     for (int i=startPos;i<size;i++) {
  71       if (!Character.isLetterOrDigit(text.charAt(i))) {
  72         return i;
  73       }
  74     }
  75     return size;
  76   }
  77
  78
  79   /** Returns true if there are more words that can be processed in the string
  80    *
  81    */
  82   public boolean hasMoreWords() {
  83     return moreTokens;
  84   }
  85
  86   /** Returns the current character position in the text
  87    *
  88    */
  89   public int getCurrentWordPosition() {
  90     return currentWordPos;
  91   }
  92
  93   /** Returns the current end word position in the text
  94    *
  95    */
  96   public int getCurrentWordEnd() {
  97     return currentWordEnd;
  98   }
  99
 100   /** Returns the next word in the text
 101    *
 102    */
 103   public String nextWord() {
 104     if (!first) {
 105       currentWordPos = nextWordPos;
 106       currentWordEnd = getNextWordEnd(text, currentWordPos);
 107       nextWordPos = getNextWordStart(text, currentWordEnd+1);
 108       int current = sentanceIterator.current();
 109       if (current == currentWordPos)
 110         startsSentance = true;
 111       else {
 112         startsSentance = false;
 113         if (currentWordEnd > current)
 114           sentanceIterator.next();
 115       }
 116     }
 117     //The nextWordPos has already been populated
 118     String word = text.substring(currentWordPos, currentWordEnd);
 119     wordCount++;
 120     first = false;
 121     if (nextWordPos == -1)
 122       moreTokens = false;
 123     return word;
 124   }
 125
 126   /** Returns the current number of words that have been processed
 127    *
 128    */
 129   public int getCurrentWordCount() {
 130     return wordCount;
 131   }
 132
 133   /** Replaces the current word token*/
 134   public void replaceWord(String newWord) {
 135     if (currentWordPos != -1) {
 136       text.replace(currentWordPos, currentWordEnd, newWord);
 137       //Position after the newly replaced word(s)
 138       first = true;
 139       currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());
 140       if (currentWordPos != -1) {
 141         currentWordEnd = getNextWordEnd(text, currentWordPos);
 142         nextWordPos = getNextWordStart(text, currentWordEnd);
 143         sentanceIterator.setText(text.toString());
 144         sentanceIterator.following(currentWordPos);
 145       } else moreTokens = false;
 146     }
 147   }
 148
 149   /** returns true iif the current word is at the start of a sentance*/
 150   public boolean isNewSentance() {
 151     return startsSentance;
 152   }
 153
 154   /** Returns the current text that is being tokenized (includes any changes
 155    *  that have been made)
 156    */
 157   public String getContext() {
 158     return text.toString();
 159   }
 160
 161   /** This method can be used to return the final text after the schecking is complete.*/
 162   public String getFinalText() {
 163     return getContext();
 164   }
 165
 166
 167   public static void main(String args[]) {
 168     StringWordTokenizer t = new StringWordTokenizer("  This is a  test   problem");
 169     while(t.hasMoreWords()) {
 170       String word = t.nextWord();
 171       System.out.println("Word is '"+word+"'");
 172       if ("test".equals(word)) t.replaceWord("mightly big");
 173     }
 174     System.out.println("End text is: '"+t.getFinalText()+"'");
 175
 176     t = new StringWordTokenizer("    README   ");
 177     while(t.hasMoreWords()) {
 178       String word = t.nextWord();
 179       System.out.println("Word is '"+word+"'");
 180     }
 181     System.out.println("End text is: '"+t.getFinalText()+"'");
 182
 183     t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");
 184     while(t.hasMoreWords()) {
 185       String word = t.nextWord();
 186       System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());
 187       if (word.equals("acronym"))
 188         t.replaceWord("test");
 189     }
 190   }
 191 }