Initial revision

[old-projects.git] / ekit / com / swabunga / spell / event / DocumentWordTokenizer.java
diff --git a/ekit/com/swabunga/spell/event/DocumentWordTokenizer.java b/ekit/com/swabunga/spell/event/DocumentWordTokenizer.java

new file mode 100644 (file)

index 0000000..566f92a
--- /dev/null
+++ b/ekit/com/swabunga/spell/event/DocumentWordTokenizer.java
@@ -0,0 +1,192 @@
+package com.swabunga.spell.event;\r\r
+\r\r
+import java.util.*;\r\r
+import java.text.*;\r\r
+import javax.swing.text.AttributeSet;\r\r
+import javax.swing.text.Document;\r\r
+import javax.swing.text.Element;\r\r
+import javax.swing.text.Segment;\r\r
+import javax.swing.text.BadLocationException;\r\r
+\r\r
+/** This class tokenizes a swing document model. It also allows for the\r\r
+ *  document model to be changed when corrections occur.\r\r
+ *\r\r
+ * @author Jason Height (jheight@chariot.net.au)\r\r
+ */\r\r
+public class DocumentWordTokenizer implements WordTokenizer {\r\r
+  /** Holds the start character position of the current word*/\r\r
+  private int currentWordPos = 0;\r\r
+  /** Holds the end character position of the current word*/\r\r
+  private int currentWordEnd = 0;\r\r
+  /** Holds the start character position of the next word*/\r\r
+  private int nextWordPos = -1;\r\r
+  /** The actual text that is being tokenized*/\r\r
+  private Document document;\r\r
+  /** The character iterator over the document*/\r\r
+  private Segment text;\r\r
+  /** The cumulative word count that have been processed*/\r\r
+  private int wordCount = 0;\r\r
+  /** Flag indicating if there are any more tokens (words) left*/\r\r
+  private boolean moreTokens = true;\r\r
+  /** Is this a special case where the currentWordStart, currntWordEnd and\r\r
+   *  nextWordPos have already been calculated. (see nextWord)\r\r
+   */\r\r
+  private boolean first = true;\r\r
+\r\r
+  private BreakIterator sentanceIterator;\r\r
+  private boolean startsSentance = true;\r\r
+\r\r
+\r\r
+  public DocumentWordTokenizer(Document document) {\r\r
+    this.document = document;\r\r
+    //Create a text segment over the etire document\r\r
+    text = new Segment();\r\r
+    sentanceIterator = BreakIterator.getSentenceInstance();\r\r
+    try {\r\r
+      document.getText(0, document.getLength(), text);\r\r
+      sentanceIterator.setText(text);\r\r
+      currentWordPos = getNextWordStart(text, 0);\r\r
+      //If the current word pos is -1 then the string was all white space\r\r
+      if (currentWordPos != -1) {\r\r
+        currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
+        nextWordPos = getNextWordStart(text, currentWordEnd);\r\r
+      } else {\r\r
+        moreTokens = false;\r\r
+      }\r\r
+    } catch (BadLocationException ex) {\r\r
+      moreTokens = false;\r\r
+    }\r\r
+  }\r\r
+\r\r
+  /** This helper method will return the start character of the next\r\r
+   * word in the buffer from the start position\r\r
+   */\r\r
+  private static int getNextWordStart(Segment text, int startPos) {\r\r
+    if (startPos <= text.getEndIndex())\r\r
+      for (char ch = text.setIndex(startPos);ch != Segment.DONE;ch = text.next()) {\r\r
+        if (Character.isLetterOrDigit(ch)) {\r\r
+          return text.getIndex();\r\r
+        }\r\r
+      }\r\r
+    return -1;\r\r
+  }\r\r
+\r\r
+  /** This helper method will return the end of the next word in the buffer.\r\r
+   *\r\r
+   */\r\r
+  private static int getNextWordEnd(Segment text, int startPos) {\r\r
+    for (char ch = text.setIndex(startPos); ch != Segment.DONE;ch = text.next()) {\r\r
+      if (!Character.isLetterOrDigit(ch)) {\r\r
+        return text.getIndex();\r\r
+      }\r\r
+    }\r\r
+    return text.getEndIndex();\r\r
+  }\r\r
+\r\r
+\r\r
+  /** Returns true if there are more words that can be processed in the string\r\r
+   *\r\r
+   */\r\r
+  public boolean hasMoreWords() {\r\r
+    return moreTokens;\r\r
+  }\r\r
+\r\r
+  /** Returns the current character position in the text\r\r
+   *\r\r
+   */\r\r
+  public int getCurrentWordPosition() {\r\r
+    return currentWordPos;\r\r
+  }\r\r
+\r\r
+  /** Returns the current end word position in the text\r\r
+   *\r\r
+   */\r\r
+  public int getCurrentWordEnd() {\r\r
+    return currentWordEnd;\r\r
+  }\r\r
+\r\r
+\r\r
+  /** Returns the next word in the text\r\r
+   *\r\r
+   */\r\r
+  public String nextWord() {\r\r
+    if (!first) {\r\r
+      currentWordPos = nextWordPos;\r\r
+      currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
+      nextWordPos = getNextWordStart(text, currentWordEnd+1);\r\r
+      int current = sentanceIterator.current();\r\r
+      if (current == currentWordPos)\r\r
+        startsSentance = true;\r\r
+      else {\r\r
+        startsSentance = false;\r\r
+        if (currentWordEnd > current)\r\r
+          sentanceIterator.next();\r\r
+      }\r\r
+\r\r
+    }\r\r
+    //The nextWordPos has already been populated\r\r
+    String word = null;\r\r
+    try {\r\r
+      word = document.getText(currentWordPos, currentWordEnd-currentWordPos);\r\r
+    } catch (BadLocationException ex) {\r\r
+      moreTokens = false;\r\r
+    }\r\r
+    wordCount++;\r\r
+    first = false;\r\r
+    if (nextWordPos == -1)\r\r
+      moreTokens = false;\r\r
+    return word;\r\r
+  }\r\r
+\r\r
+  /** Returns the current number of words that have been processed\r\r
+   *\r\r
+   */\r\r
+  public int getCurrentWordCount() {\r\r
+    return wordCount;\r\r
+  }\r\r
+\r\r
+  /** Replaces the current word token*/\r\r
+  public void replaceWord(String newWord) {\r\r
+    if (currentWordPos != -1) {\r\r
+      try {\r\r
+      /* ORIGINAL\r\r
+        document.remove(currentWordPos, currentWordEnd - currentWordPos);\r\r
+        document.insertString(currentWordPos, newWord, null);\r\r
+      */\r\r
+      // Howard's Version for Ekit\r\r
+               Element element = ((javax.swing.text.html.HTMLDocument)document).getCharacterElement(currentWordPos);\r\r
+               AttributeSet attribs = element.getAttributes();\r\r
+        document.remove(currentWordPos, currentWordEnd - currentWordPos);\r\r
+        document.insertString(currentWordPos, newWord, attribs);\r\r
+      // End Howard's Version\r\r
+        //Need to reset the segment\r\r
+        document.getText(0, document.getLength(), text);\r\r
+      } catch (BadLocationException ex) {\r\r
+        throw new RuntimeException(ex.getMessage());\r\r
+      }\r\r
+      //Position after the newly replaced word(s)\r\r
+      //Position after the newly replaced word(s)\r\r
+      first = true;\r\r
+      currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());\r\r
+      if (currentWordPos != -1) {\r\r
+        currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
+        nextWordPos = getNextWordStart(text, currentWordEnd);\r\r
+        sentanceIterator.setText(text);\r\r
+        sentanceIterator.following(currentWordPos);\r\r
+      } else moreTokens = false;\r\r
+    }\r\r
+  }\r\r
+\r\r
+  /** Returns the current text that is being tokenized (includes any changes\r\r
+   *  that have been made)\r\r
+   */\r\r
+  public String getContext() {\r\r
+    return text.toString();\r\r
+  }\r\r
+\r\r
+  /** Returns true iif the current word is at the start of a sentance*/\r\r
+  public boolean isNewSentance() {\r\r
+    return startsSentance;\r\r
+  }\r\r
+\r\r
+}
+\ No newline at end of file