[old-projects.git] / ekit / com / swabunga / spell / event / StringWordTokenizer.java

package com.swabunga.spell.event;\r\r
\r\r
import java.util.*;\r\r
import java.text.*;\r\r
\r\r
/** This class tokenizes a input string.\r\r
 *  <p>\r\r
 *  It also allows for the string to be mutated. The result after the spell\r\r
 *  checking is completed is available to the call to getFinalText</p>\r\r
 *\r\r
 * @author Jason Height (jheight@chariot.net.au)\r\r
 */\r\r
public class StringWordTokenizer implements WordTokenizer {\r\r
  /** Holds the start character position of the current word*/\r\r
  private int currentWordPos = 0;\r\r
  /** Holds the end character position of the current word*/\r\r
  private int currentWordEnd = 0;\r\r
  /** Holds the start character position of the next word*/\r\r
  private int nextWordPos = -1;\r\r
  /** The actual text that is being tokenized*/\r\r
  private StringBuffer text;\r\r
  /** The cumulative word count that have been processed*/\r\r
  private int wordCount = 0;\r\r
  /** Flag indicating if there are any more tokens (words) left*/\r\r
  private boolean moreTokens = true;\r\r
  /** Is this a special case where the currentWordStart, currntWordEnd and\r\r
   *  nextWordPos have already been calculated. (see nextWord)\r\r
   */\r\r
  private boolean first = true;\r\r
\r\r
  private BreakIterator sentanceIterator;\r\r
  private boolean startsSentance = true;\r\r
\r\r
\r\r
  public StringWordTokenizer(String text) {\r\r
    sentanceIterator = BreakIterator.getSentenceInstance();\r\r
    sentanceIterator.setText(text);\r\r
    sentanceIterator.first();\r\r
    //Wrap a string buffer to hopefully make things a bit easier and efficient to\r\r
    //replace words etc.\r\r
    this.text = new StringBuffer(text);\r\r
    currentWordPos = getNextWordStart(this.text, 0);\r\r
    //If the current word pos is -1 then the string was all white space\r\r
    if (currentWordPos != -1) {\r\r
      currentWordEnd = getNextWordEnd(this.text, currentWordPos);\r\r
      nextWordPos = getNextWordStart(this.text, currentWordEnd);\r\r
    } else {\r\r
      moreTokens = false;\r\r
    }\r\r
  }\r\r
\r\r
  /** This helper method will return the start character of the next\r\r
   * word in the buffer from the start position\r\r
   */\r\r
  private static int getNextWordStart(StringBuffer text, int startPos) {\r\r
    int size = text.length();\r\r
    for (int i=startPos;i<size;i++) {\r\r
      if (Character.isLetterOrDigit(text.charAt(i))) {\r\r
        return i;\r\r
      }\r\r
    }\r\r
    return -1;\r\r
  }\r\r
\r\r
  /** This helper method will return the end of the next word in the buffer.\r\r
   *\r\r
   */\r\r
  private static int getNextWordEnd(StringBuffer text, int startPos) {\r\r
    int size = text.length();\r\r
    for (int i=startPos;i<size;i++) {\r\r
      if (!Character.isLetterOrDigit(text.charAt(i))) {\r\r
        return i;\r\r
      }\r\r
    }\r\r
    return size;\r\r
  }\r\r
\r\r
\r\r
  /** Returns true if there are more words that can be processed in the string\r\r
   *\r\r
   */\r\r
  public boolean hasMoreWords() {\r\r
    return moreTokens;\r\r
  }\r\r
\r\r
  /** Returns the current character position in the text\r\r
   *\r\r
   */\r\r
  public int getCurrentWordPosition() {\r\r
    return currentWordPos;\r\r
  }\r\r
\r\r
  /** Returns the current end word position in the text\r\r
   *\r\r
   */\r\r
  public int getCurrentWordEnd() {\r\r
    return currentWordEnd;\r\r
  }\r\r
\r\r
  /** Returns the next word in the text\r\r
   *\r\r
   */\r\r
  public String nextWord() {\r\r
    if (!first) {\r\r
      currentWordPos = nextWordPos;\r\r
      currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
      nextWordPos = getNextWordStart(text, currentWordEnd+1);\r\r
      int current = sentanceIterator.current();\r\r
      if (current == currentWordPos)\r\r
        startsSentance = true;\r\r
      else {\r\r
        startsSentance = false;\r\r
        if (currentWordEnd > current)\r\r
          sentanceIterator.next();\r\r
      }\r\r
    }\r\r
    //The nextWordPos has already been populated\r\r
    String word = text.substring(currentWordPos, currentWordEnd);\r\r
    wordCount++;\r\r
    first = false;\r\r
    if (nextWordPos == -1)\r\r
      moreTokens = false;\r\r
    return word;\r\r
  }\r\r
\r\r
  /** Returns the current number of words that have been processed\r\r
   *\r\r
   */\r\r
  public int getCurrentWordCount() {\r\r
    return wordCount;\r\r
  }\r\r
\r\r
  /** Replaces the current word token*/\r\r
  public void replaceWord(String newWord) {\r\r
    if (currentWordPos != -1) {\r\r
      text.replace(currentWordPos, currentWordEnd, newWord);\r\r
      //Position after the newly replaced word(s)\r\r
      first = true;\r\r
      currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());\r\r
      if (currentWordPos != -1) {\r\r
        currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
        nextWordPos = getNextWordStart(text, currentWordEnd);\r\r
        sentanceIterator.setText(text.toString());\r\r
        sentanceIterator.following(currentWordPos);\r\r
      } else moreTokens = false;\r\r
    }\r\r
  }\r\r
\r\r
  /** returns true iif the current word is at the start of a sentance*/\r\r
  public boolean isNewSentance() {\r\r
    return startsSentance;\r\r
  }\r\r
\r\r
  /** Returns the current text that is being tokenized (includes any changes\r\r
   *  that have been made)\r\r
   */\r\r
  public String getContext() {\r\r
    return text.toString();\r\r
  }\r\r
\r\r
  /** This method can be used to return the final text after the schecking is complete.*/\r\r
  public String getFinalText() {\r\r
    return getContext();\r\r
  }\r\r
\r\r
\r\r
  public static void main(String args[]) {\r\r
    StringWordTokenizer t = new StringWordTokenizer("  This is a  test   problem");\r\r
    while(t.hasMoreWords()) {\r\r
      String word = t.nextWord();\r\r
      System.out.println("Word is '"+word+"'");\r\r
      if ("test".equals(word)) t.replaceWord("mightly big");\r\r
    }\r\r
    System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
\r\r
    t = new StringWordTokenizer("    README   ");\r\r
    while(t.hasMoreWords()) {\r\r
      String word = t.nextWord();\r\r
      System.out.println("Word is '"+word+"'");\r\r
    }\r\r
    System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
\r\r
    t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");\r\r
    while(t.hasMoreWords()) {\r\r
      String word = t.nextWord();\r\r
      System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());\r\r
      if (word.equals("acronym"))\r\r
        t.replaceWord("test");\r\r
    }\r\r
  }\r\r
}
Commit	Line	Data
	1	package com.swabunga.spell.event;\r\r
	2	\r\r
	3	import java.util.*;\r\r
	4	import java.text.*;\r\r
	5	\r\r
	6	/** This class tokenizes a input string.\r\r
	7	* <p>\r\r
	8	* It also allows for the string to be mutated. The result after the spell\r\r
	9	* checking is completed is available to the call to getFinalText</p>\r\r
	10	*\r\r
	11	* @author Jason Height (jheight@chariot.net.au)\r\r
	12	*/\r\r
	13	public class StringWordTokenizer implements WordTokenizer {\r\r
	14	/** Holds the start character position of the current word*/\r\r
	15	private int currentWordPos = 0;\r\r
	16	/** Holds the end character position of the current word*/\r\r
	17	private int currentWordEnd = 0;\r\r
	18	/** Holds the start character position of the next word*/\r\r
	19	private int nextWordPos = -1;\r\r
	20	/** The actual text that is being tokenized*/\r\r
	21	private StringBuffer text;\r\r
	22	/** The cumulative word count that have been processed*/\r\r
	23	private int wordCount = 0;\r\r
	24	/** Flag indicating if there are any more tokens (words) left*/\r\r
	25	private boolean moreTokens = true;\r\r
	26	/** Is this a special case where the currentWordStart, currntWordEnd and\r\r
	27	* nextWordPos have already been calculated. (see nextWord)\r\r
	28	*/\r\r
	29	private boolean first = true;\r\r
	30	\r\r
	31	private BreakIterator sentanceIterator;\r\r
	32	private boolean startsSentance = true;\r\r
	33	\r\r
	34	\r\r
	35	public StringWordTokenizer(String text) {\r\r
	36	sentanceIterator = BreakIterator.getSentenceInstance();\r\r
	37	sentanceIterator.setText(text);\r\r
	38	sentanceIterator.first();\r\r
	39	//Wrap a string buffer to hopefully make things a bit easier and efficient to\r\r
	40	//replace words etc.\r\r
	41	this.text = new StringBuffer(text);\r\r
	42	currentWordPos = getNextWordStart(this.text, 0);\r\r
	43	//If the current word pos is -1 then the string was all white space\r\r
	44	if (currentWordPos != -1) {\r\r
	45	currentWordEnd = getNextWordEnd(this.text, currentWordPos);\r\r
	46	nextWordPos = getNextWordStart(this.text, currentWordEnd);\r\r
	47	} else {\r\r
	48	moreTokens = false;\r\r
	49	}\r\r
	50	}\r\r
	51	\r\r
	52	/** This helper method will return the start character of the next\r\r
	53	* word in the buffer from the start position\r\r
	54	*/\r\r
	55	private static int getNextWordStart(StringBuffer text, int startPos) {\r\r
	56	int size = text.length();\r\r
	57	for (int i=startPos;i<size;i++) {\r\r
	58	if (Character.isLetterOrDigit(text.charAt(i))) {\r\r
	59	return i;\r\r
	60	}\r\r
	61	}\r\r
	62	return -1;\r\r
	63	}\r\r
	64	\r\r
	65	/** This helper method will return the end of the next word in the buffer.\r\r
	66	*\r\r
	67	*/\r\r
	68	private static int getNextWordEnd(StringBuffer text, int startPos) {\r\r
	69	int size = text.length();\r\r
	70	for (int i=startPos;i<size;i++) {\r\r
	71	if (!Character.isLetterOrDigit(text.charAt(i))) {\r\r
	72	return i;\r\r
	73	}\r\r
	74	}\r\r
	75	return size;\r\r
	76	}\r\r
	77	\r\r
	78	\r\r
	79	/** Returns true if there are more words that can be processed in the string\r\r
	80	*\r\r
	81	*/\r\r
	82	public boolean hasMoreWords() {\r\r
	83	return moreTokens;\r\r
	84	}\r\r
	85	\r\r
	86	/** Returns the current character position in the text\r\r
	87	*\r\r
	88	*/\r\r
	89	public int getCurrentWordPosition() {\r\r
	90	return currentWordPos;\r\r
	91	}\r\r
	92	\r\r
	93	/** Returns the current end word position in the text\r\r
	94	*\r\r
	95	*/\r\r
	96	public int getCurrentWordEnd() {\r\r
	97	return currentWordEnd;\r\r
	98	}\r\r
	99	\r\r
	100	/** Returns the next word in the text\r\r
	101	*\r\r
	102	*/\r\r
	103	public String nextWord() {\r\r
	104	if (!first) {\r\r
	105	currentWordPos = nextWordPos;\r\r
	106	currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
	107	nextWordPos = getNextWordStart(text, currentWordEnd+1);\r\r
	108	int current = sentanceIterator.current();\r\r
	109	if (current == currentWordPos)\r\r
	110	startsSentance = true;\r\r
	111	else {\r\r
	112	startsSentance = false;\r\r
	113	if (currentWordEnd > current)\r\r
	114	sentanceIterator.next();\r\r
	115	}\r\r
	116	}\r\r
	117	//The nextWordPos has already been populated\r\r
	118	String word = text.substring(currentWordPos, currentWordEnd);\r\r
	119	wordCount++;\r\r
	120	first = false;\r\r
	121	if (nextWordPos == -1)\r\r
	122	moreTokens = false;\r\r
	123	return word;\r\r
	124	}\r\r
	125	\r\r
	126	/** Returns the current number of words that have been processed\r\r
	127	*\r\r
	128	*/\r\r
	129	public int getCurrentWordCount() {\r\r
	130	return wordCount;\r\r
	131	}\r\r
	132	\r\r
	133	/** Replaces the current word token*/\r\r
	134	public void replaceWord(String newWord) {\r\r
	135	if (currentWordPos != -1) {\r\r
	136	text.replace(currentWordPos, currentWordEnd, newWord);\r\r
	137	//Position after the newly replaced word(s)\r\r
	138	first = true;\r\r
	139	currentWordPos = getNextWordStart(text, currentWordPos+newWord.length());\r\r
	140	if (currentWordPos != -1) {\r\r
	141	currentWordEnd = getNextWordEnd(text, currentWordPos);\r\r
	142	nextWordPos = getNextWordStart(text, currentWordEnd);\r\r
	143	sentanceIterator.setText(text.toString());\r\r
	144	sentanceIterator.following(currentWordPos);\r\r
	145	} else moreTokens = false;\r\r
	146	}\r\r
	147	}\r\r
	148	\r\r
	149	/** returns true iif the current word is at the start of a sentance*/\r\r
	150	public boolean isNewSentance() {\r\r
	151	return startsSentance;\r\r
	152	}\r\r
	153	\r\r
	154	/** Returns the current text that is being tokenized (includes any changes\r\r
	155	* that have been made)\r\r
	156	*/\r\r
	157	public String getContext() {\r\r
	158	return text.toString();\r\r
	159	}\r\r
	160	\r\r
	161	/** This method can be used to return the final text after the schecking is complete.*/\r\r
	162	public String getFinalText() {\r\r
	163	return getContext();\r\r
	164	}\r\r
	165	\r\r
	166	\r\r
	167	public static void main(String args[]) {\r\r
	168	StringWordTokenizer t = new StringWordTokenizer(" This is a test problem");\r\r
	169	while(t.hasMoreWords()) {\r\r
	170	String word = t.nextWord();\r\r
	171	System.out.println("Word is '"+word+"'");\r\r
	172	if ("test".equals(word)) t.replaceWord("mightly big");\r\r
	173	}\r\r
	174	System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
	175	\r\r
	176	t = new StringWordTokenizer(" README ");\r\r
	177	while(t.hasMoreWords()) {\r\r
	178	String word = t.nextWord();\r\r
	179	System.out.println("Word is '"+word+"'");\r\r
	180	}\r\r
	181	System.out.println("End text is: '"+t.getFinalText()+"'");\r\r
	182	\r\r
	183	t = new StringWordTokenizer("This is a acronym (A.C.M.E). This is the second sentance.");\r\r
	184	while(t.hasMoreWords()) {\r\r
	185	String word = t.nextWord();\r\r
	186	System.out.println("Word is '"+word+"'. Starts Sentance?="+t.isNewSentance());\r\r
	187	if (word.equals("acronym"))\r\r
	188	t.replaceWord("test");\r\r
	189	}\r\r
	190	}\r\r
	191	}