3 * base include file for SimpleTest
5 * @subpackage MockObjects
6 * @version $Id: parser.php,v 1.61 2004/08/11 16:14:37 lastcraft Exp $
10 * Lexer mode stack constants
12 define("LEXER_ENTER", 1);
13 define("LEXER_MATCHED", 2);
14 define("LEXER_UNMATCHED", 3);
15 define("LEXER_EXIT", 4);
16 define("LEXER_SPECIAL", 5);
20 * Compounded regular expression. Any of
21 * the contained patterns could match and
22 * when one does its label is returned.
24 * @subpackage WebTester
33 * Constructor. Starts with no patterns.
34 * @param boolean $case True for case sensitive, false
38 function ParallelRegex($case) {
40 $this->_patterns
= array();
41 $this->_labels
= array();
46 * Adds a pattern with an optional label.
47 * @param string $pattern Perl style regex, but ( and )
48 * lose the usual meaning.
49 * @param string $label Label of regex to be returned
53 function addPattern($pattern, $label = true
) {
54 $count = count($this->_patterns
);
55 $this->_patterns
[$count] = $pattern;
56 $this->_labels
[$count] = $label;
61 * Attempts to match all patterns at once against
63 * @param string $subject String to match against.
64 * @param string $match First matched portion of
66 * @return boolean True on success.
69 function match($subject, &$match) {
70 if (count($this->_patterns
) == 0) {
73 if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
78 for ($i = 1; $i < count($matches); $i++
) {
80 return $this->_labels
[$i - 1];
87 * Compounds the patterns into a single
88 * regular expression separated with the
89 * "or" operator. Caches the regex.
90 * Will automatically escape (, ) and / tokens.
91 * @param array $patterns List of patterns in order.
94 function _getCompoundedRegex() {
95 if ($this->_regex
== null
) {
96 for ($i = 0; $i < count($this->_patterns
); $i++
) {
97 $this->_patterns
[$i] = '(' . str_replace(
99 array('\/', '\(', '\)'),
100 $this->_patterns
[$i]) . ')';
102 $this->_regex
= "/" . implode("|", $this->_patterns
) . "/" . $this->_getPerlMatchingFlags();
104 return $this->_regex
;
108 * Accessor for perl regex mode flags to use.
109 * @return string Perl regex flags.
112 function _getPerlMatchingFlags() {
113 return ($this->_case ?
"msS" : "msSi");
118 * States for a stack machine.
119 * @package SimpleTest
120 * @subpackage WebTester
122 class SimpleStateStack
{
126 * Constructor. Starts in named state.
127 * @param string $start Starting state name.
130 function SimpleStateStack($start) {
131 $this->_stack
= array($start);
135 * Accessor for current state.
136 * @return string State.
139 function getCurrent() {
140 return $this->_stack
[count($this->_stack
) - 1];
144 * Adds a state to the stack and sets it
145 * to be the current state.
146 * @param string $state New state.
149 function enter($state) {
150 array_push($this->_stack
, $state);
154 * Leaves the current state and reverts
155 * to the previous one.
156 * @return boolean False if we drop off
157 * the bottom of the list.
161 if (count($this->_stack
) == 1) {
164 array_pop($this->_stack
);
170 * Accepts text and breaks it into tokens.
171 * Some optimisation to make sure the
172 * content is only scanned by the PHP regex
173 * parser once. Lexer modes must not start
174 * with leading underscores.
175 * @package SimpleTest
176 * @subpackage WebTester
186 * Sets up the lexer in case insensitive matching
188 * @param SimpleSaxParser $parser Handling strategy by
190 * @param string $start Starting handler.
191 * @param boolean $case True for case sensitive.
194 function SimpleLexer(&$parser, $start = "accept", $case = false
) {
195 $this->_case
= $case;
196 $this->_regexes
= array();
197 $this->_parser
= &$parser;
198 $this->_mode
= &new SimpleStateStack($start);
199 $this->_mode_handlers
= array($start => $start);
203 * Adds a token search pattern for a particular
204 * parsing mode. The pattern does not change the
206 * @param string $pattern Perl style regex, but ( and )
207 * lose the usual meaning.
208 * @param string $mode Should only apply this
209 * pattern when dealing with
210 * this type of input.
213 function addPattern($pattern, $mode = "accept") {
214 if (! isset($this->_regexes
[$mode])) {
215 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
217 $this->_regexes
[$mode]->addPattern($pattern);
218 if (! isset($this->_mode_handlers
[$mode])) {
219 $this->_mode_handlers
[$mode] = $mode;
224 * Adds a pattern that will enter a new parsing
225 * mode. Useful for entering parenthesis, strings,
227 * @param string $pattern Perl style regex, but ( and )
228 * lose the usual meaning.
229 * @param string $mode Should only apply this
230 * pattern when dealing with
231 * this type of input.
232 * @param string $new_mode Change parsing to this new
236 function addEntryPattern($pattern, $mode, $new_mode) {
237 if (! isset($this->_regexes
[$mode])) {
238 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
240 $this->_regexes
[$mode]->addPattern($pattern, $new_mode);
241 if (! isset($this->_mode_handlers
[$new_mode])) {
242 $this->_mode_handlers
[$new_mode] = $new_mode;
247 * Adds a pattern that will exit the current mode
248 * and re-enter the previous one.
249 * @param string $pattern Perl style regex, but ( and )
250 * lose the usual meaning.
251 * @param string $mode Mode to leave.
254 function addExitPattern($pattern, $mode) {
255 if (! isset($this->_regexes
[$mode])) {
256 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
258 $this->_regexes
[$mode]->addPattern($pattern, "__exit");
259 if (! isset($this->_mode_handlers
[$mode])) {
260 $this->_mode_handlers
[$mode] = $mode;
265 * Adds a pattern that has a special mode. Acts as an entry
266 * and exit pattern in one go, effectively calling a special
267 * parser handler for this token only.
268 * @param string $pattern Perl style regex, but ( and )
269 * lose the usual meaning.
270 * @param string $mode Should only apply this
271 * pattern when dealing with
272 * this type of input.
273 * @param string $special Use this mode for this one token.
276 function addSpecialPattern($pattern, $mode, $special) {
277 if (! isset($this->_regexes
[$mode])) {
278 $this->_regexes
[$mode] = new ParallelRegex($this->_case
);
280 $this->_regexes
[$mode]->addPattern($pattern, "_$special");
281 if (! isset($this->_mode_handlers
[$special])) {
282 $this->_mode_handlers
[$special] = $special;
287 * Adds a mapping from a mode to another handler.
288 * @param string $mode Mode to be remapped.
289 * @param string $handler New target handler.
292 function mapHandler($mode, $handler) {
293 $this->_mode_handlers
[$mode] = $handler;
297 * Splits the page text into tokens. Will fail
298 * if the handlers report an error or if no
299 * content is consumed. If successful then each
300 * unparsed and parsed token invokes a call to the
302 * @param string $raw Raw HTML text.
303 * @return boolean True on success, else false.
306 function parse($raw) {
307 if (! isset($this->_parser
)) {
310 $length = strlen($raw);
311 while (is_array($parsed = $this->_reduce($raw))) {
312 list($raw, $unmatched, $matched, $mode) = $parsed;
313 if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
319 if (strlen($raw) == $length) {
322 $length = strlen($raw);
327 return $this->_invokeParser($raw, LEXER_UNMATCHED
);
331 * Sends the matched token and any leading unmatched
332 * text to the parser changing the lexer to a new
333 * mode if one is listed.
334 * @param string $unmatched Unmatched leading portion.
335 * @param string $matched Actual token match.
336 * @param string $mode Mode after match. A boolean
337 * false mode causes no change.
338 * @return boolean False if there was any error
342 function _dispatchTokens($unmatched, $matched, $mode = false
) {
343 if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED
)) {
346 if (is_bool($mode)) {
347 return $this->_invokeParser($matched, LEXER_MATCHED
);
349 if ($this->_isModeEnd($mode)) {
350 if (! $this->_invokeParser($matched, LEXER_EXIT
)) {
353 return $this->_mode
->leave();
355 if ($this->_isSpecialMode($mode)) {
356 $this->_mode
->enter($this->_decodeSpecial($mode));
357 if (! $this->_invokeParser($matched, LEXER_SPECIAL
)) {
360 return $this->_mode
->leave();
362 $this->_mode
->enter($mode);
363 return $this->_invokeParser($matched, LEXER_ENTER
);
367 * Tests to see if the new mode is actually to leave
368 * the current mode and pop an item from the matching
370 * @param string $mode Mode to test.
371 * @return boolean True if this is the exit mode.
374 function _isModeEnd($mode) {
375 return ($mode === "__exit");
379 * Test to see if the mode is one where this mode
380 * is entered for this token only and automatically
381 * leaves immediately afterwards.
382 * @param string $mode Mode to test.
383 * @return boolean True if this is a single token (special) mode.
386 function _isSpecialMode($mode) {
387 return (strncmp($mode, "_", 1) == 0);
391 * Strips the magic underscore marking single token
393 * @param string $mode Mode to decode.
394 * @return string Underlying mode name.
397 function _decodeSpecial($mode) {
398 return substr($mode, 1);
402 * Calls the parser method named after the current
403 * mode. Empty content will be ignored. The lexer
404 * has a parser handler for each mode in the lexer.
405 * @param string $content Text parsed.
406 * @param integer $is_match Lexer event type, one of the
407 * LEXER_* constants.
410 function _invokeParser($content, $is_match) {
411 if (($content === '') ||
($content === false
)) {
414 $handler = $this->_mode_handlers
[$this->_mode
->getCurrent()];
415 return $this->_parser
->$handler($content, $is_match);
419 * Tries to match a chunk of text and if successful
420 * removes the recognised chunk and any leading
421 * unparsed data. Empty strings will not be matched.
422 * @param string $raw The subject to parse. This is the
423 * content that will be eaten.
424 * @return array Four item list of the remaining
425 * raw text, the unparsed leading content,
426 * the recognised token and finally the
427 * action the parser is to take.
428 * True if no match, false if there
429 * is a parsing error.
432 function _reduce($raw) {
433 if ($action = $this->_regexes
[$this->_mode
->getCurrent()]->match($raw, $match)) {
434 $unparsed_character_count = strpos($raw, $match);
435 $unparsed = substr($raw, 0, $unparsed_character_count);
436 $raw = substr($raw, $unparsed_character_count +
strlen($match));
437 return array($raw, $unparsed, $match, $action);
444 * Converts HTML tokens into selected SAX events.
445 * @package SimpleTest
446 * @subpackage WebTester
448 class SimpleSaxParser
{
453 var $_current_attribute;
457 * @param SimpleSaxListener $listener SAX event handler.
460 function SimpleSaxParser(&$listener) {
461 $this->_listener
= &$listener;
462 $this->_lexer
= &$this->createLexer($this);
464 $this->_attributes
= array();
465 $this->_current_attribute
= '';
469 * Runs the content through the lexer which
470 * should call back to the acceptors.
471 * @param string $raw Page text to parse.
472 * @return boolean False if parse error.
475 function parse($raw) {
476 return $this->_lexer
->parse($raw);
480 * Sets up the matching lexer. Starts in 'text' mode.
481 * @param SimpleSaxParser $parser Event generator, usually $self.
482 * @return SimpleLexer Lexer suitable for this parser.
486 function &createLexer(&$parser) {
487 $lexer = &new SimpleLexer($parser, 'text');
488 $lexer->mapHandler('text', 'acceptTextToken');
489 SimpleSaxParser
::_addSkipping($lexer);
490 foreach (SimpleSaxParser
::_getParsedTags() as $tag) {
491 SimpleSaxParser
::_addTag($lexer, $tag);
493 SimpleSaxParser
::_addInTagTokens($lexer);
498 * List of parsed tags. Others are ignored.
499 * @return array List of searched for tags.
502 function _getParsedTags() {
503 return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
504 'option', 'frameset', 'frame');
508 * The lexer has to skip certain sections such
509 * as server code, client code and styles.
510 * @param SimpleLexer $lexer Lexer to add patterns to.
514 function _addSkipping(&$lexer) {
515 $lexer->mapHandler('css', 'ignore');
516 $lexer->addEntryPattern('<style', 'text', 'css');
517 $lexer->addExitPattern('</style>', 'css');
518 $lexer->mapHandler('js', 'ignore');
519 $lexer->addEntryPattern('<script', 'text', 'js');
520 $lexer->addExitPattern('</script>', 'js');
521 $lexer->mapHandler('comment', 'ignore');
522 $lexer->addEntryPattern('<!--', 'text', 'comment');
523 $lexer->addExitPattern('-->', 'comment');
527 * Pattern matches to start and end a tag.
528 * @param SimpleLexer $lexer Lexer to add patterns to.
529 * @param string $tag Name of tag to scan for.
533 function _addTag(&$lexer, $tag) {
534 $lexer->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
535 $lexer->addEntryPattern("<$tag", 'text', 'tag');
539 * Pattern matches to parse the inside of a tag
540 * including the attributes and their quoting.
541 * @param SimpleLexer $lexer Lexer to add patterns to.
545 function _addInTagTokens(&$lexer) {
546 $lexer->mapHandler('tag', 'acceptStartToken');
547 $lexer->addSpecialPattern('\s+', 'tag', 'ignore');
548 SimpleSaxParser
::_addAttributeTokens($lexer);
549 $lexer->addExitPattern('>', 'tag');
553 * Matches attributes that are either single quoted,
554 * double quoted or unquoted.
555 * @param SimpleLexer $lexer Lexer to add patterns to.
559 function _addAttributeTokens(&$lexer) {
560 $lexer->mapHandler('dq_attribute', 'acceptAttributeToken');
561 $lexer->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
562 $lexer->addPattern("\\\\\"", 'dq_attribute');
563 $lexer->addExitPattern('"', 'dq_attribute');
564 $lexer->mapHandler('sq_attribute', 'acceptAttributeToken');
565 $lexer->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
566 $lexer->addPattern("\\\\'", 'sq_attribute');
567 $lexer->addExitPattern("'", 'sq_attribute');
568 $lexer->mapHandler('uq_attribute', 'acceptAttributeToken');
569 $lexer->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
573 * Accepts a token from the tag mode. If the
574 * starting element completes then the element
575 * is dispatched and the current attributes
576 * set back to empty. The element or attribute
577 * name is converted to lower case.
578 * @param string $token Incoming characters.
579 * @param integer $event Lexer event type.
580 * @return boolean False if parse error.
583 function acceptStartToken($token, $event) {
584 if ($event == LEXER_ENTER
) {
585 $this->_tag
= strtolower(substr($token, 1));
588 if ($event == LEXER_EXIT
) {
589 $success = $this->_listener
->startElement(
593 $this->_attributes
= array();
597 $this->_current_attribute
= strtolower($this->_decodeHtml($token));
598 $this->_attributes
[$this->_current_attribute
] = "";
604 * Accepts a token from the end tag mode.
605 * The element name is converted to lower case.
606 * @param string $token Incoming characters.
607 * @param integer $event Lexer event type.
608 * @return boolean False if parse error.
611 function acceptEndToken($token, $event) {
612 if (! preg_match('/<\/(.*)>/', $token, $matches)) {
615 return $this->_listener
->endElement(strtolower($matches[1]));
619 * Part of the tag data.
620 * @param string $token Incoming characters.
621 * @param integer $event Lexer event type.
622 * @return boolean False if parse error.
625 function acceptAttributeToken($token, $event) {
626 if ($event == LEXER_UNMATCHED
) {
627 $this->_attributes
[$this->_current_attribute
] .=
628 $this->_decodeHtml($token);
630 if ($event == LEXER_SPECIAL
) {
631 $this->_attributes
[$this->_current_attribute
] .=
632 preg_replace('/^=\s*/' , '', $this->_decodeHtml($token));
638 * A character entity.
639 * @param string $token Incoming characters.
640 * @param integer $event Lexer event type.
641 * @return boolean False if parse error.
644 function acceptEntityToken($token, $event) {
648 * Character data between tags regarded as
650 * @param string $token Incoming characters.
651 * @param integer $event Lexer event type.
652 * @return boolean False if parse error.
655 function acceptTextToken($token, $event) {
656 return $this->_listener
->addContent($token);
660 * Incoming data to be ignored.
661 * @param string $token Incoming characters.
662 * @param integer $event Lexer event type.
663 * @return boolean False if parse error.
666 function ignore($token, $event) {
671 * Decodes any HTML entities.
672 * @param string $html Incoming HTML.
673 * @return string Outgoing plain text.
676 function _decodeHtml($html) {
679 array_flip(get_html_translation_table(HTML_ENTITIES
)));
685 * @package SimpleTest
686 * @subpackage WebTester
689 class SimpleSaxListener
{
692 * Sets the document to write to.
695 function SimpleSaxListener() {
699 * Start of element event.
700 * @param string $name Element name.
701 * @param hash $attributes Name value pairs.
702 * Attributes without content
703 * are marked as true.
704 * @return boolean False on parse error.
707 function startElement($name, $attributes) {
711 * End of element event.
712 * @param string $name Element name.
713 * @return boolean False on parse error.
716 function endElement($name) {
720 * Unparsed, but relevant data.
721 * @param string $text May include unparsed tags.
722 * @return boolean False on parse error.
725 function addContent($text) {