make javascript smaller, and nicer
[platal.git] / htdocs / TESTS / simpletest / parser.php
1 <?php
2 /**
3 * base include file for SimpleTest
4 * @package SimpleTest
5 * @subpackage MockObjects
6 * @version $Id: parser.php,v 1.61 2004/08/11 16:14:37 lastcraft Exp $
7 */
8
/**#@+
 *    Lexer event codes. One of these is passed as the
 *    second argument of every parser handler call so
 *    that the handler knows why it received the token.
 */
define('LEXER_ENTER', 1);        // Token that caused a new mode to be entered.
define('LEXER_MATCHED', 2);      // Token recognised without any mode change.
define('LEXER_UNMATCHED', 3);    // Text lying between recognised tokens.
define('LEXER_EXIT', 4);         // Token that caused the current mode to be left.
define('LEXER_SPECIAL', 5);      // Token handled by a one-off (single token) mode.
/**#@-*/
18
/**
 *    Compounded regular expression. Any of
 *    the contained patterns could match and
 *    when one does, its label is returned.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class ParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     *    Constructor. Starts with no patterns.
     *    @param boolean $case    True for case sensitive, false
     *                            for insensitive.
     *    @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     *    Legacy (PHP 4 style) constructor name. Kept so that
     *    existing code calling it explicitly keeps working;
     *    it simply forwards to __construct().
     *    @param boolean $case    True for case sensitive, false
     *                            for insensitive.
     *    @access public
     */
    function ParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     *    Adds a pattern with an optional label.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $label        Label of regex to be returned
     *                                on a match. Defaults to true.
     *    @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        // Invalidate the cached compound expression.
        $this->_regex = null;
    }

    /**
     *    Attempts to match all patterns at once against
     *    a string.
     *    @param string $subject      String to match against.
     *    @param string $match        First matched portion of
     *                                subject. Set to the empty
     *                                string when nothing matches.
     *    @return boolean/string      False if there is no match,
     *                                otherwise the label of the
     *                                pattern that matched (true
     *                                for unlabelled patterns).
     *    @access public
     */
    function match($subject, &$match) {
        $match = '';
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }
        $match = $matches[0];
        // Each pattern is its own capture group, so the first
        // non-empty group identifies the pattern that fired.
        // Compare against '' explicitly: a token such as "0"
        // is falsy but is still a genuine match.
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     *    Compounds the patterns into a single
     *    regular expression separated with the
     *    "or" operator. Caches the regex.
     *    Will automatically escape (, ) and / tokens.
     *    The stored patterns are left untouched so that
     *    the expression can be rebuilt correctly after
     *    further patterns have been added.
     *    @return string       Complete Perl style regex.
     *    @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     *    Accessor for perl regex mode flags to use.
     *    @return string       Perl regex flags.
     *    @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
116
/**
 *    States for a stack machine.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleStateStack {
    var $_stack;

    /**
     *    Constructor. Starts in named state.
     *    @param string $start        Starting state name.
     *    @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     *    Legacy (PHP 4 style) constructor name. Kept so that
     *    existing code calling it explicitly keeps working;
     *    it simply forwards to __construct().
     *    @param string $start        Starting state name.
     *    @access public
     */
    function SimpleStateStack($start) {
        $this->__construct($start);
    }

    /**
     *    Accessor for current state, which is the
     *    most recently entered one.
     *    @return string       State.
     *    @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     *    Adds a state to the stack and sets it
     *    to be the current state.
     *    @param string $state        New state.
     *    @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     *    Leaves the current state and reverts
     *    to the previous one. The starting state
     *    is never removed.
     *    @return boolean    False if we drop off
     *                       the bottom of the list.
     *    @access public
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
168
/**
 *    Accepts text and breaks it into tokens.
 *    Some optimisation to make sure the
 *    content is only scanned by the PHP regex
 *    parser once. Lexer modes must not start
 *    with leading underscores.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleLexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     *    Sets up the lexer in case insensitive matching
     *    by default.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     *    Legacy (PHP 4 style) constructor name. Kept so that
     *    existing code calling it explicitly keeps working;
     *    it simply forwards to __construct().
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function SimpleLexer(&$parser, $start = "accept", $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     *    Adds a token search pattern for a particular
     *    parsing mode. The pattern does not change the
     *    current mode.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @access public
     */
    function addPattern($pattern, $mode = "accept") {
        $this->_registerPattern($pattern, $mode, true, $mode);
    }

    /**
     *    Adds a pattern that will enter a new parsing
     *    mode. Useful for entering parenthesis, strings,
     *    tags, etc.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $new_mode     Change parsing to this new
     *                                nested mode.
     *    @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_registerPattern($pattern, $mode, $new_mode, $new_mode);
    }

    /**
     *    Adds a pattern that will exit the current mode
     *    and re-enter the previous one.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Mode to leave.
     *    @access public
     */
    function addExitPattern($pattern, $mode) {
        $this->_registerPattern($pattern, $mode, "__exit", $mode);
    }

    /**
     *    Adds a pattern that has a special mode. Acts as an entry
     *    and exit pattern in one go, effectively calling a special
     *    parser handler for this token only.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $special      Use this mode for this one token.
     *    @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        $this->_registerPattern($pattern, $mode, "_$special", $special);
    }

    /**
     *    Adds a mapping from a mode to another handler.
     *    @param string $mode        Mode to be remapped.
     *    @param string $handler     New target handler.
     *    @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     *    Splits the page text into tokens. Will fail
     *    if the handlers report an error or if no
     *    content is consumed. If successful then each
     *    unparsed and parsed token invokes a call to the
     *    held listener.
     *    @param string $raw        Raw HTML text.
     *    @return boolean           True on success, else false.
     *    @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // A match that eats nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // No further tokens: whatever is left is plain content.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     *    Stores a pattern under the mode it is searched in,
     *    creating the compound regex for that mode on first
     *    use, and makes sure the mode that will receive the
     *    token has a handler (defaulting to a parser method
     *    named after the mode).
     *    @param string $pattern        Perl style regex, but ( and )
     *                                  lose the usual meaning.
     *    @param string $mode           Mode the pattern is active in.
     *    @param mixed $label           Label returned by the regex on
     *                                  a match; true for a plain token.
     *    @param string $handled_mode   Mode needing a default handler.
     *    @access private
     */
    function _registerPattern($pattern, $mode, $label, $handled_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $label);
        if (! isset($this->_mode_handlers[$handled_mode])) {
            $this->_mode_handlers[$handled_mode] = $handled_mode;
        }
    }

    /**
     *    Sends the matched token and any leading unmatched
     *    text to the parser changing the lexer to a new
     *    mode if one is listed.
     *    @param string $unmatched    Unmatched leading portion.
     *    @param string $matched      Actual token match.
     *    @param string $mode         Mode after match. A boolean
     *                                mode causes no change.
     *    @return boolean             False if there was any error
     *                                from the parser.
     *    @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        // The exit marker also starts with an underscore, so it
        // has to be tested before the special mode check.
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     *    Tests to see if the new mode is actually to leave
     *    the current mode and pop an item from the matching
     *    mode stack.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is the exit mode.
     *    @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     *    Test to see if the mode is one where this mode
     *    is entered for this token only and automatically
     *    leaves immediately afterwards.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is a single
     *                           token mode.
     *    @access private
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     *    Strips the magic underscore marking single token
     *    modes.
     *    @param string $mode    Mode to decode.
     *    @return string         Underlying mode name.
     *    @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     *    Calls the parser method named after the current
     *    mode. Empty content will be ignored. The lexer
     *    has a parser handler for each mode in the lexer.
     *    @param string $content        Text parsed.
     *    @param integer $is_match      One of the LEXER_* event
     *                                  codes describing the token.
     *    @return boolean               Result of the handler, or
     *                                  true for empty content.
     *    @access private
     */
    function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     *    Tries to match a chunk of text and if successful
     *    removes the recognised chunk and any leading
     *    unparsed data.
     *    @param string $raw         The subject to parse. This is the
     *                               content that will be eaten.
     *    @return array/boolean      Four item list of the remaining
     *                               content, the unparsed leading
     *                               content, the recognised token
     *                               and finally the action the parser
     *                               is to take. True if no match.
     *    @access private
     */
    function _reduce($raw) {
        $current = $this->_mode->getCurrent();
        // A mode without any registered pattern can never match;
        // report "no match" so the remainder is passed on as
        // unmatched content instead of calling match() on nothing.
        if (! isset($this->_regexes[$current])) {
            return true;
        }
        if ($action = $this->_regexes[$current]->match($raw, $match)) {
            // The token is located by its text, i.e. at its first
            // literal occurrence in the subject. An empty token is
            // taken to sit at the very start (strpos() complains
            // about an empty needle on older PHP versions).
            $unparsed_character_count = ($match === '' ? 0 : strpos($raw, $match));
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}
442
/**
 *    Converts HTML tokens into selected SAX events.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleSaxParser {
    var $_lexer;
    var $_listener;
    var $_tag;
    var $_attributes;
    var $_current_attribute;

    /**
     *    Sets the listener.
     *    @param SimpleSaxListener $listener    SAX event handler.
     *    @access public
     */
    function __construct(&$listener) {
        $this->_listener = &$listener;
        $this->_lexer = $this->createLexer($this);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     *    Legacy (PHP 4 style) constructor name. Kept so that
     *    existing code calling it explicitly keeps working;
     *    it simply forwards to __construct().
     *    @param SimpleSaxListener $listener    SAX event handler.
     *    @access public
     */
    function SimpleSaxParser(&$listener) {
        $this->__construct($listener);
    }

    /**
     *    Runs the content through the lexer which
     *    should call back to the acceptors.
     *    @param string $raw      Page text to parse.
     *    @return boolean         False if parse error.
     *    @access public
     */
    function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     *    Sets up the matching lexer. Starts in 'text' mode.
     *    @param SimpleSaxParser $parser    Event generator, usually $self.
     *    @return SimpleLexer               Lexer suitable for this parser.
     *    @access public
     *    @static
     */
    static function &createLexer(&$parser) {
        $lexer = new SimpleLexer($parser, 'text');
        $lexer->mapHandler('text', 'acceptTextToken');
        self::_addSkipping($lexer);
        foreach (self::_getParsedTags() as $tag) {
            self::_addTag($lexer, $tag);
        }
        self::_addInTagTokens($lexer);
        return $lexer;
    }

    /**
     *    List of parsed tags. Others are ignored.
     *    @return array        List of searched for tags.
     *    @access private
     *    @static
     */
    static function _getParsedTags() {
        return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame');
    }

    /**
     *    The lexer has to skip certain sections such
     *    as server code, client code and styles.
     *    @param SimpleLexer $lexer        Lexer to add patterns to.
     *    @access private
     *    @static
     */
    static function _addSkipping(&$lexer) {
        $lexer->mapHandler('css', 'ignore');
        $lexer->addEntryPattern('<style', 'text', 'css');
        $lexer->addExitPattern('</style>', 'css');
        $lexer->mapHandler('js', 'ignore');
        $lexer->addEntryPattern('<script', 'text', 'js');
        $lexer->addExitPattern('</script>', 'js');
        $lexer->mapHandler('comment', 'ignore');
        $lexer->addEntryPattern('<!--', 'text', 'comment');
        $lexer->addExitPattern('-->', 'comment');
    }

    /**
     *    Pattern matches to start and end a tag.
     *    @param SimpleLexer $lexer        Lexer to add patterns to.
     *    @param string $tag               Name of tag to scan for.
     *    @access private
     *    @static
     */
    static function _addTag(&$lexer, $tag) {
        $lexer->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        // The word boundary stops a short name from firing on a
        // longer one, e.g. "<a" on "<abbr" or "<address". It is
        // zero width, so the token handed on is still "<$tag".
        $lexer->addEntryPattern("<$tag\\b", 'text', 'tag');
    }

    /**
     *    Pattern matches to parse the inside of a tag
     *    including the attributes and their quoting.
     *    @param SimpleLexer $lexer        Lexer to add patterns to.
     *    @access private
     *    @static
     */
    static function _addInTagTokens(&$lexer) {
        $lexer->mapHandler('tag', 'acceptStartToken');
        $lexer->addSpecialPattern('\s+', 'tag', 'ignore');
        self::_addAttributeTokens($lexer);
        $lexer->addExitPattern('>', 'tag');
    }

    /**
     *    Matches attributes that are either single quoted,
     *    double quoted or unquoted.
     *    @param SimpleLexer $lexer        Lexer to add patterns to.
     *    @access private
     *    @static
     */
    static function _addAttributeTokens(&$lexer) {
        $lexer->mapHandler('dq_attribute', 'acceptAttributeToken');
        $lexer->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        // Backslash escaped quote: must not end the attribute.
        $lexer->addPattern("\\\\\"", 'dq_attribute');
        $lexer->addExitPattern('"', 'dq_attribute');
        $lexer->mapHandler('sq_attribute', 'acceptAttributeToken');
        $lexer->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $lexer->addPattern("\\\\'", 'sq_attribute');
        $lexer->addExitPattern("'", 'sq_attribute');
        $lexer->mapHandler('uq_attribute', 'acceptAttributeToken');
        $lexer->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }

    /**
     *    Accepts a token from the tag mode. If the
     *    starting element completes then the element
     *    is dispatched and the current attributes
     *    set back to empty. The element or attribute
     *    name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            // Drop the leading "<" to get the element name.
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        // Anything else inside the tag is an attribute name. Its
        // value starts out empty and is filled in by the
        // attribute handler if a value follows.
        if ($token !== '=') {
            $this->_current_attribute = strtolower($this->_decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     *    Accepts a token from the end tag mode.
     *    The element name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     *    Part of the tag data. Unmatched text inside a
     *    quoted attribute and the whole of an unquoted
     *    attribute (minus the leading equals sign) are
     *    appended to the current attribute value. All
     *    other events are ignored.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptAttributeToken($token, $event) {
        if ($event == LEXER_UNMATCHED) {
            $this->_appendToCurrentAttribute($this->_decodeHtml($token));
        }
        if ($event == LEXER_SPECIAL) {
            $this->_appendToCurrentAttribute(
                    preg_replace('/^=\s*/' , '', $this->_decodeHtml($token)));
        }
        return true;
    }

    /**
     *    A character entity. No lexer mode is mapped to
     *    this handler by createLexer(), so the token is
     *    simply accepted.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptEntityToken($token, $event) {
        return true;
    }

    /**
     *    Character data between tags regarded as
     *    important.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     *    Incoming data to be ignored.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function ignore($token, $event) {
        return true;
    }

    /**
     *    Adds text to the value of the attribute currently
     *    being read. A value can arrive before any attribute
     *    name has been seen in this tag (e.g. a stray "=" at
     *    the start), so the slot is created on demand rather
     *    than appending to an undefined array entry.
     *    @param string $text      Decoded text to append.
     *    @access private
     */
    function _appendToCurrentAttribute($text) {
        if (! isset($this->_attributes[$this->_current_attribute])) {
            $this->_attributes[$this->_current_attribute] = '';
        }
        $this->_attributes[$this->_current_attribute] .= $text;
    }

    /**
     *    Decodes any HTML entities, including numeric
     *    ones and both kinds of quote.
     *    @param string $html        Incoming HTML.
     *    @return string             Outgoing plain text.
     *    @access private
     */
    function _decodeHtml($html) {
        return html_entity_decode($html, ENT_QUOTES);
    }
}
682
/**
 *    SAX event handler. Every handler here is a no-op
 *    that reports success, so a subclass only has to
 *    override the events it cares about.
 *    @package SimpleTest
 *    @subpackage WebTester
 *    @abstract
 */
class SimpleSaxListener {

    /**
     *    Does nothing; there is no state to set up.
     *    @access public
     */
    function __construct() {
    }

    /**
     *    Legacy (PHP 4 style) constructor name. Kept so that
     *    subclasses calling it explicitly keep working.
     *    @access public
     */
    function SimpleSaxListener() {
    }

    /**
     *    Start of element event.
     *    @param string $name        Element name.
     *    @param hash $attributes    Name value pairs.
     *                               SimpleSaxParser passes an
     *                               empty string as the value of
     *                               attributes without content.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function startElement($name, $attributes) {
        return true;
    }

    /**
     *    End of element event.
     *    @param string $name        Element name.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function endElement($name) {
        return true;
    }

    /**
     *    Unparsed, but relevant data.
     *    @param string $text        May include unparsed tags.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function addContent($text) {
        return true;
    }
}
728 ?>