2 /* vim: set expandtab tabstop=4 shiftwidth=4: */
4 // +----------------------------------------------------------------------+
6 // +----------------------------------------------------------------------+
7 // | Copyright (c) 1997-2002 The PHP Group |
8 // +----------------------------------------------------------------------+
9 // | This source file is subject to version 2.02 of the PHP license, |
10 // | that is bundled with this package in the file LICENSE, and is |
11 // | available at through the world-wide-web at |
12 // | http://www.php.net/license/3_0.txt. |
13 // | If you did not receive a copy of the PHP license and are unable to |
14 // | obtain it through the world-wide-web, please send a note to |
15 // | license@php.net so we can mail you a copy immediately. |
16 // +----------------------------------------------------------------------+
17 // | Authors: Alexander Zhukov <alex@veresk.ru> Original port from Python |
18 // | Authors: Harry Fuecks <hfuecks@phppatterns.com> Port to PEAR + more |
19 // | Authors: Many @ Sitepointforums Advanced PHP Forums |
20 // +----------------------------------------------------------------------+
22 // $Id: HTMLSax3.php 3188 2012-07-12 12:13:23Z ctrlaltca $
25 * Main parser components
26 * @package System.Security.SafeHtml
32 require_once(dirname(__FILE__).'/HTMLSax3/States.php');
33 require_once(dirname(__FILE__).'/HTMLSax3/Decorators.php');
/**
 * Base state parser.
 *
 * Holds the document being parsed, the cursor into it, the user supplied
 * callbacks and the table of state objects, and drives the state machine.
 * scanUntilCharacters() and ignoreWhitespace() are empty here and are
 * implemented by the concrete subclasses.
 *
 * @package System.Security.SafeHtml
 */
class TSax3_StateParser {
    /**
     * Instance of user front end class to be passed to callbacks
     * @var TSax3
     */
    public $htmlsax;
    /**
     * User defined object for handling elements
     * @var object
     */
    public $handler_object_element;
    /**
     * User defined open tag handler method
     * @var string
     */
    public $handler_method_opening;
    /**
     * User defined close tag handler method
     * @var string
     */
    public $handler_method_closing;
    /**
     * User defined object for handling data in elements
     * @var object
     */
    public $handler_object_data;
    /**
     * User defined data handler method
     * @var string
     */
    public $handler_method_data;
    /**
     * User defined object for handling processing instructions
     * @var object
     */
    public $handler_object_pi;
    /**
     * User defined processing instruction handler method
     * @var string
     */
    public $handler_method_pi;
    /**
     * User defined object for handling JSP/ASP tags
     * @var object
     */
    public $handler_object_jasp;
    /**
     * User defined JSP/ASP handler method
     * @var string
     */
    public $handler_method_jasp;
    /**
     * User defined object for handling XML escapes
     * @var object
     */
    public $handler_object_escape;
    /**
     * User defined XML escape handler method
     * @var string
     */
    public $handler_method_escape;
    /**
     * User defined handler object or NullHandler
     * @var object
     */
    public $handler_default;
    /**
     * Parser options determining parsing behavior.
     *
     * Public because TSax3::set_option() reads and writes this array
     * through its state parser instance.
     * @var array
     */
    public $parser_options = array();
    /**
     * XML document being parsed
     * @var string
     */
    public $rawtext = '';
    /**
     * Position in XML document relative to start (0)
     * @var int
     */
    public $position = 0;
    /**
     * Length of the XML document in characters
     * @var int
     */
    public $length = 0;
    /**
     * Array of state objects, keyed by the TSAX3_STATE_* constants
     * @var array
     */
    protected $State = array();

    const TSAX3_STATE_STOP = 0;
    const TSAX3_STATE_START = 1;
    const TSAX3_STATE_TAG = 2;
    const TSAX3_STATE_OPENING_TAG = 3;
    const TSAX3_STATE_CLOSING_TAG = 4;
    const TSAX3_STATE_ESCAPE = 6;
    const TSAX3_STATE_JASP = 7;
    const TSAX3_STATE_PI = 8;

    /**
     * Constructs TSax3_StateParser setting up states
     * @param TSax3 $htmlsax instance of user front end class
     */
    protected function __construct($htmlsax) {
        $this->htmlsax = $htmlsax;
        $this->State[self::TSAX3_STATE_START] = new TSax3_StartingState();

        $this->State[self::TSAX3_STATE_CLOSING_TAG] = new TSax3_ClosingTagState();
        $this->State[self::TSAX3_STATE_TAG] = new TSax3_TagState();
        $this->State[self::TSAX3_STATE_OPENING_TAG] = new TSax3_OpeningTagState();

        $this->State[self::TSAX3_STATE_PI] = new TSax3_PiState();
        $this->State[self::TSAX3_STATE_JASP] = new TSax3_JaspState();
        $this->State[self::TSAX3_STATE_ESCAPE] = new TSax3_EscapeState();
    }

    /**
     * Moves the position back one character
     * @return void
     */
    function unscanCharacter() {
        $this->position -= 1;
    }

    /**
     * Moves the position forward one character
     * @return void
     */
    function ignoreCharacter() {
        $this->position += 1;
    }

    /**
     * Returns the next character from the XML document and advances the
     * position, or returns nothing (null) if at end
     * @return string|null
     */
    function scanCharacter() {
        if ($this->position < $this->length) {
            // Square brackets: the curly-brace offset syntax is rejected by PHP 8.
            return $this->rawtext[$this->position++];
        }
    }

    /**
     * Returns a string from the current position to the next occurrence
     * of the supplied string. The position is left at the start of the
     * match, or at the end of the document when there is no match.
     * @param string $string string to search until
     * @return string
     */
    function scanUntilString($string) {
        $start = $this->position;
        $this->position = strpos($this->rawtext, $string, $start);
        if ($this->position === FALSE) {
            $this->position = $this->length;
        }
        return substr($this->rawtext, $start, $this->position - $start);
    }

    /**
     * Returns a string from the current position until the first instance of
     * one of the characters in the supplied string argument.
     * Empty here; the concrete subclasses provide the implementation.
     * @param string $string string to search until
     * @return string
     */
    function scanUntilCharacters($string) {}

    /**
     * Moves the position forward past any whitespace characters.
     * Empty here; the concrete subclasses provide the implementation.
     * @return void
     */
    function ignoreWhitespace() {}

    /**
     * Begins the parsing operation, setting up any decorators, depending on
     * parse options, then invoking _parse() to execute parsing
     * @param string $data XML document to parse
     * @return void
     */
    function parse($data) {
        if ($this->optionEnabled('XML_OPTION_TRIM_DATA_NODES')) {
            $this->decorateDataHandler('TSax3_Trim', 'trimData');
        }
        if ($this->optionEnabled('XML_OPTION_CASE_FOLDING')) {
            $open_decor = new TSax3_CaseFolding(
                $this->handler_object_element,
                $this->handler_method_opening,
                $this->handler_method_closing);
            $this->handler_object_element =& $open_decor;
            $this->handler_method_opening ='foldOpen';
            $this->handler_method_closing ='foldClose';
        }
        if ($this->optionEnabled('XML_OPTION_LINEFEED_BREAK')) {
            $this->decorateDataHandler('TSax3_Linefeed', 'breakData');
        }
        if ($this->optionEnabled('XML_OPTION_TAB_BREAK')) {
            $this->decorateDataHandler('TSax3_Tab', 'breakData');
        }
        if ($this->optionEnabled('XML_OPTION_ENTITIES_UNPARSED')) {
            $this->decorateDataHandler('TSax3_Entities_Unparsed', 'breakData');
        }
        if ($this->optionEnabled('XML_OPTION_ENTITIES_PARSED')) {
            $this->decorateDataHandler('TSax3_Entities_Parsed', 'breakData');
        }
        if ($this->optionEnabled('XML_OPTION_STRIP_ESCAPES')) {
            // Dedicated local: reusing a variable that is reference-bound to
            // handler_object_data would overwrite the data handler as well.
            $stripper = new TSax3_Escape_Stripper(
                $this->handler_object_escape,
                $this->handler_method_escape);
            $this->handler_object_escape =& $stripper;
            $this->handler_method_escape = 'strip';
        }
        $this->rawtext = $data;
        $this->length = strlen($data);
        // NOTE(review): the two statements below were not visible in the
        // reviewed text; restored from the documented contract of parse().
        $this->position = 0;
        $this->_parse();
    }

    /**
     * Performs the parsing itself, delegating calls to a specific parser
     * state. Each state object's parse() returns the key of the next state;
     * the loop ends on TSAX3_STATE_STOP or when the end of the document is
     * reached.
     * @param int $state key of the state object to parse with
     * @return void
     */
    function _parse($state = self::TSAX3_STATE_START) {
        do {
            $state = $this->State[$state]->parse($this);
        } while ($state != self::TSAX3_STATE_STOP &&
                    $this->position < $this->length);
    }

    /**
     * Tells whether a parser option is present and switched on (== 1)
     * @param string $name option name
     * @return bool
     */
    private function optionEnabled($name) {
        return isset($this->parser_options[$name])
            && $this->parser_options[$name] == 1;
    }

    /**
     * Wraps the current data handler in a decorator and makes the decorator
     * the new data handler.
     * @param string $decoratorClass decorator class name
     * @param string $method decorator method to use as data handler
     * @return void
     */
    private function decorateDataHandler($decoratorClass, $method) {
        // A fresh local per call, so successive decorators never share a
        // reference slot.
        $decorator = new $decoratorClass(
            $this->handler_object_data,
            $this->handler_method_data);
        // Rebind (=&) instead of assigning: TSax3::set_data_handler() aliases
        // this property to handler_default, so a plain assignment would
        // write through to handler_default as well.
        $this->handler_object_data =& $decorator;
        $this->handler_method_data = $method;
    }
}
/**
 * Parser for PHP Versions below 4.3.0. Uses a slower, character by character
 * scanning mechanism than the equivalent PHP 4.3.0+ subclass of StateParser
 * @package System.Security.SafeHtml
 * @see TSax3_StateParser_Gtet430
 */
class TSax3_StateParser_Lt430 extends TSax3_StateParser {
    /**
     * Constructs TSax3_StateParser_Lt430 defining available
     * parser options, all switched off
     * @param TSax3 $htmlsax instance of user front end class
     */
    function __construct(& $htmlsax) {
        parent::__construct($htmlsax);
        $this->parser_options['XML_OPTION_TRIM_DATA_NODES'] = 0;
        $this->parser_options['XML_OPTION_CASE_FOLDING'] = 0;
        $this->parser_options['XML_OPTION_LINEFEED_BREAK'] = 0;
        $this->parser_options['XML_OPTION_TAB_BREAK'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_PARSED'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_UNPARSED'] = 0;
        $this->parser_options['XML_OPTION_STRIP_ESCAPES'] = 0;
    }

    /**
     * Returns a string from the current position until the first instance of
     * one of the characters in the supplied string argument
     * @param string $string characters to search until
     * @return string
     */
    function scanUntilCharacters($string) {
        $startpos = $this->position;
        // Square-bracket offsets: the curly-brace form is rejected by PHP 8.
        while ($this->position < $this->length && strpos($string, $this->rawtext[$this->position]) === FALSE) {
            // NOTE(review): loop body was not visible in the reviewed text;
            // advancing the cursor is the only step that lets the loop end.
            $this->position++;
        }
        return substr($this->rawtext, $startpos, $this->position - $startpos);
    }

    /**
     * Moves the position forward past any whitespace characters
     * (space, line feed, carriage return, tab)
     * @return void
     */
    function ignoreWhitespace() {
        while ($this->position < $this->length &&
            strpos(" \n\r\t", $this->rawtext[$this->position]) !== FALSE) {
            // NOTE(review): loop body was not visible in the reviewed text.
            $this->position++;
        }
    }

    /**
     * Begins the parsing operation by delegating to the parent
     * @param string $data XML document to parse
     * @return void
     */
    function parse($data) {
        parent::parse($data);
    }
}
/**
 * State parser for PHP 4.3.0 and later. Relies on strcspn()/strspn() with an
 * offset, which is faster than the character by character scanning done by
 * the PHP < 4.3.0 subclass of StateParser
 * @package System.Security.SafeHtml
 * @see TSax3_StateParser_Lt430
 */
class TSax3_StateParser_Gtet430 extends TSax3_StateParser {
    /**
     * Builds the parser and registers every available option as switched off
     * @param TSax3 $htmlsax instance of user front end class
     */
    function __construct(& $htmlsax) {
        parent::__construct($htmlsax);
        $switches = array(
            'XML_OPTION_TRIM_DATA_NODES',
            'XML_OPTION_CASE_FOLDING',
            'XML_OPTION_LINEFEED_BREAK',
            'XML_OPTION_TAB_BREAK',
            'XML_OPTION_ENTITIES_PARSED',
            'XML_OPTION_ENTITIES_UNPARSED',
            'XML_OPTION_STRIP_ESCAPES',
        );
        foreach ($switches as $switch) {
            $this->parser_options[$switch] = 0;
        }
    }

    /**
     * Consumes and returns the text between the current position and the
     * first occurrence of any character contained in the argument
     * @param string $string characters that end the scan
     * @return string
     */
    function scanUntilCharacters($string) {
        $from = $this->position;
        $span = strcspn($this->rawtext, $string, $from);
        $chunk = substr($this->rawtext, $from, $span);
        $this->position = $from + $span;
        return $chunk;
    }

    /**
     * Advances the position over a run of space, line feed, carriage return
     * and tab characters
     * @return void
     */
    function ignoreWhitespace() {
        $skipped = strspn($this->rawtext, " \n\r\t", $this->position);
        $this->position += $skipped;
    }

    /**
     * Starts parsing; all work is delegated to the parent implementation
     * @param string $data XML document to parse
     * @return void
     */
    function parse($data) {
        parent::parse($data);
    }
}
/**
 * Fallback handler used for every callback the user did not set
 * @package System.Security.SafeHtml
 */
class TSax3_NullHandler {
    /**
     * Catch-all callback: ignores whatever it is given and returns nothing
     * @return void
     */
    function DoNothing() {
        return;
    }
}
463 * User interface class. All user calls should only be made to this class
464 * @package System.Security.SafeHtml
/**
 * Concrete TSax3_StateParser subclass that performs the parsing
 * @var TSax3_StateParser
 */
private $state_parser;

/**
 * Constructs TSax3: picks the concrete StateParser subclass that matches
 * the running PHP version and installs a NullHandler for all callbacks<br />
 * <b>Example:</b>
 * <pre>
 * $myHandler = new MyHandler();
 * $parser = new TSax3();
 * $parser->set_object($myHandler);
 * $parser->set_option('XML_OPTION_CASE_FOLDING');
 * $parser->set_element_handler('myOpenHandler','myCloseHandler');
 * $parser->set_data_handler('myDataHandler');
 * $parser->parse($xml);
 * </pre>
 */
function __construct() {
    if (version_compare(PHP_VERSION, '4.3', '<')) {
        $this->state_parser = new TSax3_StateParser_Lt430($this);
    } else {
        $this->state_parser = new TSax3_StateParser_Gtet430($this);
    }

    // The default object must be in place before the handlers are wired,
    // because each set_*_handler() binds to the current default object.
    $fallback = new TSax3_NullHandler();
    $this->set_object($fallback);

    $noop = 'DoNothing';
    $this->set_element_handler($noop, $noop);
    $this->set_data_handler($noop);
    $this->set_pi_handler($noop);
    $this->set_jasp_handler($noop);
    $this->set_escape_handler($noop);
}
/**
 * Sets the user defined handler object. Returns a PEAR Error
 * if supplied argument is not an object.
 *
 * Handler methods registered afterwards with the set_*_handler() methods
 * are looked up on this object.
 * @param object $object handler object containing SAX callback methods
 * @return mixed true on success, otherwise the PEAR Error
 */
function set_object(&$object) {
    if ( is_object($object) ) {
        $this->state_parser->handler_default =& $object;
        // NOTE(review): the success return was not visible in the reviewed
        // text; true matches the documented "error on failure" contract.
        return true;
    }
    require_once('PEAR.php');
    // Hand the error back to the caller instead of discarding it.
    return PEAR::raiseError('TSax3::set_object requires '.
        'an object instance');
}
/**
 * Sets a parser option. By default all options are switched off.
 * Returns a PEAR Error if option is invalid<br />
 * <b>Available options:</b>
 * <ul>
 * <li>XML_OPTION_TRIM_DATA_NODES: trim whitespace off the beginning
 * and end of data passed to the data handler</li>
 * <li>XML_OPTION_LINEFEED_BREAK: linefeeds result in additional data
 * handler calls</li>
 * <li>XML_OPTION_TAB_BREAK: tabs result in additional data handler
 * calls</li>
 * <li>XML_OPTION_ENTITIES_UNPARSED: XML entities are returned as
 * separate data handler calls in unparsed form</li>
 * <li>XML_OPTION_ENTITIES_PARSED: (PHP 4.3.0+ only) XML entities are
 * returned as separate data handler calls and are parsed with
 * PHP's html_entity_decode() function</li>
 * <li>XML_OPTION_STRIP_ESCAPES: strips out the -- -- comment markers
 * or CDATA markup inside an XML escape, if found.</li>
 * </ul>
 * To get HTMLSax to behave in the same way as the native PHP SAX parser,
 * using its default state, you need to switch on XML_OPTION_LINEFEED_BREAK,
 * XML_OPTION_ENTITIES_PARSED and XML_OPTION_CASE_FOLDING
 * @param string $name name of parser option
 * @param int $value (optional) 1 to switch on, 0 for off
 * @return mixed true on success, otherwise the PEAR Error
 */
function set_option($name, $value=1) {
    // Only options registered by the state parser are accepted.
    if ( array_key_exists($name,$this->state_parser->parser_options) ) {
        $this->state_parser->parser_options[$name] = $value;
        // NOTE(review): the success return was not visible in the reviewed
        // text; true matches the documented "error on failure" contract.
        return true;
    }
    require_once('PEAR.php');
    // Hand the error back to the caller instead of discarding it.
    return PEAR::raiseError('TSax3::set_option('.$name.') illegal');
}
/**
 * Registers the method that receives the contents of XML elements.<br />
 * The method is looked up on the current default handler object and must
 * accept two arguments: the TSax3 instance and the element contents, e.g.
 * <pre>
 * function myDataHandler(& $parser,$data){}
 * </pre>
 * @param string $data_method name of method
 * @return void
 */
function set_data_handler($data_method) {
    $parser = $this->state_parser;
    $parser->handler_method_data = $data_method;
    // Bound by reference to the default handler slot, as for all handlers.
    $parser->handler_object_data =& $parser->handler_default;
}
/**
 * Registers the open and close tag handler methods, both looked up on the
 * current default handler object.
 * <br />The open handler must accept three arguments: the parser,
 * the tag name and an array of attributes, e.g.
 * <pre>
 * function myOpenHandler(& $parser,$tagname,$attrs=array()){}
 * </pre>
 * The close handler must accept two arguments: the parser and
 * the tag name, e.g.
 * <pre>
 * function myCloseHandler(& $parser,$tagname){}
 * </pre>
 * @param string $opening_method name of open method
 * @param string $closing_method name of close method
 * @return void
 */
function set_element_handler($opening_method, $closing_method) {
    $parser = $this->state_parser;
    $parser->handler_method_opening = $opening_method;
    $parser->handler_method_closing = $closing_method;
    // Bound by reference to the default handler slot, as for all handlers.
    $parser->handler_object_element =& $parser->handler_default;
}
/**
 * Registers the processing instruction handler method, e.g. for PHP open
 * and close tags<br />
 * The method is looked up on the current default handler object and must
 * accept three arguments: the parser, the PI target and the data inside
 * the PI
 * <pre>
 * function myPIHandler(& $parser,$target, $data){}
 * </pre>
 * @param string $pi_method name of method
 * @return void
 */
function set_pi_handler($pi_method) {
    $parser = $this->state_parser;
    $parser->handler_method_pi = $pi_method;
    // Bound by reference to the default handler slot, as for all handlers.
    $parser->handler_object_pi =& $parser->handler_default;
}
/**
 * Registers the XML escape handler method, e.g. for comments and doctype
 * declarations<br />
 * The method is looked up on the current default handler object and must
 * accept two arguments: the parser and the contents of the escaped section
 * <pre>
 * function myEscapeHandler(& $parser, $data){}
 * </pre>
 * @param string $escape_method name of method
 * @return void
 */
function set_escape_handler($escape_method) {
    $parser = $this->state_parser;
    $parser->handler_method_escape = $escape_method;
    // Bound by reference to the default handler slot, as for all handlers.
    $parser->handler_object_escape =& $parser->handler_default;
}
/**
 * Registers the JSP/ASP markup handler method<br />
 * The method is looked up on the current default handler object and must
 * accept two arguments: the parser and the body of the JASP tag
 * <pre>
 * function myJaspHandler(& $parser, $data){}
 * </pre>
 * @param string $jasp_method name of method
 * @return void
 */
function set_jasp_handler ($jasp_method) {
    $parser = $this->state_parser;
    $parser->handler_method_jasp = $jasp_method;
    // Bound by reference to the default handler slot, as for all handlers.
    $parser->handler_object_jasp =& $parser->handler_default;
}
/**
 * Reports where the "cursor" currently is inside the XML document, as an
 * offset held by the state parser.
 * <br />Intended for use from within a user defined handler called
 * via the $parser reference e.g.
 * <pre>
 * function myDataHandler(& $parser,$data) {
 *     echo( 'Current position: '.$parser->get_current_position() );
 * }
 * </pre>
 * @return int
 */
function get_current_position() {
    $cursor = $this->state_parser->position;
    return $cursor;
}
/**
 * Reports the string length of the XML document being parsed, as recorded
 * by the state parser
 * @return int
 */
function get_length() {
    $documentLength = $this->state_parser->length;
    return $documentLength;
}
/**
 * Parses an XML document by handing it to the state parser
 * @param string $data XML document
 * @return void
 */
function parse($data) {
    $parser = $this->state_parser;
    $parser->parse($data);
}