350 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			350 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| /**
 | |
|  * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 | |
|  * For an intro to the Lexer see:
 | |
|  * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 | |
|  *
 | |
|  * @author Marcus Baker http://www.lastcraft.com
 | |
|  */
 | |
| 
 | |
| namespace dokuwiki\Parsing\Lexer;
 | |
| 
 | |
| /**
 | |
|  * Accepts text and breaks it into tokens.
 | |
|  *
 | |
|  * Some optimisation to make the sure the content is only scanned by the PHP regex
 | |
|  * parser once. Lexer modes must not start with leading underscores.
 | |
|  */
 | |
| class Lexer
 | |
| {
 | |
|     /** @var ParallelRegex[] */
 | |
|     protected $regexes;
 | |
|     /** @var \Doku_Handler */
 | |
|     protected $handler;
 | |
|     /** @var StateStack */
 | |
|     protected $modeStack;
 | |
|     /** @var array mode "rewrites" */
 | |
|     protected $mode_handlers;
 | |
|     /** @var bool case sensitive? */
 | |
|     protected $case;
 | |
| 
 | |
|     /**
 | |
|      * Sets up the lexer in case insensitive matching by default.
 | |
|      *
 | |
|      * @param \Doku_Handler $handler  Handling strategy by reference.
 | |
|      * @param string $start            Starting handler.
 | |
|      * @param boolean $case            True for case sensitive.
 | |
|      */
 | |
|     public function __construct($handler, $start = "accept", $case = false)
 | |
|     {
 | |
|         $this->case = $case;
 | |
|         $this->regexes = array();
 | |
|         $this->handler = $handler;
 | |
|         $this->modeStack = new StateStack($start);
 | |
|         $this->mode_handlers = array();
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Adds a token search pattern for a particular parsing mode.
 | |
|      *
 | |
|      * The pattern does not change the current mode.
 | |
|      *
 | |
|      * @param string $pattern      Perl style regex, but ( and )
 | |
|      *                             lose the usual meaning.
 | |
|      * @param string $mode         Should only apply this
 | |
|      *                             pattern when dealing with
 | |
|      *                             this type of input.
 | |
|      */
 | |
|     public function addPattern($pattern, $mode = "accept")
 | |
|     {
 | |
|         if (! isset($this->regexes[$mode])) {
 | |
|             $this->regexes[$mode] = new ParallelRegex($this->case);
 | |
|         }
 | |
|         $this->regexes[$mode]->addPattern($pattern);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Adds a pattern that will enter a new parsing mode.
 | |
|      *
 | |
|      * Useful for entering parenthesis, strings, tags, etc.
 | |
|      *
 | |
|      * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 | |
|      * @param string $mode         Should only apply this pattern when dealing with this type of input.
 | |
|      * @param string $new_mode     Change parsing to this new nested mode.
 | |
|      */
 | |
|     public function addEntryPattern($pattern, $mode, $new_mode)
 | |
|     {
 | |
|         if (! isset($this->regexes[$mode])) {
 | |
|             $this->regexes[$mode] = new ParallelRegex($this->case);
 | |
|         }
 | |
|         $this->regexes[$mode]->addPattern($pattern, $new_mode);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Adds a pattern that will exit the current mode and re-enter the previous one.
 | |
|      *
 | |
|      * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 | |
|      * @param string $mode         Mode to leave.
 | |
|      */
 | |
|     public function addExitPattern($pattern, $mode)
 | |
|     {
 | |
|         if (! isset($this->regexes[$mode])) {
 | |
|             $this->regexes[$mode] = new ParallelRegex($this->case);
 | |
|         }
 | |
|         $this->regexes[$mode]->addPattern($pattern, "__exit");
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Adds a pattern that has a special mode.
 | |
|      *
 | |
|      * Acts as an entry and exit pattern in one go, effectively calling a special
 | |
|      * parser handler for this token only.
 | |
|      *
 | |
|      * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
 | |
|      * @param string $mode         Should only apply this pattern when dealing with this type of input.
 | |
|      * @param string $special      Use this mode for this one token.
 | |
|      */
 | |
|     public function addSpecialPattern($pattern, $mode, $special)
 | |
|     {
 | |
|         if (! isset($this->regexes[$mode])) {
 | |
|             $this->regexes[$mode] = new ParallelRegex($this->case);
 | |
|         }
 | |
|         $this->regexes[$mode]->addPattern($pattern, "_$special");
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Adds a mapping from a mode to another handler.
 | |
|      *
 | |
|      * @param string $mode        Mode to be remapped.
 | |
|      * @param string $handler     New target handler.
 | |
|      */
 | |
|     public function mapHandler($mode, $handler)
 | |
|     {
 | |
|         $this->mode_handlers[$mode] = $handler;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Splits the page text into tokens.
 | |
|      *
 | |
|      * Will fail if the handlers report an error or if no content is consumed. If successful then each
 | |
|      * unparsed and parsed token invokes a call to the held listener.
 | |
|      *
 | |
|      * @param string $raw        Raw HTML text.
 | |
|      * @return boolean           True on success, else false.
 | |
|      */
 | |
|     public function parse($raw)
 | |
|     {
 | |
|         if (! isset($this->handler)) {
 | |
|             return false;
 | |
|         }
 | |
|         $initialLength = strlen($raw);
 | |
|         $length = $initialLength;
 | |
|         $pos = 0;
 | |
|         while (is_array($parsed = $this->reduce($raw))) {
 | |
|             list($unmatched, $matched, $mode) = $parsed;
 | |
|             $currentLength = strlen($raw);
 | |
|             $matchPos = $initialLength - $currentLength - strlen($matched);
 | |
|             if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
 | |
|                 return false;
 | |
|             }
 | |
|             if ($currentLength == $length) {
 | |
|                 return false;
 | |
|             }
 | |
|             $length = $currentLength;
 | |
|             $pos = $initialLength - $currentLength;
 | |
|         }
 | |
|         if (!$parsed) {
 | |
|             return false;
 | |
|         }
 | |
|         return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Gives plugins access to the mode stack
 | |
|      *
 | |
|      * @return StateStack
 | |
|      */
 | |
|     public function getModeStack()
 | |
|     {
 | |
|         return $this->modeStack;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Sends the matched token and any leading unmatched
 | |
|      * text to the parser changing the lexer to a new
 | |
|      * mode if one is listed.
 | |
|      *
 | |
|      * @param string $unmatched Unmatched leading portion.
 | |
|      * @param string $matched Actual token match.
 | |
|      * @param bool|string $mode Mode after match. A boolean false mode causes no change.
 | |
|      * @param int $initialPos
 | |
|      * @param int $matchPos Current byte index location in raw doc thats being parsed
 | |
|      * @return boolean             False if there was any error from the parser.
 | |
|      */
 | |
|     protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
 | |
|     {
 | |
|         if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
 | |
|             return false;
 | |
|         }
 | |
|         if ($this->isModeEnd($mode)) {
 | |
|             if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
 | |
|                 return false;
 | |
|             }
 | |
|             return $this->modeStack->leave();
 | |
|         }
 | |
|         if ($this->isSpecialMode($mode)) {
 | |
|             $this->modeStack->enter($this->decodeSpecial($mode));
 | |
|             if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
 | |
|                 return false;
 | |
|             }
 | |
|             return $this->modeStack->leave();
 | |
|         }
 | |
|         if (is_string($mode)) {
 | |
|             $this->modeStack->enter($mode);
 | |
|             return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
 | |
|         }
 | |
|         return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
 | |
|      * mode stack.
 | |
|      *
 | |
|      * @param string $mode    Mode to test.
 | |
|      * @return boolean        True if this is the exit mode.
 | |
|      */
 | |
|     protected function isModeEnd($mode)
 | |
|     {
 | |
|         return ($mode === "__exit");
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Test to see if the mode is one where this mode is entered for this token only and automatically
 | |
|      * leaves immediately afterwoods.
 | |
|      *
 | |
|      * @param string $mode    Mode to test.
 | |
|      * @return boolean        True if this is the exit mode.
 | |
|      */
 | |
|     protected function isSpecialMode($mode)
 | |
|     {
 | |
|         return (strncmp($mode, "_", 1) == 0);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Strips the magic underscore marking single token modes.
 | |
|      *
 | |
|      * @param string $mode    Mode to decode.
 | |
|      * @return string         Underlying mode name.
 | |
|      */
 | |
|     protected function decodeSpecial($mode)
 | |
|     {
 | |
|         return substr($mode, 1);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Calls the parser method named after the current mode.
 | |
|      *
 | |
|      * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
 | |
|      *
 | |
|      * @param string $content Text parsed.
 | |
|      * @param boolean $is_match Token is recognised rather
 | |
|      *                               than unparsed data.
 | |
|      * @param int $pos Current byte index location in raw doc
 | |
|      *                             thats being parsed
 | |
|      * @return bool
 | |
|      */
 | |
|     protected function invokeHandler($content, $is_match, $pos)
 | |
|     {
 | |
|         if (($content === "") || ($content === false)) {
 | |
|             return true;
 | |
|         }
 | |
|         $handler = $this->modeStack->getCurrent();
 | |
|         if (isset($this->mode_handlers[$handler])) {
 | |
|             $handler = $this->mode_handlers[$handler];
 | |
|         }
 | |
| 
 | |
|         // modes starting with plugin_ are all handled by the same
 | |
|         // handler but with an additional parameter
 | |
|         if (substr($handler, 0, 7)=='plugin_') {
 | |
|             list($handler,$plugin) = explode('_', $handler, 2);
 | |
|             return $this->handler->$handler($content, $is_match, $pos, $plugin);
 | |
|         }
 | |
| 
 | |
|         return $this->handler->$handler($content, $is_match, $pos);
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
 | |
|      * unparsed data. Empty strings will not be matched.
 | |
|      *
 | |
|      * @param string $raw         The subject to parse. This is the content that will be eaten.
 | |
|      * @return array|bool         Three item list of unparsed content followed by the
 | |
|      *                            recognised token and finally the action the parser is to take.
 | |
|      *                            True if no match, false if there is a parsing error.
 | |
|      */
 | |
|     protected function reduce(&$raw)
 | |
|     {
 | |
|         if (! isset($this->regexes[$this->modeStack->getCurrent()])) {
 | |
|             return false;
 | |
|         }
 | |
|         if ($raw === "") {
 | |
|             return true;
 | |
|         }
 | |
|         if ($action = $this->regexes[$this->modeStack->getCurrent()]->split($raw, $split)) {
 | |
|             list($unparsed, $match, $raw) = $split;
 | |
|             return array($unparsed, $match, $action);
 | |
|         }
 | |
|         return true;
 | |
|     }
 | |
| 
 | |
|     /**
 | |
|      * Escapes regex characters other than (, ) and /
 | |
|      *
 | |
|      * @param string $str
 | |
|      * @return string
 | |
|      */
 | |
|     public static function escape($str)
 | |
|     {
 | |
|         $chars = array(
 | |
|             '/\\\\/',
 | |
|             '/\./',
 | |
|             '/\+/',
 | |
|             '/\*/',
 | |
|             '/\?/',
 | |
|             '/\[/',
 | |
|             '/\^/',
 | |
|             '/\]/',
 | |
|             '/\$/',
 | |
|             '/\{/',
 | |
|             '/\}/',
 | |
|             '/\=/',
 | |
|             '/\!/',
 | |
|             '/\</',
 | |
|             '/\>/',
 | |
|             '/\|/',
 | |
|             '/\:/'
 | |
|         );
 | |
| 
 | |
|         $escaped = array(
 | |
|             '\\\\\\\\',
 | |
|             '\.',
 | |
|             '\+',
 | |
|             '\*',
 | |
|             '\?',
 | |
|             '\[',
 | |
|             '\^',
 | |
|             '\]',
 | |
|             '\$',
 | |
|             '\{',
 | |
|             '\}',
 | |
|             '\=',
 | |
|             '\!',
 | |
|             '\<',
 | |
|             '\>',
 | |
|             '\|',
 | |
|             '\:'
 | |
|         );
 | |
|         return preg_replace($chars, $escaped, $str);
 | |
|     }
 | |
| }
 |