<?php namespace Sieve;

include_once('SieveToken.php');

/**
 * Lexer for Sieve scripts.
 *
 * Splits a script into a flat list of SieveToken objects and hands them out
 * one at a time. Comment and whitespace tokens are kept in the list but are
 * skipped by peekNextToken() / nextToken(); nextToken() can forward them to
 * an optional passthrough callback.
 */
class SieveScanner
{
    /**
     * Creates a scanner and, unless $script is null, tokenizes it right away.
     *
     * @param string|null $script Script text. Taken by reference; it is only
     *                            read, never modified, by this class.
     */
    public function __construct(&$script)
    {
        if ($script === null)
            return;

        $this->tokenize($script);
    }

    /**
     * Sets the callback that nextToken() invokes for every comment or
     * whitespace token it skips.
     *
     * The value is stored only when it loosely equals null (which disables
     * the passthrough) or is callable; any other value is silently ignored
     * and the previously stored callback stays in place.
     *
     * @param callable|null $callback Receives the skipped SieveToken.
     */
    public function setPassthroughFunc($callback)
    {
        if ($callback == null || is_callable($callback))
            $this->ptFn_ = $callback;
    }

    /**
     * Tokenizes $script and appends the resulting tokens to the token list.
     *
     * Scanning stops early, without appending a ScriptEnd token, as soon as
     * an Unknown token is produced or nothing matches at the current
     * position. Otherwise a ScriptEnd token is appended after the last match.
     * The token list is not reset first, so repeated calls accumulate tokens.
     *
     * @param string $script Script text. Taken by reference; only read.
     */
    public function tokenize(&$script)
    {
        $pos = 0;
        $line = 1;

        $scriptLength = mb_strlen($script);
        $unprocessedScript = $script;

        // Build one combined regex with a named group per token type, so a
        // single preg_match() call decides which token comes next instead of
        // trying every token pattern in turn.
        $nameToType = [];
        $alternatives = [];
        // chr(65) == 'A'
        $code = 65;

        foreach ($this->tokenMatch_ as $tokenType => $subregex) {
            $groupName = chr($code++);
            $nameToType[$groupName] = $tokenType;
            $alternatives[] = '(?P<' . $groupName . '>^' . $subregex . ')';
        }

        $regex = '/' . join('|', $alternatives) . '/';

        while ($pos < $scriptLength) {
            $type = null;
            $currentMatch = '';

            if (preg_match($regex, $unprocessedScript, $match)) {
                // Exactly one alternative matched; its named group is the only
                // one holding a non-empty string. Compare against '' rather
                // than testing truthiness: the text "0" is a valid Number
                // token but is falsy in PHP, and treating it as "no match"
                // would leave the scanner stuck on the same position.
                foreach ($nameToType as $groupName => $candidateType) {
                    if (isset($match[$groupName]) && $match[$groupName] !== '') {
                        $type = $candidateType;
                        $currentMatch = $match[$groupName];
                        break;
                    }
                }
            }

            if ($currentMatch === '') {
                // Nothing matched (or the regex engine failed): report an
                // Unknown token and stop, which also guarantees the loop can
                // never spin without consuming input.
                $this->tokens_[] = new SieveToken(SieveToken::Unknown, '', $line);
                return;
            }

            $this->tokens_[] = new SieveToken($type, $currentMatch, $line);

            if ($type == SieveToken::Unknown)
                return;

            // Drop the consumed prefix and keep matching against the
            // remainder; the '^' anchors in the combined regex require the
            // next token to start at the beginning of the subject string.
            $matchLength = mb_strlen($currentMatch);
            $unprocessedScript = mb_substr($unprocessedScript, $matchLength);

            $pos += $matchLength;
            $line += mb_substr_count($currentMatch, "\n");
        }

        $this->tokens_[] = new SieveToken(SieveToken::ScriptEnd, '', $line);
    }

    /**
     * Tells whether the next significant token matches $type.
     *
     * @param mixed $type Value passed straight to SieveToken::is().
     * @return mixed Whatever SieveToken::is() returns for that token.
     */
    public function nextTokenIs($type)
    {
        return $this->peekNextToken()->is($type);
    }

    /**
     * Returns the next token that is neither a comment nor whitespace,
     * without advancing the read position.
     *
     * No bounds check is performed on the token list.
     *
     * @return SieveToken
     */
    public function peekNextToken()
    {
        $offset = 0;
        do {
            $next = $this->tokens_[$this->tokenPos_ + $offset++];
        } while ($next->is(SieveToken::Comment|SieveToken::Whitespace));

        return $next;
    }

    /**
     * Consumes and returns the next token that is neither a comment nor
     * whitespace.
     *
     * Every comment/whitespace token skipped on the way is passed to the
     * passthrough callback, if one is set. No bounds check is performed on
     * the token list.
     *
     * @return SieveToken
     */
    public function nextToken()
    {
        $token = $this->tokens_[$this->tokenPos_++];

        while ($token->is(SieveToken::Comment|SieveToken::Whitespace)) {
            if ($this->ptFn_ != null)
                call_user_func($this->ptFn_, $token);

            $token = $this->tokens_[$this->tokenPos_++];
        }

        return $token;
    }

    // Callback invoked by nextToken() for skipped tokens, or null for none.
    protected $ptFn_ = null;

    // Index into $tokens_ of the next token nextToken() will read.
    protected $tokenPos_ = 0;

    // Tokens produced by tokenize(), in script order.
    protected $tokens_ = array();

    // Token type => regex fragment. Order matters: tokenize() joins these
    // into one alternation, so earlier entries take precedence and the
    // catch-all Unknown pattern must stay last.
    protected $tokenMatch_ = array (
        SieveToken::LeftBracket       =>  '\[',
        SieveToken::RightBracket      =>  '\]',
        SieveToken::BlockStart        =>  '\{',
        SieveToken::BlockEnd          =>  '\}',
        SieveToken::LeftParenthesis   =>  '\(',
        SieveToken::RightParenthesis  =>  '\)',
        SieveToken::Comma             =>  ',',
        SieveToken::Semicolon         =>  ';',
        SieveToken::Whitespace        =>  '[ \r\n\t]+',
        SieveToken::Tag               =>  ':[[:alpha:]_][[:alnum:]_]*(?=\b)',
        /*
        "                           # match a quotation mark
        (                           # start matching parts that include an escaped quotation mark
        ([^"]*[^"\\\\])             # match a string without quotation marks and not ending with a backslash
        ?                           # this also includes the empty string
        (\\\\\\\\)*                 # match any groups of even number of backslashes
                                    # (thus the character after these groups are not escaped)
        \\\\"                       # match an escaped quotation mark
        )*                          # accept any number of strings that end with an escaped quotation mark
        [^"]*                       # accept any trailing part that does not contain any quotation marks
        "                           # end of the quoted string
        */
        SieveToken::QuotedString      =>  '"(([^"]*[^"\\\\])?(\\\\\\\\)*\\\\")*[^"]*"',
        SieveToken::Number            =>  '[[:digit:]]+(?:[KMG])?(?=\b)',
        SieveToken::Comment           =>  '(?:\/\*(?:[^\*]|\*(?=[^\/]))*\*\/|#[^\r\n]*\r?(\n|$))',
        SieveToken::MultilineString   =>  'text:[ \t]*(?:#[^\r\n]*)?\r?\n(\.[^\r\n]+\r?\n|[^\.][^\r\n]*\r?\n)*\.\r?(\n|$)',
        SieveToken::Identifier        =>  '[[:alpha:]_][[:alnum:]_]*(?=\b)',
        SieveToken::Unknown           =>  '[^ \r\n\t]+'
    );
}