From 3d15ac54e36481490cb85477bee9d3abfe8b8f43 Mon Sep 17 00:00:00 2001 From: Sven Wappler Date: Mon, 1 Jan 2018 05:44:09 +0100 Subject: [PATCH] Improvement of bot/spider detection --- Classes/DeviceDetector/DeviceDetector.php | 263 +++ Classes/DeviceDetector/Parser/Bot.php | 72 + .../DeviceDetector/Parser/ParserAbstract.php | 269 +++ Classes/DeviceDetector/Yaml/Inline.php | 567 ++++++ .../DeviceDetector/Yaml/ParseException.php | 141 ++ Classes/DeviceDetector/Yaml/Parser.php | 777 ++++++++ Classes/DeviceDetector/Yaml/Unescaper.php | 141 ++ Classes/Helper.php | 13 +- Configuration/YAML/bots.yml | 1572 +++++++++++++++++ 9 files changed, 3812 insertions(+), 3 deletions(-) create mode 100644 Classes/DeviceDetector/DeviceDetector.php create mode 100644 Classes/DeviceDetector/Parser/Bot.php create mode 100644 Classes/DeviceDetector/Parser/ParserAbstract.php create mode 100644 Classes/DeviceDetector/Yaml/Inline.php create mode 100644 Classes/DeviceDetector/Yaml/ParseException.php create mode 100644 Classes/DeviceDetector/Yaml/Parser.php create mode 100644 Classes/DeviceDetector/Yaml/Unescaper.php create mode 100644 Configuration/YAML/bots.yml diff --git a/Classes/DeviceDetector/DeviceDetector.php b/Classes/DeviceDetector/DeviceDetector.php new file mode 100644 index 0000000..d76b473 --- /dev/null +++ b/Classes/DeviceDetector/DeviceDetector.php @@ -0,0 +1,263 @@ +setUserAgent($userAgent); + } + + } + + /** + * Sets the useragent to be parsed + * + * @param string $userAgent + */ + public function setUserAgent($userAgent) + { + if ($this->userAgent != $userAgent) { + $this->reset(); + } + $this->userAgent = $userAgent; + } + + protected function reset() + { + $this->bot = null; + $this->parsed = false; + } + + + + /** + * Sets whether to discard additional bot information + * If information is discarded it's only possible check whether UA was detected as bot or not. 
+ * (Discarding information speeds up the detection a bit) + * + * @param bool $discard + */ + public function discardBotInformation($discard = true) + { + $this->discardBotInformation = $discard; + } + + /** + * Sets whether to skip bot detection. + * This is needed if we want bots to be processed as simple clients, so we can detect whether it is a mobile client, + * a desktop, or anything else. By default none of this information is retrieved for bots. + * + * @param bool $skip + */ + public function skipBotDetection($skip = true) + { + $this->skipBotDetection = $skip; + } + + /** + * Returns if the parsed UA was identified as a Bot + * + * @see bots.yml for a list of detected bots + * + * @return bool + */ + public function isBot() + { + return !empty($this->bot); + } + + + /** + * Returns the user agent that is set to be parsed + * + * @return string + */ + public function getUserAgent() + { + return $this->userAgent; + } + + /** + * Returns the bot extracted from the parsed UA + * + * @return array + */ + public function getBot() + { + return $this->bot; + } + + /** + * Returns true, if userAgent was already parsed with parse() + * + * @return bool + */ + public function isParsed() + { + return $this->parsed; + } + + /** + * Triggers the parsing of the current user agent + * @throws \Exception + */ + public function parse() + { + if ($this->isParsed()) { + return; + } + + $this->parsed = true; + + // skip parsing for empty useragents or those not containing any letter + if (empty($this->userAgent) || !preg_match('/([a-z])/i', $this->userAgent)) { + return; + } + + $this->parseBot(); + if ($this->isBot()) { + return; + } + + } + + /** + * Parses the UA for bot information using the Bot parser + * @throws \Exception + * @return void + */ + protected function parseBot() + { + if ($this->skipBotDetection) { + $this->bot = false; + return; + } + + $botParser = new Bot(); + $botParser->setUserAgent($this->getUserAgent()); + 
$botParser->setYamlParser($this->getYamlParser()); + if ($this->discardBotInformation) { + $botParser->discardDetails(); + } + $this->bot = $botParser->parse(); + + } + + + + protected function matchUserAgent($regex) + { + $regex = '/(?:^|[^A-Z_-])(?:' . str_replace('/', '\/', $regex) . ')/i'; + + if (preg_match($regex, $this->userAgent, $matches)) { + return $matches; + } + + return false; + } + + + + /** + * Sets the Yaml Parser class + * + * @param YamlParser + * @throws \Exception + */ + public function setYamlParser($yamlParser) + { + if ($yamlParser instanceof YamlParser) { + $this->yamlParser = $yamlParser; + return; + } + + throw new \Exception('Yaml Parser not supported'); + } + + /** + * Returns Yaml Parser object + * + * @return YamlParser + */ + public function getYamlParser() + { + if (!empty($this->yamlParser)) { + return $this->yamlParser; + } + + return new Spyc(); + } +} diff --git a/Classes/DeviceDetector/Parser/Bot.php b/Classes/DeviceDetector/Parser/Bot.php new file mode 100644 index 0000000..a2da5b6 --- /dev/null +++ b/Classes/DeviceDetector/Parser/Bot.php @@ -0,0 +1,72 @@ +discardDetails = true; + } + + /** + * Parses the current UA and checks whether it contains bot information + * + * @see bots.yml for list of detected bots + * + * Step 1: Build a big regex containing all regexes and match UA against it + * -> If no matches found: return + * -> Otherwise: + * Step 2: Walk through the list of regexes in bots.yml and try to match every one + * -> Return the matched data + * + * If $discardDetails is set to TRUE, the Step 2 will be skipped + * $bot will be set to TRUE instead + * + * NOTE: Doing the big match before matching every single regex speeds up the detection + */ + public function parse() + { + $result = null; + + if ($this->preMatchOverall()) { + foreach ($this->getRegexes() as $regex) { + $matches = $this->matchUserAgent($regex['regex']); + + if ($matches) { + if ($this->discardDetails) { + $result = true; + break; + } + + 
unset($regex['regex']); + $result = $regex; + break; + } + } + } + + return $result; + } +} diff --git a/Classes/DeviceDetector/Parser/ParserAbstract.php b/Classes/DeviceDetector/Parser/ParserAbstract.php new file mode 100644 index 0000000..8fa98b5 --- /dev/null +++ b/Classes/DeviceDetector/Parser/ParserAbstract.php @@ -0,0 +1,269 @@ +setUserAgent($ua); + } + + /** + * Set how DeviceDetector should return versions + * @param int|null $type Any of the VERSION_TRUNCATION_* constants + */ + public static function setVersionTruncation($type) + { + if (in_array($type, array(self::VERSION_TRUNCATION_BUILD, + self::VERSION_TRUNCATION_NONE, + self::VERSION_TRUNCATION_MAJOR, + self::VERSION_TRUNCATION_MINOR, + self::VERSION_TRUNCATION_PATCH))) { + self::$maxMinorParts = $type; + } + } + + /** + * Sets the user agent to parse + * + * @param string $ua user agent + */ + public function setUserAgent($ua) + { + $this->userAgent = $ua; + } + + /** + * Returns the internal name of the parser + * + * @return string + */ + public function getName() + { + return $this->parserName; + } + + /** + * Returns the result of the parsed yml file defined in $fixtureFile + * + * @return array + */ + protected function getRegexes() + { + if (empty($this->regexList)) { + $this->regexList = $this->getYamlParser()->parse( + file_get_contents(GeneralUtility::getFileAbsFileName('EXT:abtest2/Configuration/YAML/'.$this->fixtureFile)) + ); + } + return $this->regexList; + } + + /** + * @return string + */ + protected function getRegexesDirectory() + { + return dirname(__DIR__); + } + + /** + * Matches the useragent against the given regex + * + * @param $regex + * @return array|bool + */ + protected function matchUserAgent($regex) + { + // only match if useragent begins with given regex or there is no letter before it + $regex = '/(?:^|[^A-Z0-9\-_]|[^A-Z0-9\-]_|sprd-)(?:' . str_replace('/', '\/', $regex) . 
')/i'; + + if (preg_match($regex, $this->userAgent, $matches)) { + return $matches; + } + + return false; + } + + /** + * @param string $item + * @param array $matches + * @return string + */ + protected function buildByMatch($item, $matches) + { + for ($nb=1;$nb<=3;$nb++) { + if (strpos($item, '$' . $nb) === false) { + continue; + } + + $replace = isset($matches[$nb]) ? $matches[$nb] : ''; + $item = trim(str_replace('$' . $nb, $replace, $item)); + } + return $item; + } + + /** + * Builds the version with the given $versionString and $matches + * + * Example: + * $versionString = 'v$1' + * $matches = array('version_1_0_1', '1_0_1') + * return value would be v1.0.1 + * + * @param $versionString + * @param $matches + * @return mixed|string + */ + protected function buildVersion($versionString, $matches) + { + $versionString = $this->buildByMatch($versionString, $matches); + $versionString = str_replace('_', '.', $versionString); + if (null !== self::$maxMinorParts && substr_count($versionString, '.') > self::$maxMinorParts) { + $versionParts = explode('.', $versionString); + $versionParts = array_slice($versionParts, 0, 1+self::$maxMinorParts); + $versionString = implode('.', $versionParts); + } + return trim($versionString, ' .'); + } + + /** + * Tests the useragent against a combination of all regexes + * + * All regexes returned by getRegexes() will be reversed and concatenated with '|' + * Afterwards the big regex will be tested against the user agent + * + * Method can be used to speed up detections by making a big check before doing checks for every single regex + * + * @return array|bool + */ + protected function preMatchOverall() + { + $regexes = $this->getRegexes(); + + static $overAllMatch; + + if (empty($overAllMatch)) { + // reverse all regexes, so we have the generic one first, which already matches most patterns + $overAllMatch = array_reduce(array_reverse($regexes), function ($val1, $val2) { + if (!empty($val1)) { + return $val1.'|'.$val2['regex']; + } else 
{ + return $val2['regex']; + } + }); + } + + return $this->matchUserAgent($overAllMatch); + } + + + + /** + * Sets the YamlParser class + * + * @param Parser + */ + public function setYamlParser($yamlParser) + { + $this->yamlParser = $yamlParser; + } + + /** + * Returns Parser object + * + * @return Parser + */ + public function getYamlParser() + { + return $this->yamlParser; + } +} diff --git a/Classes/DeviceDetector/Yaml/Inline.php b/Classes/DeviceDetector/Yaml/Inline.php new file mode 100644 index 0000000..988ad3b --- /dev/null +++ b/Classes/DeviceDetector/Yaml/Inline.php @@ -0,0 +1,567 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace WapplerSystems\ABTest2\DeviceDetector\Yaml; + +use Symfony\Component\Yaml\Exception\DumpException; + +/** + * Inline implements a YAML parser/dumper for the YAML inline syntax. + * + * @author Fabien Potencier + */ +class Inline +{ + const REGEX_QUOTED_STRING = '(?:"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'([^\']*(?:\'\'[^\']*)*)\')'; + + private static $exceptionOnInvalidType = false; + private static $objectSupport = false; + private static $objectForMap = false; + + /** + * Converts a YAML string to a PHP array. 
+ * + * @param string $value A YAML string + * @param bool $exceptionOnInvalidType true if an exception must be thrown on invalid types (a PHP resource or object), false otherwise + * @param bool $objectSupport true if object support is enabled, false otherwise + * @param bool $objectForMap true if maps should return a stdClass instead of array() + * @param array $references Mapping of variable names to values + * + * @return array A PHP array representing the YAML string + * + * @throws ParseException + */ + public static function parse($value, $exceptionOnInvalidType = false, $objectSupport = false, $objectForMap = false, $references = array()) + { + self::$exceptionOnInvalidType = $exceptionOnInvalidType; + self::$objectSupport = $objectSupport; + self::$objectForMap = $objectForMap; + + $value = trim($value); + + if ('' === $value) { + return ''; + } + + if (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) { + $mbEncoding = mb_internal_encoding(); + mb_internal_encoding('ASCII'); + } + + $i = 0; + switch ($value[0]) { + case '[': + $result = self::parseSequence($value, $i, $references); + ++$i; + break; + case '{': + $result = self::parseMapping($value, $i, $references); + ++$i; + break; + default: + $result = self::parseScalar($value, null, array('"', "'"), $i, true, $references); + } + + // some comments are allowed at the end + if (preg_replace('/\s+#.*$/A', '', substr($value, $i))) { + throw new ParseException(sprintf('Unexpected characters near "%s".', substr($value, $i))); + } + + if (isset($mbEncoding)) { + mb_internal_encoding($mbEncoding); + } + + return $result; + } + + /** + * Dumps a given PHP variable to a YAML string. 
+ * + * @param mixed $value The PHP variable to convert + * @param bool $exceptionOnInvalidType true if an exception must be thrown on invalid types (a PHP resource or object), false otherwise + * @param bool $objectSupport true if object support is enabled, false otherwise + * + * @return string The YAML string representing the PHP array + * + * @throws DumpException When trying to dump PHP resource + */ + public static function dump($value, $exceptionOnInvalidType = false, $objectSupport = false) + { + switch (true) { + case is_resource($value): + if ($exceptionOnInvalidType) { + throw new DumpException(sprintf('Unable to dump PHP resources in a YAML file ("%s").', get_resource_type($value))); + } + + return 'null'; + case is_object($value): + if ($objectSupport) { + return '!php/object:'.serialize($value); + } + + if ($exceptionOnInvalidType) { + throw new DumpException('Object support when dumping a YAML file has been disabled.'); + } + + return 'null'; + case is_array($value): + return self::dumpArray($value, $exceptionOnInvalidType, $objectSupport); + case null === $value: + return 'null'; + case true === $value: + return 'true'; + case false === $value: + return 'false'; + case ctype_digit($value): + return is_string($value) ? "'$value'" : (int) $value; + case is_numeric($value): + $locale = setlocale(LC_NUMERIC, 0); + if (false !== $locale) { + setlocale(LC_NUMERIC, 'C'); + } + if (is_float($value)) { + $repr = (string) $value; + if (is_infinite($value)) { + $repr = str_ireplace('INF', '.Inf', $repr); + } elseif (floor($value) == $value && $repr == $value) { + // Preserve float data type since storing a whole number will result in integer value. + $repr = '!!float '.$repr; + } + } else { + $repr = is_string($value) ? 
"'$value'" : (string) $value; + } + if (false !== $locale) { + setlocale(LC_NUMERIC, $locale); + } + + return $repr; + case '' == $value: + return "''"; + case Escaper::requiresDoubleQuoting($value): + return Escaper::escapeWithDoubleQuotes($value); + case Escaper::requiresSingleQuoting($value): + case preg_match(self::getHexRegex(), $value): + case preg_match(self::getTimestampRegex(), $value): + return Escaper::escapeWithSingleQuotes($value); + default: + return $value; + } + } + + /** + * Dumps a PHP array to a YAML string. + * + * @param array $value The PHP array to dump + * @param bool $exceptionOnInvalidType true if an exception must be thrown on invalid types (a PHP resource or object), false otherwise + * @param bool $objectSupport true if object support is enabled, false otherwise + * + * @return string The YAML string representing the PHP array + */ + private static function dumpArray($value, $exceptionOnInvalidType, $objectSupport) + { + // array + $keys = array_keys($value); + $keysCount = count($keys); + if ((1 === $keysCount && '0' == $keys[0]) + || ($keysCount > 1 && array_reduce($keys, function ($v, $w) { return (int) $v + $w; }, 0) === $keysCount * ($keysCount - 1) / 2) + ) { + $output = array(); + foreach ($value as $val) { + $output[] = self::dump($val, $exceptionOnInvalidType, $objectSupport); + } + + return sprintf('[%s]', implode(', ', $output)); + } + + // mapping + $output = array(); + foreach ($value as $key => $val) { + $output[] = sprintf('%s: %s', self::dump($key, $exceptionOnInvalidType, $objectSupport), self::dump($val, $exceptionOnInvalidType, $objectSupport)); + } + + return sprintf('{ %s }', implode(', ', $output)); + } + + /** + * Parses a scalar to a YAML string. 
+ * + * @param string $scalar + * @param string $delimiters + * @param array $stringDelimiters + * @param int &$i + * @param bool $evaluate + * @param array $references + * + * @return string A YAML string + * + * @throws ParseException When malformed inline YAML string is parsed + * + * @internal + */ + public static function parseScalar($scalar, $delimiters = null, $stringDelimiters = array('"', "'"), &$i = 0, $evaluate = true, $references = array()) + { + if (in_array($scalar[$i], $stringDelimiters)) { + // quoted scalar + $output = self::parseQuotedScalar($scalar, $i); + + if (null !== $delimiters) { + $tmp = ltrim(substr($scalar, $i), ' '); + if (!in_array($tmp[0], $delimiters)) { + throw new ParseException(sprintf('Unexpected characters (%s).', substr($scalar, $i))); + } + } + } else { + // "normal" string + if (!$delimiters) { + $output = substr($scalar, $i); + $i += strlen($output); + + // remove comments + if (preg_match('/[ \t]+#/', $output, $match, PREG_OFFSET_CAPTURE)) { + $output = substr($output, 0, $match[0][1]); + } + } elseif (preg_match('/^(.+?)('.implode('|', $delimiters).')/', substr($scalar, $i), $match)) { + $output = $match[1]; + $i += strlen($output); + } else { + throw new ParseException(sprintf('Malformed inline YAML string (%s).', $scalar)); + } + + // a non-quoted string cannot start with @ or ` (reserved) nor with a scalar indicator (| or >) + if ($output && ('@' === $output[0] || '`' === $output[0] || '|' === $output[0] || '>' === $output[0])) { + throw new ParseException(sprintf('The reserved indicator "%s" cannot start a plain scalar; you need to quote the scalar.', $output[0])); + } + + if ($evaluate) { + $output = self::evaluateScalar($output, $references); + } + } + + return $output; + } + + /** + * Parses a quoted scalar to YAML. 
+ * + * @param string $scalar + * @param int &$i + * + * @return string A YAML string + * + * @throws ParseException When malformed inline YAML string is parsed + */ + private static function parseQuotedScalar($scalar, &$i) + { + if (!preg_match('/'.self::REGEX_QUOTED_STRING.'/Au', substr($scalar, $i), $match)) { + throw new ParseException(sprintf('Malformed inline YAML string (%s).', substr($scalar, $i))); + } + + $output = substr($match[0], 1, strlen($match[0]) - 2); + + $unescaper = new Unescaper(); + if ('"' == $scalar[$i]) { + $output = $unescaper->unescapeDoubleQuotedString($output); + } else { + $output = $unescaper->unescapeSingleQuotedString($output); + } + + $i += strlen($match[0]); + + return $output; + } + + /** + * Parses a sequence to a YAML string. + * + * @param string $sequence + * @param int &$i + * @param array $references + * + * @return string A YAML string + * + * @throws ParseException When malformed inline YAML string is parsed + */ + private static function parseSequence($sequence, &$i = 0, $references = array()) + { + $output = array(); + $len = strlen($sequence); + ++$i; + + // [foo, bar, ...] + while ($i < $len) { + switch ($sequence[$i]) { + case '[': + // nested sequence + $output[] = self::parseSequence($sequence, $i, $references); + break; + case '{': + // nested mapping + $output[] = self::parseMapping($sequence, $i, $references); + break; + case ']': + return $output; + case ',': + case ' ': + break; + default: + $isQuoted = in_array($sequence[$i], array('"', "'")); + $value = self::parseScalar($sequence, array(',', ']'), array('"', "'"), $i, true, $references); + + // the value can be an array if a reference has been resolved to an array var + if (!is_array($value) && !$isQuoted && false !== strpos($value, ': ')) { + // embedded mapping? 
+ try { + $pos = 0; + $value = self::parseMapping('{'.$value.'}', $pos, $references); + } catch (\InvalidArgumentException $e) { + // no, it's not + } + } + + $output[] = $value; + + --$i; + } + + ++$i; + } + + throw new ParseException(sprintf('Malformed inline YAML string %s', $sequence)); + } + + /** + * Parses a mapping to a YAML string. + * + * @param string $mapping + * @param int &$i + * @param array $references + * + * @return string A YAML string + * + * @throws ParseException When malformed inline YAML string is parsed + */ + private static function parseMapping($mapping, &$i = 0, $references = array()) + { + $output = array(); + $len = strlen($mapping); + ++$i; + + // {foo: bar, bar:foo, ...} + while ($i < $len) { + switch ($mapping[$i]) { + case ' ': + case ',': + ++$i; + continue 2; + case '}': + if (self::$objectForMap) { + return (object) $output; + } + + return $output; + } + + // key + $key = self::parseScalar($mapping, array(':', ' '), array('"', "'"), $i, false); + + // value + $done = false; + + while ($i < $len) { + switch ($mapping[$i]) { + case '[': + // nested sequence + $value = self::parseSequence($mapping, $i, $references); + // Spec: Keys MUST be unique; first one wins. + // Parser cannot abort this mapping earlier, since lines + // are processed sequentially. + if (!isset($output[$key])) { + $output[$key] = $value; + } + $done = true; + break; + case '{': + // nested mapping + $value = self::parseMapping($mapping, $i, $references); + // Spec: Keys MUST be unique; first one wins. + // Parser cannot abort this mapping earlier, since lines + // are processed sequentially. + if (!isset($output[$key])) { + $output[$key] = $value; + } + $done = true; + break; + case ':': + case ' ': + break; + default: + $value = self::parseScalar($mapping, array(',', '}'), array('"', "'"), $i, true, $references); + // Spec: Keys MUST be unique; first one wins. + // Parser cannot abort this mapping earlier, since lines + // are processed sequentially. 
+ if (!isset($output[$key])) { + $output[$key] = $value; + } + $done = true; + --$i; + } + + ++$i; + + if ($done) { + continue 2; + } + } + } + + throw new ParseException(sprintf('Malformed inline YAML string %s', $mapping)); + } + + /** + * Evaluates scalars and replaces magic values. + * + * @param string $scalar + * @param array $references + * + * @return string A YAML string + * + * @throws ParseException when object parsing support was disabled and the parser detected a PHP object or when a reference could not be resolved + */ + private static function evaluateScalar($scalar, $references = array()) + { + $scalar = trim($scalar); + $scalarLower = strtolower($scalar); + + if (0 === strpos($scalar, '*')) { + if (false !== $pos = strpos($scalar, '#')) { + $value = substr($scalar, 1, $pos - 2); + } else { + $value = substr($scalar, 1); + } + + // an unquoted * + if (false === $value || '' === $value) { + throw new ParseException('A reference must contain at least one character.'); + } + + if (!array_key_exists($value, $references)) { + throw new ParseException(sprintf('Reference "%s" does not exist.', $value)); + } + + return $references[$value]; + } + + switch (true) { + case 'null' === $scalarLower: + case '' === $scalar: + case '~' === $scalar: + return; + case 'true' === $scalarLower: + return true; + case 'false' === $scalarLower: + return false; + // Optimise for returning strings. + case $scalar[0] === '+' || $scalar[0] === '-' || $scalar[0] === '.' || $scalar[0] === '!' || is_numeric($scalar[0]): + switch (true) { + case 0 === strpos($scalar, '!str'): + return (string) substr($scalar, 5); + case 0 === strpos($scalar, '! 
'): + return (int) self::parseScalar(substr($scalar, 2)); + case 0 === strpos($scalar, '!php/object:'): + if (self::$objectSupport) { + return unserialize(substr($scalar, 12)); + } + + if (self::$exceptionOnInvalidType) { + throw new ParseException('Object support when parsing a YAML file has been disabled.'); + } + + return; + case 0 === strpos($scalar, '!!php/object:'): + if (self::$objectSupport) { + return unserialize(substr($scalar, 13)); + } + + if (self::$exceptionOnInvalidType) { + throw new ParseException('Object support when parsing a YAML file has been disabled.'); + } + + return; + case 0 === strpos($scalar, '!!float '): + return (float) substr($scalar, 8); + case ctype_digit($scalar): + $raw = $scalar; + $cast = (int) $scalar; + + return '0' == $scalar[0] ? octdec($scalar) : (((string) $raw == (string) $cast) ? $cast : $raw); + case '-' === $scalar[0] && ctype_digit(substr($scalar, 1)): + $raw = $scalar; + $cast = (int) $scalar; + + return '0' == $scalar[1] ? octdec($scalar) : (((string) $raw === (string) $cast) ? $cast : $raw); + case is_numeric($scalar): + case preg_match(self::getHexRegex(), $scalar): + return '0x' === $scalar[0].$scalar[1] ? hexdec($scalar) : (float) $scalar; + case '.inf' === $scalarLower: + case '.nan' === $scalarLower: + return -log(0); + case '-.inf' === $scalarLower: + return log(0); + case preg_match('/^(-|\+)?[0-9,]+(\.[0-9]+)?$/', $scalar): + return (float) str_replace(',', '', $scalar); + case preg_match(self::getTimestampRegex(), $scalar): + $timeZone = date_default_timezone_get(); + date_default_timezone_set('UTC'); + $time = strtotime($scalar); + date_default_timezone_set($timeZone); + + return $time; + } + default: + return (string) $scalar; + } + } + + /** + * Gets a regex that matches a YAML date. + * + * @return string The regular expression + * + * @see http://www.yaml.org/spec/1.2/spec.html#id2761573 + */ + private static function getTimestampRegex() + { + return <<[0-9][0-9][0-9][0-9]) + -(?P[0-9][0-9]?) 
+ -(?P[0-9][0-9]?) + (?:(?:[Tt]|[ \t]+) + (?P[0-9][0-9]?) + :(?P[0-9][0-9]) + :(?P[0-9][0-9]) + (?:\.(?P[0-9]*))? + (?:[ \t]*(?PZ|(?P[-+])(?P[0-9][0-9]?) + (?::(?P[0-9][0-9]))?))?)? + $~x +EOF; + } + + /** + * Gets a regex that matches a YAML number in hexadecimal notation. + * + * @return string + */ + private static function getHexRegex() + { + return '~^0x[0-9a-f]++$~i'; + } +} diff --git a/Classes/DeviceDetector/Yaml/ParseException.php b/Classes/DeviceDetector/Yaml/ParseException.php new file mode 100644 index 0000000..8a230c3 --- /dev/null +++ b/Classes/DeviceDetector/Yaml/ParseException.php @@ -0,0 +1,141 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace WapplerSystems\ABTest2\DeviceDetector\Yaml; + +/** + * Exception class thrown when an error occurs during parsing. + * + * @author Fabien Potencier + */ +class ParseException extends \RuntimeException +{ + private $parsedFile; + private $parsedLine; + private $snippet; + private $rawMessage; + + /** + * Constructor. + * + * @param string $message The error message + * @param int $parsedLine The line where the error occurred + * @param string $snippet The snippet of code near the problem + * @param string $parsedFile The file name where the error occurred + * @param \Exception $previous The previous exception + */ + public function __construct($message, $parsedLine = -1, $snippet = null, $parsedFile = null, \Exception $previous = null) + { + $this->parsedFile = $parsedFile; + $this->parsedLine = $parsedLine; + $this->snippet = $snippet; + $this->rawMessage = $message; + + $this->updateRepr(); + + parent::__construct($this->message, 0, $previous); + } + + /** + * Gets the snippet of code near the error. + * + * @return string The snippet of code + */ + public function getSnippet() + { + return $this->snippet; + } + + /** + * Sets the snippet of code near the error. 
+ * + * @param string $snippet The code snippet + */ + public function setSnippet($snippet) + { + $this->snippet = $snippet; + + $this->updateRepr(); + } + + /** + * Gets the filename where the error occurred. + * + * This method returns null if a string is parsed. + * + * @return string The filename + */ + public function getParsedFile() + { + return $this->parsedFile; + } + + /** + * Sets the filename where the error occurred. + * + * @param string $parsedFile The filename + */ + public function setParsedFile($parsedFile) + { + $this->parsedFile = $parsedFile; + + $this->updateRepr(); + } + + /** + * Gets the line where the error occurred. + * + * @return int The file line + */ + public function getParsedLine() + { + return $this->parsedLine; + } + + /** + * Sets the line where the error occurred. + * + * @param int $parsedLine The file line + */ + public function setParsedLine($parsedLine) + { + $this->parsedLine = $parsedLine; + + $this->updateRepr(); + } + + private function updateRepr() + { + $this->message = $this->rawMessage; + + $dot = false; + if ('.' === substr($this->message, -1)) { + $this->message = substr($this->message, 0, -1); + $dot = true; + } + + if (null !== $this->parsedFile) { + $this->message .= sprintf(' in %s', json_encode($this->parsedFile, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE)); + } + + if ($this->parsedLine >= 0) { + $this->message .= sprintf(' at line %d', $this->parsedLine); + } + + if ($this->snippet) { + $this->message .= sprintf(' (near "%s")', $this->snippet); + } + + if ($dot) { + $this->message .= '.'; + } + } +} diff --git a/Classes/DeviceDetector/Yaml/Parser.php b/Classes/DeviceDetector/Yaml/Parser.php new file mode 100644 index 0000000..304d47e --- /dev/null +++ b/Classes/DeviceDetector/Yaml/Parser.php @@ -0,0 +1,777 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. 
+ */ + +namespace WapplerSystems\ABTest2\DeviceDetector\Yaml; + + +/** + * Parser parses YAML strings to convert them to PHP arrays. + * + * @author Fabien Potencier + */ +class Parser +{ + const BLOCK_SCALAR_HEADER_PATTERN = '(?P\||>)(?P\+|\-|\d+|\+\d+|\-\d+|\d+\+|\d+\-)?(?P +#.*)?'; + + private $offset = 0; + private $lines = array(); + private $currentLineNb = -1; + private $currentLine = ''; + private $refs = array(); + + /** + * Constructor. + * + * @param int $offset The offset of YAML document (used for line numbers in error messages) + */ + public function __construct($offset = 0) + { + $this->offset = $offset; + } + + /** + * Parses a YAML string to a PHP value. + * + * @param string $value A YAML string + * @param bool $exceptionOnInvalidType true if an exception must be thrown on invalid types (a PHP resource or object), false otherwise + * @param bool $objectSupport true if object support is enabled, false otherwise + * @param bool $objectForMap true if maps should return a stdClass instead of array() + * + * @return mixed A PHP value + * + * @throws ParseException If the YAML is not valid + */ + public function parse($value, $exceptionOnInvalidType = false, $objectSupport = false, $objectForMap = false) + { + if (!preg_match('//u', $value)) { + throw new ParseException('The YAML value does not appear to be valid UTF-8.'); + } + $this->currentLineNb = -1; + $this->currentLine = ''; + $value = $this->cleanup($value); + $this->lines = explode("\n", $value); + + if (2 /* MB_OVERLOAD_STRING */ & (int) ini_get('mbstring.func_overload')) { + $mbEncoding = mb_internal_encoding(); + mb_internal_encoding('UTF-8'); + } + + $data = array(); + $context = null; + $allowOverwrite = false; + while ($this->moveToNextLine()) { + if ($this->isCurrentLineEmpty()) { + continue; + } + + // tab? 
+ if ("\t" === $this->currentLine[0]) { + throw new ParseException('A YAML file cannot contain tabs as indentation.', $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + + $isRef = $mergeNode = false; + if (preg_match('#^\-((?P\s+)(?P.+?))?\s*$#u', $this->currentLine, $values)) { + if ($context && 'mapping' == $context) { + throw new ParseException('You cannot define a sequence item when in a mapping'); + } + $context = 'sequence'; + + if (isset($values['value']) && preg_match('#^&(?P[^ ]+) *(?P.*)#u', $values['value'], $matches)) { + $isRef = $matches['ref']; + $values['value'] = $matches['value']; + } + + // array + if (!isset($values['value']) || '' == trim($values['value'], ' ') || 0 === strpos(ltrim($values['value'], ' '), '#')) { + $c = $this->getRealCurrentLineNb() + 1; + $parser = new self($c); + $parser->refs = &$this->refs; + $data[] = $parser->parse($this->getNextEmbedBlock(null, true), $exceptionOnInvalidType, $objectSupport, $objectForMap); + } else { + if (isset($values['leadspaces']) + && preg_match('#^(?P'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\{\[].*?) *\:(\s+(?P.+?))?\s*$#u', $values['value'], $matches) + ) { + // this is a compact notation element, add to next block and parse + $c = $this->getRealCurrentLineNb(); + $parser = new self($c); + $parser->refs = &$this->refs; + + $block = $values['value']; + if ($this->isNextLineIndented()) { + $block .= "\n".$this->getNextEmbedBlock($this->getCurrentLineIndentation() + strlen($values['leadspaces']) + 1); + } + + $data[] = $parser->parse($block, $exceptionOnInvalidType, $objectSupport, $objectForMap); + } else { + $data[] = $this->parseValue($values['value'], $exceptionOnInvalidType, $objectSupport, $objectForMap, $context); + } + } + if ($isRef) { + $this->refs[$isRef] = end($data); + } + } elseif (preg_match('#^(?P'.Inline::REGEX_QUOTED_STRING.'|[^ \'"\[\{].*?) 
*\:(\s+(?P.+?))?\s*$#u', $this->currentLine, $values) && (false === strpos($values['key'], ' #') || in_array($values['key'][0], array('"', "'")))) { + if ($context && 'sequence' == $context) { + throw new ParseException('You cannot define a mapping item when in a sequence'); + } + $context = 'mapping'; + + // force correct settings + Inline::parse(null, $exceptionOnInvalidType, $objectSupport, $objectForMap, $this->refs); + try { + $key = Inline::parseScalar($values['key']); + } catch (ParseException $e) { + $e->setParsedLine($this->getRealCurrentLineNb() + 1); + $e->setSnippet($this->currentLine); + + throw $e; + } + + // Convert float keys to strings, to avoid being converted to integers by PHP + if (is_float($key)) { + $key = (string) $key; + } + + if ('<<' === $key) { + $mergeNode = true; + $allowOverwrite = true; + if (isset($values['value']) && 0 === strpos($values['value'], '*')) { + $refName = substr($values['value'], 1); + if (!array_key_exists($refName, $this->refs)) { + throw new ParseException(sprintf('Reference "%s" does not exist.', $refName), $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + + $refValue = $this->refs[$refName]; + + if (!is_array($refValue)) { + throw new ParseException('YAML merge keys used with a scalar value instead of an array.', $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + + foreach ($refValue as $key => $value) { + if (!isset($data[$key])) { + $data[$key] = $value; + } + } + } else { + if (isset($values['value']) && $values['value'] !== '') { + $value = $values['value']; + } else { + $value = $this->getNextEmbedBlock(); + } + $c = $this->getRealCurrentLineNb() + 1; + $parser = new self($c); + $parser->refs = &$this->refs; + $parsed = $parser->parse($value, $exceptionOnInvalidType, $objectSupport, $objectForMap); + + if (!is_array($parsed)) { + throw new ParseException('YAML merge keys used with a scalar value instead of an array.', $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + + if 
(isset($parsed[0])) { + // If the value associated with the merge key is a sequence, then this sequence is expected to contain mapping nodes + // and each of these nodes is merged in turn according to its order in the sequence. Keys in mapping nodes earlier + // in the sequence override keys specified in later mapping nodes. + foreach ($parsed as $parsedItem) { + if (!is_array($parsedItem)) { + throw new ParseException('Merge items must be arrays.', $this->getRealCurrentLineNb() + 1, $parsedItem); + } + + foreach ($parsedItem as $key => $value) { + if (!isset($data[$key])) { + $data[$key] = $value; + } + } + } + } else { + // If the value associated with the key is a single mapping node, each of its key/value pairs is inserted into the + // current mapping, unless the key already exists in it. + foreach ($parsed as $key => $value) { + if (!isset($data[$key])) { + $data[$key] = $value; + } + } + } + } + } elseif (isset($values['value']) && preg_match('#^&(?P[^ ]+) *(?P.*)#u', $values['value'], $matches)) { + $isRef = $matches['ref']; + $values['value'] = $matches['value']; + } + + if ($mergeNode) { + // Merge keys + } elseif (!isset($values['value']) || '' == trim($values['value'], ' ') || 0 === strpos(ltrim($values['value'], ' '), '#')) { + // hash + // if next line is less indented or equal, then it means that the current value is null + if (!$this->isNextLineIndented() && !$this->isNextLineUnIndentedCollection()) { + // Spec: Keys MUST be unique; first one wins. + // But overwriting is allowed when a merge node is used in current block. + if ($allowOverwrite || !isset($data[$key])) { + $data[$key] = null; + } + } else { + $c = $this->getRealCurrentLineNb() + 1; + $parser = new self($c); + $parser->refs = &$this->refs; + $value = $parser->parse($this->getNextEmbedBlock(), $exceptionOnInvalidType, $objectSupport, $objectForMap); + // Spec: Keys MUST be unique; first one wins. + // But overwriting is allowed when a merge node is used in current block. 
+ if ($allowOverwrite || !isset($data[$key])) { + $data[$key] = $value; + } + } + } else { + $value = $this->parseValue($values['value'], $exceptionOnInvalidType, $objectSupport, $objectForMap, $context); + // Spec: Keys MUST be unique; first one wins. + // But overwriting is allowed when a merge node is used in current block. + if ($allowOverwrite || !isset($data[$key])) { + $data[$key] = $value; + } + } + if ($isRef) { + $this->refs[$isRef] = $data[$key]; + } + } else { + // multiple documents are not supported + if ('---' === $this->currentLine) { + throw new ParseException('Multiple documents are not supported.'); + } + + // 1-liner optionally followed by newline(s) + if (is_string($value) && $this->lines[0] === trim($value)) { + try { + $value = Inline::parse($this->lines[0], $exceptionOnInvalidType, $objectSupport, $objectForMap, $this->refs); + } catch (ParseException $e) { + $e->setParsedLine($this->getRealCurrentLineNb() + 1); + $e->setSnippet($this->currentLine); + + throw $e; + } + + if (is_array($value)) { + $first = reset($value); + if (is_string($first) && 0 === strpos($first, '*')) { + $data = array(); + foreach ($value as $alias) { + $data[] = $this->refs[substr($alias, 1)]; + } + $value = $data; + } + } + + if (isset($mbEncoding)) { + mb_internal_encoding($mbEncoding); + } + + return $value; + } + + switch (preg_last_error()) { + case PREG_INTERNAL_ERROR: + $error = 'Internal PCRE error.'; + break; + case PREG_BACKTRACK_LIMIT_ERROR: + $error = 'pcre.backtrack_limit reached.'; + break; + case PREG_RECURSION_LIMIT_ERROR: + $error = 'pcre.recursion_limit reached.'; + break; + case PREG_BAD_UTF8_ERROR: + $error = 'Malformed UTF-8 data.'; + break; + case PREG_BAD_UTF8_OFFSET_ERROR: + $error = 'Offset doesn\'t correspond to the begin of a valid UTF-8 code point.'; + break; + default: + $error = 'Unable to parse.'; + } + + throw new ParseException($error, $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + } + + if (isset($mbEncoding)) { + 
mb_internal_encoding($mbEncoding); + } + + if ($objectForMap && !is_object($data) && 'mapping' === $context) { + $object = new \stdClass(); + + foreach ($data as $key => $value) { + $object->$key = $value; + } + + $data = $object; + } + + return empty($data) ? null : $data; + } + + /** + * Returns the current line number (takes the offset into account). + * + * @return int The current line number + */ + private function getRealCurrentLineNb() + { + return $this->currentLineNb + $this->offset; + } + + /** + * Returns the current line indentation. + * + * @return int The current line indentation + */ + private function getCurrentLineIndentation() + { + return strlen($this->currentLine) - strlen(ltrim($this->currentLine, ' ')); + } + + /** + * Returns the next embed block of YAML. + * + * @param int $indentation The indent level at which the block is to be read, or null for default + * @param bool $inSequence True if the enclosing data structure is a sequence + * + * @return string A YAML string + * + * @throws ParseException When indentation problem are detected + */ + private function getNextEmbedBlock($indentation = null, $inSequence = false) + { + $oldLineIndentation = $this->getCurrentLineIndentation(); + $blockScalarIndentations = array(); + + if ($this->isBlockScalarHeader()) { + $blockScalarIndentations[] = $this->getCurrentLineIndentation(); + } + + if (!$this->moveToNextLine()) { + return; + } + + if (null === $indentation) { + $newIndent = $this->getCurrentLineIndentation(); + + $unindentedEmbedBlock = $this->isStringUnIndentedCollectionItem(); + + if (!$this->isCurrentLineEmpty() && 0 === $newIndent && !$unindentedEmbedBlock) { + throw new ParseException('Indentation problem.', $this->getRealCurrentLineNb() + 1, $this->currentLine); + } + } else { + $newIndent = $indentation; + } + + $data = array(); + if ($this->getCurrentLineIndentation() >= $newIndent) { + $data[] = substr($this->currentLine, $newIndent); + } else { + $this->moveToPreviousLine(); + + 
return; + } + + if ($inSequence && $oldLineIndentation === $newIndent && isset($data[0][0]) && '-' === $data[0][0]) { + // the previous line contained a dash but no item content, this line is a sequence item with the same indentation + // and therefore no nested list or mapping + $this->moveToPreviousLine(); + + return; + } + + $isItUnindentedCollection = $this->isStringUnIndentedCollectionItem(); + + if (empty($blockScalarIndentations) && $this->isBlockScalarHeader()) { + $blockScalarIndentations[] = $this->getCurrentLineIndentation(); + } + + $previousLineIndentation = $this->getCurrentLineIndentation(); + + while ($this->moveToNextLine()) { + $indent = $this->getCurrentLineIndentation(); + + // terminate all block scalars that are more indented than the current line + if (!empty($blockScalarIndentations) && $indent < $previousLineIndentation && trim($this->currentLine) !== '') { + foreach ($blockScalarIndentations as $key => $blockScalarIndentation) { + if ($blockScalarIndentation >= $this->getCurrentLineIndentation()) { + unset($blockScalarIndentations[$key]); + } + } + } + + if (empty($blockScalarIndentations) && !$this->isCurrentLineComment() && $this->isBlockScalarHeader()) { + $blockScalarIndentations[] = $this->getCurrentLineIndentation(); + } + + $previousLineIndentation = $indent; + + if ($isItUnindentedCollection && !$this->isStringUnIndentedCollectionItem() && $newIndent === $indent) { + $this->moveToPreviousLine(); + break; + } + + if ($this->isCurrentLineBlank()) { + $data[] = substr($this->currentLine, $newIndent); + continue; + } + + // we ignore "comment" lines only when we are not inside a scalar block + if (empty($blockScalarIndentations) && $this->isCurrentLineComment()) { + continue; + } + + if ($indent >= $newIndent) { + $data[] = substr($this->currentLine, $newIndent); + } elseif (0 == $indent) { + $this->moveToPreviousLine(); + + break; + } else { + throw new ParseException('Indentation problem.', $this->getRealCurrentLineNb() + 1, 
$this->currentLine); + } + } + + return implode("\n", $data); + } + + /** + * Moves the parser to the next line. + * + * @return bool + */ + private function moveToNextLine() + { + if ($this->currentLineNb >= count($this->lines) - 1) { + return false; + } + + $this->currentLine = $this->lines[++$this->currentLineNb]; + + return true; + } + + /** + * Moves the parser to the previous line. + */ + private function moveToPreviousLine() + { + $this->currentLine = $this->lines[--$this->currentLineNb]; + } + + /** + * Parses a YAML value. + * + * @param string $value A YAML value + * @param bool $exceptionOnInvalidType True if an exception must be thrown on invalid types false otherwise + * @param bool $objectSupport True if object support is enabled, false otherwise + * @param bool $objectForMap true if maps should return a stdClass instead of array() + * @param string $context The parser context (either sequence or mapping) + * + * @return mixed A PHP value + * + * @throws ParseException When reference does not exist + */ + private function parseValue($value, $exceptionOnInvalidType, $objectSupport, $objectForMap, $context) + { + if (0 === strpos($value, '*')) { + if (false !== $pos = strpos($value, '#')) { + $value = substr($value, 1, $pos - 2); + } else { + $value = substr($value, 1); + } + + if (!array_key_exists($value, $this->refs)) { + throw new ParseException(sprintf('Reference "%s" does not exist.', $value), $this->currentLine); + } + + return $this->refs[$value]; + } + + if (preg_match('/^'.self::BLOCK_SCALAR_HEADER_PATTERN.'$/', $value, $matches)) { + $modifiers = isset($matches['modifiers']) ? 
$matches['modifiers'] : ''; + + return $this->parseBlockScalar($matches['separator'], preg_replace('#\d+#', '', $modifiers), (int) abs($modifiers)); + } + + try { + $parsedValue = Inline::parse($value, $exceptionOnInvalidType, $objectSupport, $objectForMap, $this->refs); + + if ('mapping' === $context && '"' !== $value[0] && "'" !== $value[0] && '[' !== $value[0] && '{' !== $value[0] && '!' !== $value[0] && false !== strpos($parsedValue, ': ')) { + throw new ParseException('A colon cannot be used in an unquoted mapping value.'); + } + + return $parsedValue; + } catch (ParseException $e) { + $e->setParsedLine($this->getRealCurrentLineNb() + 1); + $e->setSnippet($this->currentLine); + + throw $e; + } + } + + /** + * Parses a block scalar. + * + * @param string $style The style indicator that was used to begin this block scalar (| or >) + * @param string $chomping The chomping indicator that was used to begin this block scalar (+ or -) + * @param int $indentation The indentation indicator that was used to begin this block scalar + * + * @return string The text value + */ + private function parseBlockScalar($style, $chomping = '', $indentation = 0) + { + $notEOF = $this->moveToNextLine(); + if (!$notEOF) { + return ''; + } + + $isCurrentLineBlank = $this->isCurrentLineBlank(); + $blockLines = array(); + + // leading blank lines are consumed before determining indentation + while ($notEOF && $isCurrentLineBlank) { + // newline only if not EOF + if ($notEOF = $this->moveToNextLine()) { + $blockLines[] = ''; + $isCurrentLineBlank = $this->isCurrentLineBlank(); + } + } + + // determine indentation if not specified + if (0 === $indentation) { + if (preg_match('/^ +/', $this->currentLine, $matches)) { + $indentation = strlen($matches[0]); + } + } + + if ($indentation > 0) { + $pattern = sprintf('/^ {%d}(.*)$/', $indentation); + + while ( + $notEOF && ( + $isCurrentLineBlank || + preg_match($pattern, $this->currentLine, $matches) + ) + ) { + if ($isCurrentLineBlank && 
strlen($this->currentLine) > $indentation) { + $blockLines[] = substr($this->currentLine, $indentation); + } elseif ($isCurrentLineBlank) { + $blockLines[] = ''; + } else { + $blockLines[] = $matches[1]; + } + + // newline only if not EOF + if ($notEOF = $this->moveToNextLine()) { + $isCurrentLineBlank = $this->isCurrentLineBlank(); + } + } + } elseif ($notEOF) { + $blockLines[] = ''; + } + + if ($notEOF) { + $blockLines[] = ''; + $this->moveToPreviousLine(); + } + + // folded style + if ('>' === $style) { + $text = ''; + $previousLineIndented = false; + $previousLineBlank = false; + + for ($i = 0; $i < count($blockLines); ++$i) { + if ('' === $blockLines[$i]) { + $text .= "\n"; + $previousLineIndented = false; + $previousLineBlank = true; + } elseif (' ' === $blockLines[$i][0]) { + $text .= "\n".$blockLines[$i]; + $previousLineIndented = true; + $previousLineBlank = false; + } elseif ($previousLineIndented) { + $text .= "\n".$blockLines[$i]; + $previousLineIndented = false; + $previousLineBlank = false; + } elseif ($previousLineBlank || 0 === $i) { + $text .= $blockLines[$i]; + $previousLineIndented = false; + $previousLineBlank = false; + } else { + $text .= ' '.$blockLines[$i]; + $previousLineIndented = false; + $previousLineBlank = false; + } + } + } else { + $text = implode("\n", $blockLines); + } + + // deal with trailing newlines + if ('' === $chomping) { + $text = preg_replace('/\n+$/', "\n", $text); + } elseif ('-' === $chomping) { + $text = preg_replace('/\n+$/', '', $text); + } + + return $text; + } + + /** + * Returns true if the next line is indented. 
+ * + * @return bool Returns true if the next line is indented, false otherwise + */ + private function isNextLineIndented() + { + $currentIndentation = $this->getCurrentLineIndentation(); + $EOF = !$this->moveToNextLine(); + + while (!$EOF && $this->isCurrentLineEmpty()) { + $EOF = !$this->moveToNextLine(); + } + + if ($EOF) { + return false; + } + + $ret = false; + if ($this->getCurrentLineIndentation() > $currentIndentation) { + $ret = true; + } + + $this->moveToPreviousLine(); + + return $ret; + } + + /** + * Returns true if the current line is blank or if it is a comment line. + * + * @return bool Returns true if the current line is empty or if it is a comment line, false otherwise + */ + private function isCurrentLineEmpty() + { + return $this->isCurrentLineBlank() || $this->isCurrentLineComment(); + } + + /** + * Returns true if the current line is blank. + * + * @return bool Returns true if the current line is blank, false otherwise + */ + private function isCurrentLineBlank() + { + return '' == trim($this->currentLine, ' '); + } + + /** + * Returns true if the current line is a comment line. + * + * @return bool Returns true if the current line is a comment line, false otherwise + */ + private function isCurrentLineComment() + { + //checking explicitly the first char of the trim is faster than loops or strpos + $ltrimmedLine = ltrim($this->currentLine, ' '); + + return '' !== $ltrimmedLine && $ltrimmedLine[0] === '#'; + } + + /** + * Cleanups a YAML string to be parsed. 
+ * + * @param string $value The input YAML string + * + * @return string A cleaned up YAML string + */ + private function cleanup($value) + { + $value = str_replace(array("\r\n", "\r"), "\n", $value); + + // strip YAML header + $count = 0; + $value = preg_replace('#^\%YAML[: ][\d\.]+.*\n#u', '', $value, -1, $count); + $this->offset += $count; + + // remove leading comments + $trimmedValue = preg_replace('#^(\#.*?\n)+#s', '', $value, -1, $count); + if ($count == 1) { + // items have been removed, update the offset + $this->offset += substr_count($value, "\n") - substr_count($trimmedValue, "\n"); + $value = $trimmedValue; + } + + // remove start of the document marker (---) + $trimmedValue = preg_replace('#^\-\-\-.*?\n#s', '', $value, -1, $count); + if ($count == 1) { + // items have been removed, update the offset + $this->offset += substr_count($value, "\n") - substr_count($trimmedValue, "\n"); + $value = $trimmedValue; + + // remove end of the document marker (...) + $value = preg_replace('#\.\.\.\s*$#', '', $value); + } + + return $value; + } + + /** + * Returns true if the next line starts unindented collection. + * + * @return bool Returns true if the next line starts unindented collection, false otherwise + */ + private function isNextLineUnIndentedCollection() + { + $currentIndentation = $this->getCurrentLineIndentation(); + $notEOF = $this->moveToNextLine(); + + while ($notEOF && $this->isCurrentLineEmpty()) { + $notEOF = $this->moveToNextLine(); + } + + if (false === $notEOF) { + return false; + } + + $ret = false; + if ( + $this->getCurrentLineIndentation() == $currentIndentation + && + $this->isStringUnIndentedCollectionItem() + ) { + $ret = true; + } + + $this->moveToPreviousLine(); + + return $ret; + } + + /** + * Returns true if the string is un-indented collection item. 
+ * + * @return bool Returns true if the string is un-indented collection item, false otherwise + */ + private function isStringUnIndentedCollectionItem() + { + return 0 === strpos($this->currentLine, '- '); + } + + /** + * Tests whether or not the current line is the header of a block scalar. + * + * @return bool + */ + private function isBlockScalarHeader() + { + return (bool) preg_match('~'.self::BLOCK_SCALAR_HEADER_PATTERN.'$~', $this->currentLine); + } +} diff --git a/Classes/DeviceDetector/Yaml/Unescaper.php b/Classes/DeviceDetector/Yaml/Unescaper.php new file mode 100644 index 0000000..62623e1 --- /dev/null +++ b/Classes/DeviceDetector/Yaml/Unescaper.php @@ -0,0 +1,141 @@ + + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace WapplerSystems\ABTest2\DeviceDetector\Yaml; + + +/** + * Unescaper encapsulates unescaping rules for single and double-quoted + * YAML strings. + * + * @author Matthew Lewinski + * + * @internal + */ +class Unescaper +{ + /** + * Regex fragment that matches an escaped character in a double quoted string. + */ + const REGEX_ESCAPED_CHARACTER = '\\\\(x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8}|.)'; + + /** + * Unescapes a single quoted string. + * + * @param string $value A single quoted string. + * + * @return string The unescaped string. + */ + public function unescapeSingleQuotedString($value) + { + return str_replace('\'\'', '\'', $value); + } + + /** + * Unescapes a double quoted string. + * + * @param string $value A double quoted string. + * + * @return string The unescaped string. + */ + public function unescapeDoubleQuotedString($value) + { + $callback = function ($match) { + return $this->unescapeCharacter($match[0]); + }; + + // evaluate the string + return preg_replace_callback('/'.self::REGEX_ESCAPED_CHARACTER.'/u', $callback, $value); + } + + /** + * Unescapes a character that was found in a double-quoted string. 
+ * + * @param string $value An escaped character + * + * @return string The unescaped character + */ + private function unescapeCharacter($value) + { + switch ($value[1]) { + case '0': + return "\x0"; + case 'a': + return "\x7"; + case 'b': + return "\x8"; + case 't': + return "\t"; + case "\t": + return "\t"; + case 'n': + return "\n"; + case 'v': + return "\xB"; + case 'f': + return "\xC"; + case 'r': + return "\r"; + case 'e': + return "\x1B"; + case ' ': + return ' '; + case '"': + return '"'; + case '/': + return '/'; + case '\\': + return '\\'; + case 'N': + // U+0085 NEXT LINE + return "\xC2\x85"; + case '_': + // U+00A0 NO-BREAK SPACE + return "\xC2\xA0"; + case 'L': + // U+2028 LINE SEPARATOR + return "\xE2\x80\xA8"; + case 'P': + // U+2029 PARAGRAPH SEPARATOR + return "\xE2\x80\xA9"; + case 'x': + return self::utf8chr(hexdec(substr($value, 2, 2))); + case 'u': + return self::utf8chr(hexdec(substr($value, 2, 4))); + case 'U': + return self::utf8chr(hexdec(substr($value, 2, 8))); + default: + throw new ParseException(sprintf('Found unknown escape character "%s".', $value)); + } + } + + /** + * Get the UTF-8 character for the given code point. 
+ * + * @param int $c The unicode code point + * + * @return string The corresponding UTF-8 character + */ + private static function utf8chr($c) + { + if (0x80 > $c %= 0x200000) { + return chr($c); + } + if (0x800 > $c) { + return chr(0xC0 | $c >> 6).chr(0x80 | $c & 0x3F); + } + if (0x10000 > $c) { + return chr(0xE0 | $c >> 12).chr(0x80 | $c >> 6 & 0x3F).chr(0x80 | $c & 0x3F); + } + + return chr(0xF0 | $c >> 18).chr(0x80 | $c >> 12 & 0x3F).chr(0x80 | $c >> 6 & 0x3F).chr(0x80 | $c & 0x3F); + } +} diff --git a/Classes/Helper.php b/Classes/Helper.php index 53020be..016c209 100644 --- a/Classes/Helper.php +++ b/Classes/Helper.php @@ -13,6 +13,8 @@ use TYPO3\CMS\Core\Utility\GeneralUtility; use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController; use TYPO3\CMS\Frontend\Page\CacheHashCalculator; use TYPO3\CMS\Frontend\Page\PageRepository; +use WapplerSystems\ABTest2\DeviceDetector\DeviceDetector; +use WapplerSystems\ABTest2\DeviceDetector\Yaml\Parser; /** * This class detects which page version (either by cookie or by random) and sets the page content ID accordingly. @@ -32,9 +34,14 @@ class Helper */ public function determineContentId(array $params, &$tsFeController) { - - // only try to change the page if it's not the googlebot. 
- if (true === stripos($_SERVER['HTTP_USER_AGENT'], 'googlebot')) return; + $deviceDetector = new DeviceDetector(); + $deviceDetector->setUserAgent($_SERVER['HTTP_USER_AGENT']); + try { + $deviceDetector->setYamlParser(new Parser()); + $deviceDetector->parse(); + if ($deviceDetector->isBot()) return; + } catch (\Exception $e) { + } $currentPageId = $targetPageId = $tsFeController->id; diff --git a/Configuration/YAML/bots.yml b/Configuration/YAML/bots.yml new file mode 100644 index 0000000..ff5da52 --- /dev/null +++ b/Configuration/YAML/bots.yml @@ -0,0 +1,1572 @@ +############### +# Device Detector - The Universal Device Detection library for parsing User Agents +# +# @link http://piwik.org +# @license http://www.gnu.org/licenses/lgpl.html LGPL v3 or later +############### + +- regex: '360Spider(-Image|-Video)?' + name: '360Spider' + category: 'Search bot' + url: 'http://www.so.com/help/help_3_2.html' + producer: + name: 'Online Media Group, Inc.' + url: '' + +- regex: 'Aboundex' + name: 'Aboundexbot' + category: 'Search bot' + url: 'http://www.aboundex.com/crawler/' + producer: + name: 'Aboundex.com' + url: 'http://www.aboundex.com' + +- regex: 'AcoonBot' + name: 'Acoon' + category: 'Search bot' + url: 'http://www.acoon.de/robot.asp' + producer: + name: 'Acoon GmbH' + url: 'http://www.acoon.de' + +- regex: 'AddThis\.com' + name: 'AddThis.com' + category: 'Social Media Agent' + url: '' + producer: + name: 'Clearspring Technologies, Inc.' 
+ url: 'http://www.clearspring.com' + +- regex: 'AhrefsBot' + name: 'aHrefs Bot' + category: 'Crawler' + url: 'http://ahrefs.com/robot' + producer: + name: 'Ahrefs Pte Ltd' + url: 'http://ahrefs.com/robot' + +- regex: 'ia_archiver|alexabot|verifybot' + name: 'Alexa Crawler' + category: 'Search bot' + url: 'https://alexa.zendesk.com/hc/en-us/sections/200100794-Crawlers' + producer: + name: 'Alexa Internet' + url: 'http://www.alexa.com' + +- regex: 'AmorankSpider' + name: 'Amorank Spider' + category: 'Crawler' + url: 'http://amorank.com/webcrawler.html' + producer: + name: 'Amorank' + url: 'http://www.amorank.com' + +- regex: 'ApacheBench' + name: 'ApacheBench' + category: 'Benchmark' + url: 'https://httpd.apache.org/docs/2.4/programs/ab.html' + producer: + name: 'The Apache Software Foundation' + url: 'http://www.apache.org/foundation/' + +- regex: 'Applebot' + name: 'Applebot' + category: 'Crawler' + url: 'http://www.apple.com/go/applebot' + producer: + name: 'Apple Inc' + url: 'http://www.apple.com' + +- regex: 'Castro 2, Episode Duration Lookup' + name: 'Castro 2' + category: 'Service Agent' + url: 'http://supertop.co/castro/' + producer: + name: 'Supertop' + url: 'http://supertop.co' + +- regex: 'Curious George' + name: 'Analytics SEO Crawler' + category: 'Crawler' + url: 'http://www.analyticsseo.com/crawler' + producer: + name: 'Analytics SEO' + url: 'http://www.analyticsseo.com' + +- regex: 'archive\.org_bot|special_archiver' + name: 'archive.org bot' + category: 'Crawler' + url: 'http://www.archive.org/details/archive.org_bot' + producer: + name: 'The Internet Archive' + url: 'http://www.archive.org' + +- regex: 'Ask Jeeves/Teoma' + name: 'Ask Jeeves' + category: 'Search bot' + url: '' + producer: + name: 'Ask Jeeves Inc.' 
+ url: 'http://www.ask.com' + +- regex: 'Backlink-Check\.de' + name: 'Backlink-Check.de' + category: 'Crawler' + url: 'http://www.backlink-check.de/bot.html' + producer: + name: 'Mediagreen Medienservice' + url: 'http://www.backlink-check.de' + +- regex: 'BacklinkCrawler' + name: 'BacklinkCrawler' + category: 'Crawler' + url: 'http://www.backlinktest.com/crawler.html' + producer: + name: '2.0Promotion GbR' + url: 'http://www.backlinktest.com' + +- regex: 'baiduspider(-image)?|baidu Transcoder|baidu.*spider' + name: 'Baidu Spider' + category: 'Search bot' + url: 'http://www.baidu.com/search/spider.htm' + producer: + name: 'Baidu' + url: 'http://www.baidu.com' + +- regex: 'BazQux' + name: 'BazQux Reader' + url: 'https://bazqux.com/fetcher' + category: 'Feed Fetcher' + producer: + name: '' + url: '' + +- regex: 'MSNBot|msrbot|bingbot|BingPreview|msnbot-(UDiscovery|NewsBlogs)|adidxbot' + name: 'BingBot' + category: 'Search bot' + url: 'http://search.msn.com/msnbot.htmn' + producer: + name: 'Microsoft Corporation' + url: 'http://www.microsoft.com' + +- regex: 'Blekkobot' + name: 'Blekkobot' + category: 'Search bot' + url: 'http://blekko.com/about/blekkobot' + producer: + name: 'Blekko' + url: 'http://blekko.com' + +- regex: 'BLEXBot(Test)?' + name: 'BLEXBot Crawler' + category: 'Crawler' + url: 'http://webmeup-crawler.com' + producer: + name: 'WebMeUp' + url: 'http://webmeup.com' + +- regex: 'Bloglovin' + name: 'Bloglovin' + url: 'http://www.bloglovin.com' + category: 'Feed Fetcher' + producer: + name: '' + url: '' + +- regex: 'Blogtrottr' + name: 'Blogtrottr' + url: '' + category: 'Feed Fetcher' + producer: + name: 'Blogtrottr Ltd' + url: 'https://blogtrottr.com/' + +- regex: 'BountiiBot' + name: 'Bountii Bot' + category: 'Search bot' + url: 'http://bountii.com/contact.php' + producer: + name: 'Bountii Inc.' 
+ url: 'http://bountii.com' + +- regex: 'Browsershots' + name: 'Browsershots' + category: 'Service Agent' + url: 'http://browsershots.org/faq' + producer: + name: 'Browsershots.org' + url: 'http://browsershots.org' + +- regex: 'BUbiNG' + name: 'BUbiNG' + category: 'Crawler' + url: 'http://law.di.unimi.it/BUbiNG.html' + producer: + name: 'The Laboratory for Web Algorithmics (LAW)' + url: 'http://law.di.unimi.it/software.php#buging' + +- regex: '(?