TASK: Filter unnecessary URLs.

This commit is contained in:
Daniel Siepmann 2018-10-23 14:43:24 +02:00
parent 2337b7d8e4
commit c35ea7c8ef
Signed by: Daniel Siepmann
GPG key ID: 33D6629915560EF4

View file

@ -57,6 +57,7 @@ class CrawlerService
public function crawl(UrlListDto $linkList) public function crawl(UrlListDto $linkList)
{ {
while ($url = $linkList->getNextUrl()) { while ($url = $linkList->getNextUrl()) {
$uri = new Uri($url);
$this->driver->get($url); $this->driver->get($url);
$screenshotHeight = $this->driver->findElement(WebDriverBy::cssSelector('body')) $screenshotHeight = $this->driver->findElement(WebDriverBy::cssSelector('body'))
->getSize() ->getSize()
@ -114,7 +115,24 @@ class CrawlerService
'', '',
(new Uri($this->baseUrl))->getHost(), (new Uri($this->baseUrl))->getHost(),
]; ];
$validSchemas = [
'http',
'https',
];
return in_array($uri->getHost(), $validHosts); $invalidFileExtensions = [
'.pdf',
'.jpg',
'.gif',
'.svg',
];
$pathEnding = substr($uri->getPath(), -4);
return in_array($uri->getHost(), $validHosts)
&& in_array($uri->getScheme(), $validSchemas)
&& !in_array($pathEnding, $invalidFileExtensions)
&& strpos((string) $uri, 'eID=') === false
;
} }
} }