TASK: Filter unnecessary URLs.
This commit is contained in:
parent
2337b7d8e4
commit
c35ea7c8ef
1 changed file with 19 additions and 1 deletion
|
@ -57,6 +57,7 @@ class CrawlerService
|
||||||
public function crawl(UrlListDto $linkList)
|
public function crawl(UrlListDto $linkList)
|
||||||
{
|
{
|
||||||
while ($url = $linkList->getNextUrl()) {
|
while ($url = $linkList->getNextUrl()) {
|
||||||
|
$uri = new Uri($url);
|
||||||
$this->driver->get($url);
|
$this->driver->get($url);
|
||||||
$screenshotHeight = $this->driver->findElement(WebDriverBy::cssSelector('body'))
|
$screenshotHeight = $this->driver->findElement(WebDriverBy::cssSelector('body'))
|
||||||
->getSize()
|
->getSize()
|
||||||
|
@ -114,7 +115,24 @@ class CrawlerService
|
||||||
'',
|
'',
|
||||||
(new Uri($this->baseUrl))->getHost(),
|
(new Uri($this->baseUrl))->getHost(),
|
||||||
];
|
];
|
||||||
|
$validSchemas = [
|
||||||
|
'http',
|
||||||
|
'https',
|
||||||
|
];
|
||||||
|
|
||||||
return in_array($uri->getHost(), $validHosts);
|
$invalidFileExtensions = [
|
||||||
|
'.pdf',
|
||||||
|
'.jpg',
|
||||||
|
'.gif',
|
||||||
|
'.svg',
|
||||||
|
];
|
||||||
|
|
||||||
|
$pathEnding = substr($uri->getPath(), -4);
|
||||||
|
|
||||||
|
return in_array($uri->getHost(), $validHosts)
|
||||||
|
&& in_array($uri->getScheme(), $validSchemas)
|
||||||
|
&& !in_array($pathEnding, $invalidFileExtensions)
|
||||||
|
&& strpos((string) $uri, 'eID=') === false
|
||||||
|
;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue