/**
 * Fills this report with the mailto nodes of the crawled web
 *
 * A node is kept when it has the type WebNode::TYPE_MAILTO and does not have
 * the type WebNode::TYPE_IGNORED. Kept nodes are indexed by their URL and
 * sorted on that key. Nodes of a previous call are discarded.
 * @param zibo\library\spider\Web $web The resulting web of the crawl
 * @return null
 */
public function setWeb(Web $web) {
    $this->nodes = array();

    foreach ($web->getNodes() as $webNode) {
        if ($webNode->hasType(WebNode::TYPE_MAILTO) && !$webNode->hasType(WebNode::TYPE_IGNORED)) {
            $this->nodes[$webNode->getUrl()] = $webNode;
        }
    }

    ksort($this->nodes);
}
/**
 * Fills this report with the problematic nodes of the crawled web
 *
 * A node with an error is always kept. A node without an error is kept only
 * when it is neither ignored nor a mailto node and its response is missing or
 * is neither a redirect nor a 200 response. Kept nodes are indexed by their
 * URL and sorted on that key. Nodes of a previous call are discarded.
 * @param zibo\library\spider\Web $web The resulting web of the crawl
 * @return null
 */
public function setWeb(Web $web) {
    $this->nodes = array();

    foreach ($web->getNodes() as $webNode) {
        $response = $webNode->getResponse();

        if (!$webNode->getError()) {
            // no error: leave out everything which is fine or not of interest
            if ($webNode->hasType(WebNode::TYPE_IGNORED) || $webNode->hasType(WebNode::TYPE_MAILTO)) {
                continue;
            }

            if ($response && ($response->isRedirect() || $response->getResponseCode() == 200)) {
                continue;
            }
        }

        $this->nodes[$webNode->getUrl()] = $webNode;
    }

    ksort($this->nodes);
}
/**
 * Fills this report with the successfully fetched nodes of the crawled web
 *
 * A node is kept when it is not ignored, has a response with code 200 and
 * has no error. Kept nodes are indexed by their URL and sorted on that key.
 * Nodes of a previous call are discarded.
 * @param zibo\library\spider\Web $web The resulting web of the crawl
 * @return null
 */
public function setWeb(Web $web) {
    $this->nodes = array();

    foreach ($web->getNodes() as $webNode) {
        if ($webNode->hasType(WebNode::TYPE_IGNORED)) {
            continue;
        }

        $response = $webNode->getResponse();
        if (!$response || $response->getResponseCode() != 200) {
            continue;
        }

        if ($webNode->getError()) {
            continue;
        }

        $this->nodes[$webNode->getUrl()] = $webNode;
    }

    ksort($this->nodes);
}
/**
 * Registers every image of the page in the web
 *
 * For each img element with a non-empty src attribute, the source is made
 * absolute, the matching node is flagged as WebNode::TYPE_IMAGE and linked in
 * both directions with the prey.
 * @param zibo\library\spider\Web $web The spider web
 * @param zibo\library\spider\WebNode $prey The current prey in the web
 * @param string $baseUrl Base URL of the crawl
 * @param string $preyBaseUrl Base URL of the prey
 * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
 * @return null
 */
protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom) {
    foreach ($dom->getElementsByTagName('img') as $imageElement) {
        $source = $imageElement->getAttribute('src');
        if ($source) {
            $absoluteUrl = $this->getAbsoluteUrl($source, $baseUrl, $preyBaseUrl);

            $imageNode = $web->getNode($absoluteUrl);
            $imageNode->addType(WebNode::TYPE_IMAGE);
            $imageNode->addReference($prey);

            $prey->addLink($imageNode);
        }
    }
}
/**
 * Adds all the used javascripts to the web
 *
 * Only script elements with a src attribute are considered; inline scripts
 * are skipped. A script is treated as javascript when its type attribute is
 * missing or empty (the HTML default) or equals text/javascript, compared
 * case insensitive and ignoring surrounding whitespace. The matching node is
 * flagged as WebNode::TYPE_JS and linked in both directions with the prey.
 * @param zibo\library\spider\Web $web The spider web
 * @param zibo\library\spider\WebNode $prey The current prey in the web
 * @param string $baseUrl Base URL of the crawl
 * @param string $preyBaseUrl Base URL of the prey
 * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
 * @return null
 */
protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom) {
    $scripts = $dom->getElementsByTagName('script');
    foreach ($scripts as $script) {
        $url = $script->getAttribute('src');
        if (!$url) {
            continue;
        }

        // a script without a type attribute is javascript by default, the
        // cast guards against a non-string value for a missing attribute
        $type = strtolower(trim((string) $script->getAttribute('type')));
        if ($type !== '' && $type !== 'text/javascript') {
            continue;
        }

        $url = $this->getAbsoluteUrl($url, $baseUrl, $preyBaseUrl);

        $link = $web->getNode($url);
        $link->addType(WebNode::TYPE_JS);
        $link->addReference($prey);

        $prey->addLink($link);
    }
}
/**
 * Registers the targets of the anchors of the page in the web
 *
 * Anchors without a href and anchors whose href starts with # are skipped.
 * A href which starts with mailto: is used as is, every other href is made
 * absolute first. The matching node is linked in both directions with the
 * prey.
 * @param zibo\library\spider\Web $web The spider web
 * @param zibo\library\spider\WebNode $prey The current prey in the web
 * @param string $baseUrl Base URL of the crawl
 * @param string $preyBaseUrl Base URL of the prey
 * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
 * @return null
 */
protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom) {
    foreach ($dom->getElementsByTagName('a') as $anchor) {
        $href = $anchor->getAttribute('href');
        if (!$href) {
            continue;
        }

        if (String::startsWith($href, '#')) {
            continue;
        }

        if (String::startsWith($href, 'mailto:')) {
            $target = $href;
        } else {
            $target = $this->getAbsoluteUrl($href, $baseUrl, $preyBaseUrl);
        }

        $targetNode = $web->getNode($target);
        $targetNode->addReference($prey);

        $prey->addLink($targetNode);
    }
}
/**
 * Adds the used style sheets to the web
 *
 * A link element is treated as a style sheet when it has a href, its rel
 * attribute contains the token stylesheet and its type attribute is missing,
 * empty or equals text/css. The rel attribute is read as a whitespace
 * separated token list, so a value like "alternate stylesheet" matches as
 * well. Both attributes are compared case insensitive. The matching node is
 * flagged as WebNode::TYPE_CSS and linked in both directions with the prey.
 * @param zibo\library\spider\Web $web The spider web
 * @param zibo\library\spider\WebNode $prey The current prey in the web
 * @param string $baseUrl Base URL of the crawl
 * @param string $preyBaseUrl Base URL of the prey
 * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
 * @return null
 */
protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom) {
    $elements = $dom->getElementsByTagName('link');
    foreach ($elements as $element) {
        $url = $element->getAttribute('href');
        if (!$url) {
            continue;
        }

        // rel holds a list of tokens, look for stylesheet among them
        $rel = strtolower(trim((string) $element->getAttribute('rel')));
        $relTokens = preg_split('/\s+/', $rel, -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('stylesheet', $relTokens, true)) {
            continue;
        }

        // the type attribute is optional for a style sheet link
        $type = strtolower(trim((string) $element->getAttribute('type')));
        if ($type !== '' && $type !== 'text/css') {
            continue;
        }

        $url = $this->getAbsoluteUrl($url, $baseUrl, $preyBaseUrl);

        // separate variable so the DOM element of the loop is not overwritten
        $node = $web->getNode($url);
        $node->addType(WebNode::TYPE_CSS);
        $node->addReference($prey);

        $prey->addLink($node);
    }
}
/**
 * Registers the style sheets imported by a CSS prey in the web
 *
 * Nothing happens unless the prey has the type WebNode::TYPE_CSS, has a
 * response with code 200 and that response has content. Every URL which
 * getImportUrlsFromStyle() extracts from the content is flagged as
 * WebNode::TYPE_CSS and linked in both directions with the prey.
 * @param zibo\library\spider\Web $web The spider web
 * @param zibo\library\spider\WebNode $prey The current prey in the web
 * @param string $baseUrl Base URL of the crawl
 * @param string $preyBaseUrl Base URL of the prey
 * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey, not used by this bite
 * @return null
 */
public function bite(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom = null) {
    $style = null;

    if ($prey->hasType(WebNode::TYPE_CSS)) {
        $response = $prey->getResponse();
        if ($response && $response->getResponseCode() == 200) {
            $style = $response->getContent();
        }
    }

    if (!$style) {
        return;
    }

    foreach ($this->getImportUrlsFromStyle($style, $baseUrl, $preyBaseUrl) as $importUrl) {
        $importNode = $web->getNode($importUrl);
        $importNode->addType(WebNode::TYPE_CSS);
        $importNode->addReference($prey);

        $prey->addLink($importNode);
    }
}