Esempio n. 1
0
 /**
  * Sets the resulting web of the crawl to this report
  * @param zibo\library\spider\Web $web The resulting web of the crawl
  * @return null
  */
 public function setWeb(Web $web)
 {
     // start from an empty report on every call
     $this->nodes = array();

     foreach ($web->getNodes() as $candidate) {
         // only mailto nodes which are not flagged as ignored are reported
         $isReportable = $candidate->hasType(WebNode::TYPE_MAILTO) && !$candidate->hasType(WebNode::TYPE_IGNORED);
         if ($isReportable) {
             $this->nodes[$candidate->getUrl()] = $candidate;
         }
     }

     // the report is ordered by URL
     ksort($this->nodes);
 }
Esempio n. 2
0
 /**
  * Sets the resulting web of the crawl to this report
  * @param zibo\library\spider\Web $web The resulting web of the crawl
  * @return null
  */
 public function setWeb(Web $web)
 {
     // start from an empty report on every call
     $this->nodes = array();

     foreach ($web->getNodes() as $webNode) {
         $reply = $webNode->getResponse();

         // a node is left out of this report when it has no error and is
         // either ignored, a mailto link, or answered with a redirect or a 200
         $isLeftOut = false;
         if (!$webNode->getError()) {
             if ($webNode->hasType(WebNode::TYPE_IGNORED) || $webNode->hasType(WebNode::TYPE_MAILTO)) {
                 $isLeftOut = true;
             } elseif ($reply) {
                 $isLeftOut = $reply->isRedirect() || $reply->getResponseCode() == 200;
             }
         }

         if (!$isLeftOut) {
             $this->nodes[$webNode->getUrl()] = $webNode;
         }
     }

     // the report is ordered by URL
     ksort($this->nodes);
 }
Esempio n. 3
0
 /**
  * Sets the resulting web of the crawl to this report
  * @param zibo\library\spider\Web $web The resulting web of the crawl
  * @return null
  */
 public function setWeb(Web $web)
 {
     // start from an empty report on every call
     $this->nodes = array();

     foreach ($web->getNodes() as $webNode) {
         if ($webNode->hasType(WebNode::TYPE_IGNORED)) {
             continue;
         }

         // only nodes which answered with a 200 are candidates
         $reply = $webNode->getResponse();
         if (!$reply || $reply->getResponseCode() != 200) {
             continue;
         }

         // ... and they must not carry an error
         if ($webNode->getError()) {
             continue;
         }

         $this->nodes[$webNode->getUrl()] = $webNode;
     }

     // the report is ordered by URL
     ksort($this->nodes);
 }
 /**
  * Adds all the images from the page to the web
  * @param zibo\library\spider\Web $web The spider web
  * @param zibo\library\spider\WebNode $prey The current prey in the web
  * @param string $baseUrl Base URL of the crawl
  * @param string $preyBaseUrl Base URL of the prey
  * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
  * @return null
  */
 protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom)
 {
     foreach ($dom->getElementsByTagName('img') as $imageElement) {
         $source = $imageElement->getAttribute('src');

         // images without a source have nothing to link to
         if ($source) {
             $absoluteUrl = $this->getAbsoluteUrl($source, $baseUrl, $preyBaseUrl);

             // register the image in the web and connect it with the prey in both directions
             $imageNode = $web->getNode($absoluteUrl);
             $imageNode->addType(WebNode::TYPE_IMAGE);
             $imageNode->addReference($prey);
             $prey->addLink($imageNode);
         }
     }
 }
Esempio n. 5
0
 /**
  * Adds all the used javascripts to the web
  * @param zibo\library\spider\Web $web The spider web
  * @param zibo\library\spider\WebNode $prey The current prey in the web
  * @param string $baseUrl Base URL of the crawl
  * @param string $preyBaseUrl Base URL of the prey
  * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
  * @return null
  */
 protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom)
 {
     // Type values which identify a JavaScript resource. The empty string
     // covers <script src="..."> without a type attribute, which HTML5
     // treats as JavaScript; "module" marks an ES module script.
     $javascriptTypes = array('', 'text/javascript', 'application/javascript', 'module');

     $scripts = $dom->getElementsByTagName('script');
     foreach ($scripts as $script) {
         // MIME types are case-insensitive, normalize the value before comparing
         $type = strtolower(trim((string) $script->getAttribute('type')));
         $url = $script->getAttribute('src');

         // skip inline scripts (no src) and script elements of another type
         if (!$url || !in_array($type, $javascriptTypes, true)) {
             continue;
         }

         $url = $this->getAbsoluteUrl($url, $baseUrl, $preyBaseUrl);

         // register the script in the web and connect it with the prey in both directions
         $link = $web->getNode($url);
         $link->addType(WebNode::TYPE_JS);
         $link->addReference($prey);
         $prey->addLink($link);
     }
 }
 /**
  * Adds the URL's from the anchors in the page to the web
  * @param zibo\library\spider\Web $web The spider web
  * @param zibo\library\spider\WebNode $prey The current prey in the web
  * @param string $baseUrl Base URL of the crawl
  * @param string $preyBaseUrl Base URL of the prey
  * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
  * @return null
  */
 protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom)
 {
     foreach ($dom->getElementsByTagName('a') as $anchorElement) {
         $href = $anchorElement->getAttribute('href');

         // anchors without a target are skipped
         if (!$href) {
             continue;
         }

         // anchors which start with a fragment marker are skipped as well
         if (String::startsWith($href, '#')) {
             continue;
         }

         // mailto links are kept as they are, everything else is made absolute
         $target = String::startsWith($href, 'mailto:')
             ? $href
             : $this->getAbsoluteUrl($href, $baseUrl, $preyBaseUrl);

         // connect the target with the prey in both directions
         $linkedNode = $web->getNode($target);
         $linkedNode->addReference($prey);
         $prey->addLink($linkedNode);
     }
 }
Esempio n. 7
0
 /**
  * Adds the used style sheets to the web
  * @param zibo\library\spider\Web $web The spider web
  * @param zibo\library\spider\WebNode $prey The current prey in the web
  * @param string $baseUrl Base URL of the crawl
  * @param string $preyBaseUrl Base URL of the prey
  * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
  * @return null
  */
 protected function biteDocument(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom)
 {
     $elements = $dom->getElementsByTagName('link');
     foreach ($elements as $element) {
         // MIME types are case-insensitive, normalize the value before comparing
         $type = strtolower(trim((string) $element->getAttribute('type')));

         // rel holds a whitespace separated list of case-insensitive tokens,
         // e.g. "alternate stylesheet"
         $relations = preg_split('/\s+/', strtolower((string) $element->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);

         $url = $element->getAttribute('href');

         // the type attribute is optional in HTML5: a stylesheet link without
         // a type is treated as CSS
         $isCssType = $type === '' || $type === 'text/css';

         if (!$isCssType || !in_array('stylesheet', $relations, true) || !$url) {
             continue;
         }

         $url = $this->getAbsoluteUrl($url, $baseUrl, $preyBaseUrl);

         // register the style sheet in the web and connect it with the prey in
         // both directions; a separate variable keeps the DOM element and the
         // web node apart
         $styleNode = $web->getNode($url);
         $styleNode->addType(WebNode::TYPE_CSS);
         $styleNode->addReference($prey);
         $prey->addLink($styleNode);
     }
 }
 /**
  * Adds the imported style sheets to the web
  * @param zibo\library\spider\Web $web The spider web
  * @param zibo\library\spider\WebNode $prey The current prey in the web
  * @param string $baseUrl Base URL of the crawl
  * @param string $preyBaseUrl Base URL of the prey
  * @param zibo\library\xml\dom\Document $dom The DOM document of the current prey
  * @return null
  */
 public function bite(Web $web, WebNode $prey, $baseUrl, $preyBaseUrl, Document $dom = null)
 {
     // only style sheets are inspected, the response of any other prey is not even requested
     $reply = $prey->hasType(WebNode::TYPE_CSS) ? $prey->getResponse() : null;
     if (!$reply || $reply->getResponseCode() != 200) {
         return;
     }

     $styleSource = $reply->getContent();
     if ($styleSource) {
         $importUrls = $this->getImportUrlsFromStyle($styleSource, $baseUrl, $preyBaseUrl);

         foreach ($importUrls as $importUrl) {
             // register the imported style sheet and connect it with the prey in both directions
             $importedNode = $web->getNode($importUrl);
             $importedNode->addType(WebNode::TYPE_CSS);
             $importedNode->addReference($prey);
             $prey->addLink($importedNode);
         }
     }
 }