Beispiel #1
0
 /**
  * Extracts the entry links from a feed document.
  *
  * Each <link> of each <entry> is resolved against the effective URL of the
  * response. A link is returned when its "rel" attribute is "alternate" or
  * is missing altogether (RFC 4287 defines a missing rel as "alternate").
  * The entry title is used as the link title.
  *
  * @param \HTTP_Request2_Response $res HTTP response whose body is the feed XML
  *
  * @return array List of LinkInfo objects; empty when the body cannot be
  *               parsed as XML or no matching link is found
  */
 public function extract(\HTTP_Request2_Response $res)
 {
     $url  = $res->getEffectiveUrl();
     $base = new \Net_URL2($url);

     $sx = simplexml_load_string($res->getBody());
     if ($sx === false) {
         //body is not well-formed XML, so there is nothing to extract
         return array();
     }

     $linkInfos   = array();
     $alreadySeen = array();
     foreach ($sx->entry as $entry) {
         $linkTitle = (string) $entry->title;
         foreach ($entry->link as $xlink) {
             $linkUrl = (string) $base->resolve((string) $xlink['href']);
             if (isset($alreadySeen[$linkUrl])) {
                 continue;
             }

             //a link without rel has to be treated like rel="alternate"
             $rel = (string) $xlink['rel'];
             if ($rel === '' || $rel === 'alternate') {
                 $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
             }

             //every resolved URL is remembered, even when it was not
             // returned, so the same URL is never looked at twice
             $alreadySeen[$linkUrl] = true;
         }
     }
     return $linkInfos;
 }
Beispiel #2
0
 /**
  * Stores every cookie carried by an HTTP response in the jar
  *
  * @param HTTP_Request2_Response $response HTTP response message
  * @param Net_URL2               $setter   original request URL, needed for
  *                               setting default domain/path. When omitted,
  *                               the effective URL of the response is used.
  *
  * @return bool true when every store() call succeeded (also when the
  *              response has no cookies), false otherwise
  * @throws HTTP_Request2_LogicException when $setter is omitted and the
  *                                      response has no effective URL
  */
 public function addCookiesFromResponse(HTTP_Request2_Response $response, Net_URL2 $setter = null)
 {
     if (null === $setter) {
         $responseUrl = $response->getEffectiveUrl();
         if (!$responseUrl) {
             throw new HTTP_Request2_LogicException('Response URL required for adding cookies from response', HTTP_Request2_Exception::MISSING_VALUE);
         }
         $setter = new Net_URL2($responseUrl);
     }

     $failures = 0;
     foreach ($response->getCookies() as $cookie) {
         // store() is attempted for every cookie, even after an earlier failure
         if (!$this->store($cookie, $setter)) {
             $failures++;
         }
     }

     return 0 === $failures;
 }
Beispiel #3
0
 /**
  * Extracts the followable links (<a href>) from an HTML response.
  *
  * Relative links are resolved against the effective URL of the response,
  * or against <base href> when the document declares one. Fragments are
  * stripped from the resulting URLs. Nothing is returned when a
  * <meta name="robots"> tag contains "nofollow"; single links are skipped
  * when their rel attribute contains "nofollow", when they only point to
  * an anchor on the page itself, when they are not http/https, or when
  * the same URL has been seen before.
  *
  * @param \HTTP_Request2_Response $res HTTP response whose body is HTML
  *
  * @return array List of LinkInfo objects, possibly empty
  */
 public function extract(\HTTP_Request2_Response $res)
 {
     $url = Helper::removeAnchor($res->getEffectiveUrl());

     $body = (string) $res->getBody();
     if ($body === '') {
         //DOMDocument::loadHTML() does not accept empty input
         // (ValueError on PHP 8, which "@" cannot silence)
         return array();
     }

     //FIXME: mime type switch for cdata
     $doc = new \DOMDocument();
     //@ to hide parse warning messages in invalid html
     @$doc->loadHTML($body);

     $dx   = new \DOMXPath($doc);
     $base = new \Net_URL2($url);

     //a <base href> overrides the document URL for resolving links
     $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
     if ($xbase) {
         $base = $base->resolve($xbase->getAttribute('href'));
     }

     //robots meta content is a comma separated list of directives
     $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')->item(0);
     if ($meta) {
         foreach (explode(',', $meta->getAttribute('content')) as $value) {
             if (strtolower(trim($value)) === 'nofollow') {
                 //we shall not follow the links
                 return array();
             }
         }
     }

     //FIXME: link rel, img, video
     $linkInfos   = array();
     $alreadySeen = array($url => true);
     foreach ($dx->evaluate('//a') as $link) {
         //rel is a whitespace separated token list in HTML; commas are
         // still accepted as separators to stay compatible with the
         // previous behavior
         $relTokens = preg_split(
             '/[\s,]+/', strtolower($link->getAttribute('rel')),
             -1, PREG_SPLIT_NO_EMPTY
         );
         if (in_array('nofollow', $relTokens, true)) {
             //we shall not follow this link
             continue;
         }

         //getAttribute() returns an empty string when href is missing
         $href = trim($link->getAttribute('href'));
         if ($href === '' || $href[0] === '#') {
             //link on this page
             continue;
         }

         $linkUrlObj = $base->resolve($href);
         $linkUrlObj->setFragment(false);
         $linkUrl = (string) $linkUrlObj;
         if (isset($alreadySeen[$linkUrl])) {
             continue;
         }

         //only web links are of interest (no mailto:, javascript:, ...)
         $scheme = $linkUrlObj->getScheme();
         if ($scheme !== 'http' && $scheme !== 'https') {
             continue;
         }

         //FIXME: check target type
         $linkTitle   = Helper::sanitizeTitle($link->textContent);
         $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
         $alreadySeen[$linkUrl] = true;
     }
     return $linkInfos;
 }