/**
 * Extracts the "alternate" links of every <entry> element found in the
 * XML body of the response (entry/link layout as used by Atom feeds).
 *
 * Relative hrefs are resolved against the effective URL of the response.
 * Each resolved URL is considered only once; the title of the entry the
 * link belongs to is used as link title.
 *
 * @param \HTTP_Request2_Response $res HTTP response carrying the XML body
 *
 * @return array List of LinkInfo objects; empty when the body is not
 *               well-formed XML or contains no matching links
 */
public function extract(\HTTP_Request2_Response $res)
{
    $url  = $res->getEffectiveUrl();
    $base = new \Net_URL2($url);

    $linkInfos = array();

    $sx = simplexml_load_string($res->getBody());
    if ($sx === false) {
        //body could not be parsed as XML, so there is nothing to extract
        return $linkInfos;
    }

    $alreadySeen = array();
    foreach ($sx->entry as $entry) {
        $linkTitle = (string) $entry->title;

        foreach ($entry->link as $xlink) {
            $linkUrl = (string) $base->resolve((string) $xlink['href']);
            if (isset($alreadySeen[$linkUrl])) {
                continue;
            }

            //RFC 4287, 4.2.7.2: a link without "rel" attribute has to be
            // interpreted as rel="alternate"
            $rel = (string) $xlink['rel'];
            if ($rel === '' || $rel === 'alternate') {
                $linkInfos[] = new LinkInfo($linkUrl, $linkTitle, $url);
            }

            //every URL is looked at only once, regardless of its relation
            $alreadySeen[$linkUrl] = true;
        }
    }

    return $linkInfos;
}
/**
 * Stores every cookie carried by an HTTP response in this jar
 *
 * @param HTTP_Request2_Response $response HTTP response to read the
 *                                         cookies from
 * @param Net_URL2               $setter   URL of the original request; it
 *                                         is handed to store() together
 *                                         with each cookie. When omitted,
 *                                         the effective URL of the response
 *                                         is used instead.
 *
 * @return bool true only if store() succeeded for every cookie
 * @throws HTTP_Request2_LogicException when no setter URL is given and the
 *                                      response has no effective URL
 */
public function addCookiesFromResponse(HTTP_Request2_Response $response, Net_URL2 $setter = null)
{
    if (null === $setter) {
        $effectiveUrl = $response->getEffectiveUrl();
        if (!$effectiveUrl) {
            throw new HTTP_Request2_LogicException(
                'Response URL required for adding cookies from response',
                HTTP_Request2_Exception::MISSING_VALUE
            );
        }
        $setter = new Net_URL2($effectiveUrl);
    }

    $allStored = true;
    foreach ($response->getCookies() as $cookie) {
        // store() is attempted for every cookie, even after a failure
        if (!$this->store($cookie, $setter)) {
            $allStored = false;
        }
    }

    return $allStored;
}
/**
 * Extracts the followable http/https links (<a> elements) from the HTML
 * body of the response.
 *
 * - Relative links are resolved against the effective URL of the response,
 *   or against <base href> when the document declares one.
 * - No links are returned when the first robots meta tag contains
 *   "nofollow".
 * - Links with a "nofollow" rel token, empty links and pure fragment links
 *   ("#...") are skipped.
 * - Fragments are removed from the link URLs; every URL is returned only
 *   once, and the URL of the page itself is never returned.
 *
 * @param \HTTP_Request2_Response $res HTTP response carrying the HTML body
 *
 * @return array List of LinkInfo objects
 */
public function extract(\HTTP_Request2_Response $res)
{
    $url = Helper::removeAnchor($res->getEffectiveUrl());

    $linkInfos = array();

    $body = (string) $res->getBody();
    if ($body === '') {
        //loadHTML() throws a ValueError for empty input on PHP 8,
        // which "@" is not able to silence
        return $linkInfos;
    }

    //FIXME: mime type switch for cdata
    $doc = new \DOMDocument();
    //@ to hide parse warning messages in invalid html
    @$doc->loadHTML($body);

    $dx = new \DOMXPath($doc);

    $base  = new \Net_URL2($url);
    $xbase = $dx->evaluate('/html/head/base[@href]')->item(0);
    if ($xbase) {
        $base = $base->resolve($xbase->getAttribute('href'));
    }

    //Checks a token list for "nofollow". HTML "rel" values are separated
    // by whitespace, robots directives by commas; both are accepted and
    // compared case-insensitively.
    $hasNoFollow = function ($value) {
        $tokens = preg_split(
            '/[\s,]+/', strtolower($value), -1, PREG_SPLIT_NO_EMPTY
        );
        return in_array('nofollow', $tokens, true);
    };

    $meta = $dx->evaluate('/html/head/meta[@name="robots" and @content]')
        ->item(0);
    if ($meta && $hasNoFollow($meta->getAttribute('content'))) {
        //we shall not follow the links
        return array();
    }

    //FIXME: link rel, img, video
    $alreadySeen = array($url => true);
    foreach ($dx->evaluate('//a') as $link) {
        if ($hasNoFollow($link->getAttribute('rel'))) {
            //we shall not follow this link
            continue;
        }

        //surrounding whitespace is not part of the URL
        $href = trim($link->getAttribute('href'));
        if ($href == '' || $href[0] == '#') {
            //link on this page
            continue;
        }

        $linkUrlObj = $base->resolve($href);
        $linkUrlObj->setFragment(false);
        $linkUrl = (string) $linkUrlObj;
        if (isset($alreadySeen[$linkUrl])) {
            continue;
        }

        $scheme = $linkUrlObj->getScheme();
        if ($scheme !== 'http' && $scheme !== 'https') {
            //mailto:, javascript:, ftp: and the like are not crawled
            continue;
        }

        //FIXME: check target type
        $linkInfos[] = new LinkInfo(
            $linkUrl, Helper::sanitizeTitle($link->textContent), $url
        );
        $alreadySeen[$linkUrl] = true;
    }

    return $linkInfos;
}