/**
 * Crawls pages from the queue until it is empty or the page limit is reached.
 *
 * Each URL popped from the queue is requested through $this->client and the
 * response is stored in $this->currentResponse. Responses whose Content-type
 * does not mention "text" are skipped entirely (no link extraction, no tasks).
 * For text responses every <a href> is normalised and pushed back onto the
 * queue, then every registered task is run against the response. Once the
 * loop ends, shutdownTasks() is called.
 *
 * Link normalisation rules:
 *  - empty hrefs, same-page fragments ("#top") and unparsable hrefs are ignored;
 *  - links with a scheme other than http/https (mailto:, javascript:, ...) are ignored;
 *  - when stayOnDomain is set, links naming a different host are ignored;
 *  - protocol-relative links ("//host/path") get an "http:" prefix;
 *  - links without a host are resolved against the site root of $this->domain.
 *
 * @return void
 */
public function run()
{
    if ($this->debugMode) {
        echo "Restricting crawl to {$this->domain}\n";
    }

    // Loop across available items in the queue of pages to crawl.
    while (!$this->queue->isEmpty()) {
        if (isset($this->limit) && $this->counter >= $this->limit) {
            break;
        }

        $this->counter++;

        // Get a new url to crawl.
        $url = $this->queue->pop();

        if ($this->debugMode) {
            echo "Queue Length: " . $this->queue->queueLength() . "\n";
            echo "Crawling " . $url . "\n";
        }

        // Point the http client at the url and make the request.
        $this->client->setUri($url);
        $this->currentResponse = $this->client->request();

        // Don't bother trying to parse this if it's not text.
        // NOTE(review): getHeader() is assumed to return null for a missing
        // header and an array for a repeated one (Zend_Http_Response style),
        // so the value is flattened to a string before searching it.
        $contentType = $this->currentResponse->getHeader('Content-type');
        if (is_array($contentType)) {
            $contentType = implode(', ', $contentType);
        }
        if (stripos((string) $contentType, 'text') === false) {
            continue;
        }

        // Search for <a> tags in the document.
        $body = $this->currentResponse->getBody();
        $linksQuery = new Zend_Dom_Query($body);
        $links = $linksQuery->query('a');

        if ($this->debugMode) {
            echo "\tFound " . count($links) . " links...\n";
        }

        foreach ($links as $link) {
            $href = trim($link->getAttribute('href'));

            // Anchors without a target and same-page fragments lead nowhere new.
            if ($href === '' || $href[0] === '#') {
                continue;
            }

            $urlparts = parse_url($href);
            if ($urlparts === false) {
                continue;
            }

            // Only http(s) links can be fetched; skip mailto:, javascript:, tel:, ...
            if (isset($urlparts['scheme'])
                && !in_array(strtolower($urlparts['scheme']), array('http', 'https'), true)
            ) {
                continue;
            }

            // Find out if the link points away from the current host.
            if ($this->stayOnDomain && isset($urlparts['host']) && $urlparts['host'] != $this->domain) {
                continue;
            }

            if (!isset($urlparts['host'])) {
                // Still naive: relative paths are resolved against the site
                // root rather than the directory of the page being crawled,
                // but the separator guarantees a well-formed URL.
                $href = 'http://' . $this->domain . '/' . ltrim($href, '/');
            } elseif (strpos($href, '//') === 0) {
                // Protocol-relative link: supply the scheme the crawler uses.
                $href = 'http:' . $href;
            }

            // Push this link into the queue to be crawled.
            $this->queue->push($href);
        }

        // For each page that we see, run every registered task across it.
        foreach ($this->tasks as $task) {
            $task->task($this->currentResponse, $this->client);
        }
    }

    // After we're done with everything, call the shutdown hook on all the tasks.
    $this->shutdownTasks();
}
/**
 * Sends a request and returns a response.
 *
 * The shared HTTP client is configured from the request (URI, GET
 * parameters, method) with an "Accept: application/json" header. The raw
 * client response is kept in $this->response and then copied into a new
 * CartRecover_Response (raw string, JSON-decoded body, headers, status).
 *
 * Only the media type of the Content-Type header is checked, so values such
 * as "application/json; charset=utf-8" or "Application/JSON" are accepted.
 *
 * @param CartRecover_Request $request
 * @return CartRecover_Response
 * @throws CartRecover_Exception_UnexpectedValueException if the response is not JSON
 */
public function sendRequest(CartRecover_Request $request)
{
    $this->client->setUri($request->getUri());
    $this->client->setParameterGet($request->getParams());
    $this->client->setMethod($request->getMethod());
    $this->client->setHeaders('Accept', 'application/json');

    $this->response = $this->client->request();

    // NOTE(review): getHeader() is assumed to return null when the header is
    // missing and an array when it is repeated; use the first value then.
    $contentType = $this->response->getHeader('Content-Type');
    if (is_array($contentType)) {
        $contentType = reset($contentType);
    }

    // Strip parameters (";charset=...") and compare case-insensitively.
    $segments = explode(';', (string) $contentType, 2);
    $mediaType = strtolower(trim($segments[0]));
    if ($mediaType !== 'application/json') {
        throw new CartRecover_Exception_UnexpectedValueException("Unknown response format.");
    }

    // Decode to associative arrays; a body that fails to decode yields null.
    $body = json_decode($this->response->getBody(), true);

    $response = new CartRecover_Response();
    $response->setRawResponse($this->response->asString());
    $response->setBody($body);
    $response->setHeaders($this->response->getHeaders());
    $response->setStatus($this->response->getMessage(), $this->response->getStatus());

    return $response;
}
/**
 * Converts a Zend HTTP response into a Response object, extracting cookies.
 *
 * Every Set-Cookie header becomes one entry in the cookie map, keyed by the
 * cookie name. Each entry holds the cookie's 'value' plus one element per
 * attribute (e.g. 'path', 'expires'). Attributes without a value, such as
 * "HttpOnly" or "Secure", are stored with a null value.
 *
 * @param \Zend_Http_Response $response
 * @return Response built from the body, status, headers and parsed cookies
 */
protected function createResponse(\Zend_Http_Response $response)
{
    $cookies = array();

    // getHeader() may give a single string, a list of strings (several
    // Set-Cookie headers) or null; the cast turns all three into a flat list.
    $setCookieHeaders = (array) $response->getHeader('Set-Cookie');

    foreach ($setCookieHeaders as $header) {
        if (!is_string($header) || !trim($header)) {
            continue;
        }

        $parts = explode(';', $header);

        // Split on the first '=' only: cookie values may contain '=' themselves
        // (base64 padding, nested key=value payloads).
        $pair = explode('=', trim(array_shift($parts)), 2);
        $name = $pair[0];
        $cookies[$name] = array('value' => isset($pair[1]) ? $pair[1] : null);

        foreach ($parts as $part) {
            $part = trim($part);
            if ($part === '') {
                // Produced by a trailing ';' — nothing to record.
                continue;
            }

            $attribute = explode('=', $part, 2);
            $cookies[$name][$attribute[0]] = isset($attribute[1]) ? $attribute[1] : null;
        }
    }

    return new Response($response->getBody(), $response->getStatus(), $response->getHeaders(), $cookies);
}
/**
 * Headers handed to the constructor as a plain list of "name:value" strings,
 * with no whitespace following the ':' separator, must still be parsed. The
 * second header also checks that a value may itself contain ':' characters.
 *
 * @link http://framework.zend.com/issues/browse/ZF-10277
 * @group ZF-10277
 */
public function testConstructorWithHeadersIndexedArrayNoWhitespace()
{
    $rawHeaders = array(
        'content-type:text/plain',
        'x-foo:bar:baz',
    );

    $response = new Zend_Http_Response(200, $rawHeaders);

    $contentType = $response->getHeader('content-type');
    $this->assertEquals('text/plain', $contentType);

    $customHeader = $response->getHeader('x-foo');
    $this->assertEquals('bar:baz', $customHeader);
}
/**
 * Extracts link-preview data from an HTML response and assigns it to the view.
 *
 * View properties written: charset, title, description, thumb, medium,
 * images (absolute image URLs, de-duplicated, at most 30) and imageCount.
 * When the DOM extension is unavailable only charset is detected; the other
 * values are null and the image list is empty.
 *
 * Image URLs are made absolute against the document's <base href> when that
 * href is itself absolute, otherwise against the requested $uri.
 *
 * @param string             $uri      URI the response was fetched from
 * @param Zend_Http_Response $response
 * @return void
 */
protected function _previewHtml($uri, Zend_Http_Response $response)
{
    $body = trim($response->getBody());

    // Charset: prefer the Content-Type header, then the first "charset="
    // found anywhere in the body, and fall back to UTF-8.
    // NOTE(review): getHeader() is assumed to return null for a missing
    // header and an array for a repeated one; flatten before matching.
    $contentType = $response->getHeader('content-type');
    if (is_array($contentType)) {
        $contentType = implode(', ', $contentType);
    }
    $charsetPattern = '/charset=([a-zA-Z0-9-_]+)/i';
    if (preg_match($charsetPattern, (string) $contentType, $matches)
        || preg_match($charsetPattern, $response->getBody(), $matches)
    ) {
        $charset = trim($matches[1]);
    } else {
        $charset = 'UTF-8';
    }
    $this->view->charset = $charset;

    // The body is handed to the DOM parser as-is (no transcoding).
    // Without ext/dom there is nothing to query. Maybe add b/c later.
    $dom = class_exists('DOMDocument') ? new Zend_Dom_Query($body) : null;

    $title = null;
    $description = null;
    $thumb = null;
    $medium = null;

    if ($dom) {
        $titleList = $dom->query('title');
        if (count($titleList) > 0) {
            $title = substr(trim($titleList->current()->textContent), 0, 255);
        }

        // Some sites capitalise the meta name, so try both spellings.
        $descriptionList = $dom->queryXpath("//meta[@name='description']");
        if (count($descriptionList) == 0) {
            $descriptionList = $dom->queryXpath("//meta[@name='Description']");
        }
        if (count($descriptionList) > 0) {
            $description = substr(trim($descriptionList->current()->getAttribute('content')), 0, 255);
        }

        $thumbList = $dom->queryXpath("//link[@rel='image_src']");
        if (count($thumbList) > 0) {
            $thumb = $thumbList->current()->getAttribute('href');
        }

        $mediumList = $dom->queryXpath("//meta[@name='medium']");
        if (count($mediumList) > 0) {
            $medium = $mediumList->current()->getAttribute('content');
        }
    }

    $this->view->title = $title;
    $this->view->description = $description;
    $this->view->thumb = $thumb;
    $this->view->medium = $medium;

    // Work out the URL pieces used to absolutise relative image paths.
    $uriInfo = parse_url($uri);
    $baseInfo = is_array($uriInfo) ? $uriInfo : array();

    if ($dom) {
        $baseList = $dom->query('base');
        if ($baseList && count($baseList) > 0) {
            $baseHref = $baseList->current()->getAttribute('href');
            $baseHrefInfo = $baseHref ? parse_url($baseHref) : false;

            // Only an absolute <base href> can replace the request URI; a
            // relative one carries no scheme/host to build URLs from.
            if (is_array($baseHrefInfo)
                && !empty($baseHrefInfo['scheme'])
                && !empty($baseHrefInfo['host'])
            ) {
                $baseInfo = $baseHrefInfo;
            }
        }
    }

    $scheme = !empty($baseInfo['scheme']) ? $baseInfo['scheme'] : 'http';
    $host = isset($baseInfo['host']) ? $baseInfo['host'] : '';
    if (isset($baseInfo['port'])) {
        $host .= ':' . $baseInfo['port'];
    }

    // $baseHostUrl resolves root-relative paths ("/img/a.png"); $baseUrl is
    // the directory of the base document and resolves everything else.
    $baseHostUrl = $scheme . '://' . $host . '/';
    $baseUrl = $baseHostUrl;
    if (!empty($baseInfo['path'])) {
        $segments = explode('/', $baseInfo['path']);
        array_pop($segments); // drop the document name, keep its directory
        $directory = trim(join('/', $segments), '/');
        if ($directory !== '') {
            $baseUrl = $baseHostUrl . $directory . '/';
        }
    }

    $images = array();
    if ($thumb) {
        $images[] = $thumb;
    }

    if ($dom) {
        foreach ($dom->query('img') as $image) {
            $src = $image->getAttribute('src');

            // Ignore images without a usable src. '@' is kept because older
            // PHP versions emit a warning on malformed URLs.
            if (!$src || false === ($srcInfo = @parse_url($src))) {
                continue;
            }

            if (strpos($src, '//') === 0) {
                // Protocol-relative: borrow the scheme of the base document.
                $src = $scheme . ':' . $src;
            } elseif (strpos($src, '/') === 0) {
                // Relative to the site root.
                $src = $baseHostUrl . ltrim($src, '/');
            } elseif (strpos($src, './') === 0) {
                // Explicitly relative to the current directory.
                $src = $baseUrl . substr($src, 2);
            } elseif (!empty($srcInfo['scheme']) && !empty($srcInfo['host'])) {
                // Already absolute, leave untouched.
            } elseif (empty($srcInfo['scheme']) && !empty($srcInfo['host'])) {
                // Host without a scheme: add the scheme.
                $src = $scheme . '://' . ltrim($src, '/');
            } else {
                // Anything else is treated as relative to the base directory.
                $src = $baseUrl . ltrim($src, '/');
            }

            // Filtering by same-domain or by image file extension was
            // considered upstream but is intentionally disabled.
            $images[] = $src;
        }
    }

    // De-duplicate and keep at most 30 images.
    $images = array_slice(array_values(array_unique($images)), 0, 30);

    $this->view->imageCount = count($images);
    $this->view->images = $images;
}
/**
 * Tries to find the document's language, looking first at the
 * Content-Language HTTP header, then at the lang (or xml:lang) attribute of
 * the <html> tag, and last at a content-language <meta http-equiv> tag.
 *
 * The header value is returned unchanged. A value taken from the <html> tag
 * has "_" and "-" removed; a value taken from the meta tag has only "_"
 * removed (for the Lucene index, where "_" causes tokenization).
 * NOTE(review): the two normalisations differ ("en-US" becomes "enUS" from
 * <html> but stays "en-US" from <meta>); kept as-is because consumers of the
 * value are not visible here — confirm whether this is intended.
 *
 * @param Zend_Http_Response $response
 * @return string|null the header value, or the language found in the markup
 */
protected function getLanguageFromResponse($response)
{
    $l = $response->getHeader("Content-Language");

    if (empty($l)) {
        // Try the html lang attribute. [^>]* keeps the search inside the
        // <html ...> tag itself so a lang attribute on a later element is
        // never picked up; the value stops at a quote, whitespace or '>'.
        $htmlPattern = '@<html\b[^>]*?[\s:]lang\s*=\s*["\']?([^"\'\s>]+)@i';
        if (preg_match($htmlPattern, $response->getBody(), $match)) {
            $l = str_replace(array("_", "-"), "", $match[1]);
        }
    }

    if (empty($l)) {
        // Try the meta tag; tolerate either quote style, extra whitespace and
        // both the HTML ("...>") and XHTML ("... />") closing forms.
        $metaPattern = '@<meta\s+http-equiv\s*=\s*["\']content-language["\']'
            . '\s+content\s*=\s*["\']([^"\'\s>]+)["\'][^>]*>@i';
        if (preg_match($metaPattern, $response->getBody(), $match)) {
            // For the Lucene index remove "_" - it causes tokenization.
            $l = str_replace("_", "", $match[1]);
        }
    }

    return $l;
}
/**
 * Maps the JSON body of a REST response onto model objects.
 *
 * The model class is 'OpenSocial_Model_' followed by the model type that
 * _getModelTypeForService() reports for the service; the body is then passed
 * to an OpenSocial_Rest_Mapper_Json built for that class.
 *
 * @throws OpenSocial_Rest_Exception when the Content-Type header does not
 *                                   start with "application/json", or when
 *                                   the model class cannot be found
 * @param $serviceType
 * @param Zend_Http_Response $response
 * @return array
 */
protected function _mapResponseToModels($serviceType, Zend_Http_Response $response)
{
    $contentType = $response->getHeader('Content-Type');
    $startsWithJson = (strpos($contentType, 'application/json') === 0);

    if (!$startsWithJson) {
        // Dump as much context as possible to make the failure diagnosable.
        $responseDump = var_export($response, true);
        $bodyDump = var_export($response->getBody(), true);
        $lastRequest = $this->_httpClient->getLastRequest();

        $message = "Unknown Content-Type for response:<br /> " . $responseDump
            . ' with body: ' . $bodyDump
            . ' for request: ' . $lastRequest;

        throw new OpenSocial_Rest_Exception($message);
    }

    $modelType = $this->_getModelTypeForService($serviceType);
    $modelClass = 'OpenSocial_Model_' . $modelType;

    // Let the autoloader try to load the class before giving up.
    $modelClassAvailable = class_exists($modelClass, true);
    if (!$modelClassAvailable) {
        throw new OpenSocial_Rest_Exception("Model class {$modelClass} not found for service {$serviceType}!");
    }

    /**
     * @var OpenSocial_Rest_Mapper_Interface $mapper
     */
    $mapper = new OpenSocial_Rest_Mapper_Json($modelClass);
    $responseBody = $response->getBody();

    return $mapper->map($responseBody);
}
/**
 * Reads the Set-Cookie header(s) of an HTTP response and adds each cookie
 * to the cookie jar via addCookie().
 *
 * A header value that is neither a string nor an array (for instance when
 * the header is absent) is silently ignored.
 *
 * @param Sabel_Http_Response  $response
 * @param Zend_Uri_Http|string $ref_uri Requested URI
 * @throws Sabel_Exception_Runtime when $response is not a Sabel_Http_Response
 */
public function addCookiesFromResponse($response, $ref_uri)
{
    $isResponse = ($response instanceof Sabel_Http_Response);
    if ($isResponse === false) {
        throw new Sabel_Exception_Runtime(
            __METHOD__ . "() \$response is expected to be a Response object."
        );
    }

    $headerValue = $response->getHeader("Set-Cookie");

    // Treat a lone header like a one-element list so a single loop serves both.
    if (is_string($headerValue)) {
        $headerValue = array($headerValue);
    }

    if (!is_array($headerValue)) {
        return;
    }

    foreach ($headerValue as $rawCookie) {
        $this->addCookie($rawCookie, $ref_uri);
    }
}
/**
 * Adds every cookie announced by an HTTP response to the cookie jar.
 *
 * The Set-Cookie header may come back as a list (several cookies) or as a
 * single string (one cookie); each value is passed to addCookie() together
 * with the requested URI. Any other header value results in no action.
 *
 * @param Zend_Http_Response   $response
 * @param Zend_Uri_Http|string $ref_uri Requested URI
 */
public function addCookiesFromResponse($response, $ref_uri)
{
    $setCookie = $response->getHeader('Set-Cookie');

    switch (true) {
        case is_array($setCookie):
            foreach ($setCookie as $singleCookie) {
                $this->addCookie($singleCookie, $ref_uri);
            }
            break;

        case is_string($setCookie):
            $this->addCookie($setCookie, $ref_uri);
            break;
    }
}
/**
 * Extracts basic link-preview data from an HTML response.
 *
 * When mbstring is available the body is converted from the detected
 * charset to HTML entities before parsing. If that conversion fails (for
 * example because the page declares an encoding mbstring does not know) the
 * unconverted body is parsed instead.
 *
 * @param string             $uri      URI the response was fetched from (not used here)
 * @param Zend_Http_Response $response
 * @return array with the keys 'title', 'description' and 'thumb'; each value
 *               is a string, or null when not found or when ext/dom is missing
 */
protected function _previewHtml($uri, Zend_Http_Response $response)
{
    $arr_return = array();

    $body = trim($response->getBody());

    // Charset: prefer the Content-Type header, then the first "charset="
    // found anywhere in the body, and fall back to UTF-8.
    // NOTE(review): getHeader() is assumed to return null for a missing
    // header and an array for a repeated one; flatten before matching.
    $contentType = $response->getHeader('content-type');
    if (is_array($contentType)) {
        $contentType = implode(', ', $contentType);
    }
    $charsetPattern = '/charset=([a-zA-Z0-9-_]+)/i';
    if (preg_match($charsetPattern, (string) $contentType, $matches)
        || preg_match($charsetPattern, $response->getBody(), $matches)
    ) {
        $charset = trim($matches[1]);
    } else {
        $charset = 'UTF-8';
    }

    if (function_exists('mb_convert_encoding')) {
        // $charset comes from the remote page and may name an encoding that
        // mbstring rejects: PHP 8 throws ValueError, older versions warn and
        // return false. Neither may abort the preview or wipe out the body.
        $converted = false;
        try {
            $converted = @mb_convert_encoding($body, 'HTML-ENTITIES', $charset);
        } catch (\ValueError $e) {
            $converted = false;
        }
        if (is_string($converted)) {
            $body = $converted;
        }
    }

    // Without ext/dom there is nothing to query. Maybe add b/c later.
    $dom = class_exists('DOMDocument') ? new Zend_Dom_Query($body) : null;

    $title = null;
    $description = null;
    $thumb = null;

    if ($dom) {
        $titleList = $dom->query('title');
        if (count($titleList) > 0) {
            $title = substr(trim($titleList->current()->textContent), 0, 255);
        }

        // Some sites capitalise the meta name, so try both spellings.
        $descriptionList = $dom->queryXpath("//meta[@name='description']");
        if (count($descriptionList) == 0) {
            $descriptionList = $dom->queryXpath("//meta[@name='Description']");
        }
        if (count($descriptionList) > 0) {
            $description = substr(trim($descriptionList->current()->getAttribute('content')), 0, 255);
        }

        // Prefer the Open Graph image, fall back to <link rel="image_src">.
        $ogImageList = $dom->queryXpath("//meta[@property='og:image']");
        if (count($ogImageList) > 0) {
            $thumb = $ogImageList->current()->getAttribute('content');
        }
        if (!$thumb) {
            $thumbList = $dom->queryXpath("//link[@rel='image_src']");
            if (count($thumbList) > 0) {
                $thumb = $thumbList->current()->getAttribute('href');
            }
        }
    }

    $arr_return['title'] = $title;
    $arr_return['description'] = $description;
    $arr_return['thumb'] = $thumb;

    return $arr_return;
}