예제 #1
0
 /**
  * Runs the crawl: pops URLs off the queue, fetches each one, queues the
  * links found on text responses and passes every text response to the
  * registered tasks.
  *
  * The loop ends when the queue is empty or, when $this->limit is set, as
  * soon as $this->counter has reached it. Responses whose Content-type does
  * not contain "text" are skipped entirely (no link extraction, no tasks).
  * After the loop the shutdown hook is called once.
  *
  * @return void
  */
 public function run()
 {
     if ($this->debugMode) {
         echo "Restricting crawl to {$this->domain}\n";
     }
     //loop across available items in the queue of pages to crawl
     while (!$this->queue->isEmpty()) {
         if (isset($this->limit) && $this->counter >= $this->limit) {
             break;
         }
         $this->counter++;
         //get a new url to crawl
         $url = $this->queue->pop();
         if ($this->debugMode) {
             echo "Queue Length: " . $this->queue->queueLength() . "\n";
             echo "Crawling " . $url . "\n";
         }
         //set the url into the http client and make the request to the remote server
         $this->client->setUri($url);
         $this->currentResponse = $this->client->request();
         //don't bother trying to parse this if it's not text; the header value
         //may be missing or (for a repeated header) an array, so flatten it first
         $contentType = $this->currentResponse->getHeader('Content-type');
         if (is_array($contentType)) {
             $contentType = implode(', ', $contentType);
         }
         if (stripos((string) $contentType, 'text') === false) {
             continue;
         }
         //search for <a> tags in the document
         $body = $this->currentResponse->getBody();
         $linksQuery = new Zend_Dom_Query($body);
         $links = $linksQuery->query('a');
         if ($this->debugMode) {
             echo "\tFound " . count($links) . " links...\n";
         }
         foreach ($links as $link) {
             $href = trim($link->getAttribute('href'));
             //anchors without a target and in-page fragments lead nowhere new
             if ($href === '' || $href[0] === '#') {
                 continue;
             }
             $urlparts = parse_url($href);
             if ($urlparts === false) {
                 //seriously malformed url
                 continue;
             }
             $hasHost = isset($urlparts["host"]);
             if (isset($urlparts["scheme"])) {
                 //only absolute http(s) urls can be fetched; this drops
                 //mailto:, javascript:, tel:, ftp:// and similar links
                 $scheme = strtolower($urlparts["scheme"]);
                 if (!$hasHost || ($scheme !== 'http' && $scheme !== 'https')) {
                     continue;
                 }
             }
             //find out if it links to the current host
             if ($this->stayOnDomain && $hasHost && $urlparts["host"] != $this->domain) {
                 continue;
             }
             if (!$hasHost) {
                 //link without a domain or a scheme: attach it to the crawl domain.
                 //this is still naive - document-relative links are resolved against
                 //the site root, not against the directory of the current page
                 if ($href[0] !== '/') {
                     $href = '/' . $href;
                 }
                 $href = 'http://' . $this->domain . $href;
             } elseif (!isset($urlparts["scheme"])) {
                 //protocol-relative link (//host/path): give it a scheme
                 $href = 'http:' . $href;
             }
             //push this link into the queue to be crawled
             $this->queue->push($href);
         }
         //for each page that we see, run every registered task across it
         foreach ($this->tasks as $task) {
             $task->task($this->currentResponse, $this->client);
         }
     }
     //after we're done with everything, call the shutdown hook on all the tasks
     $this->shutdownTasks();
 }
예제 #2
0
 /**
  * Sends a request and returns a response
  *
  * The request's URI, GET parameters and method are copied onto the HTTP
  * client, and an "Accept: application/json" header is set. The raw client
  * response is kept in $this->response. The body is decoded with
  * json_decode() into an associative array; the decode result is not
  * validated here.
  *
  * @param CartRecover_Request $request
  * @return CartRecover_Response
  * @throws CartRecover_Exception_UnexpectedValueException when the response
  *         media type is not application/json
  */
 public function sendRequest(CartRecover_Request $request)
 {
     $this->client->setUri($request->getUri());
     $this->client->setParameterGet($request->getParams());
     $this->client->setMethod($request->getMethod());
     $this->client->setHeaders('Accept', 'application/json');
     $this->response = $this->client->request();

     // Compare the media type only: servers commonly append parameters
     // ("application/json; charset=utf-8") and media types are case-insensitive.
     $contentType = $this->response->getHeader('Content-Type');
     if (is_array($contentType)) {
         // repeated header - use the first value
         $contentType = reset($contentType);
     }
     $contentTypeParts = explode(';', (string) $contentType);
     $mediaType = strtolower(trim($contentTypeParts[0]));
     if ($mediaType !== 'application/json') {
         throw new CartRecover_Exception_UnexpectedValueException("Unknown response format.");
     }

     $body = json_decode($this->response->getBody(), true);
     $response = new CartRecover_Response();
     $response->setRawResponse($this->response->asString());
     $response->setBody($body);
     $response->setHeaders($this->response->getHeaders());
     $response->setStatus($this->response->getMessage(), $this->response->getStatus());
     return $response;
 }
예제 #3
0
파일: Client.php 프로젝트: andreia/Goutte
 /**
  * Converts a Zend HTTP response into a Response object, extracting the
  * cookies from its Set-Cookie header(s).
  *
  * Each cookie becomes an entry keyed by cookie name: an array holding
  * 'value' plus one key per attribute found after the first ';'.
  * Attributes without a value (e.g. Secure, HttpOnly) are stored as null.
  *
  * @param \Zend_Http_Response $response
  * @return Response
  */
 protected function createResponse(\Zend_Http_Response $response)
 {
     // getHeader() gives a string for a single Set-Cookie header and an array
     // when the header occurs several times; the cast covers both (and a
     // missing header, which becomes an empty list).
     $headers = (array) $response->getHeader('Set-Cookie');
     $cookies = array();
     foreach ($headers as $header) {
         if (!trim($header)) {
             continue;
         }
         $parts = explode(';', $header);
         // Split on the first '=' only: cookie values may contain '=' themselves
         $pair = array_pad(explode('=', trim(array_shift($parts)), 2), 2, null);
         $name = $pair[0];
         $cookies[$name] = array('value' => $pair[1]);
         foreach ($parts as $part) {
             $part = trim($part);
             if ($part === '') {
                 // trailing ';' or empty segment
                 continue;
             }
             // array_pad keeps valueless attributes from raising an undefined offset
             $attribute = array_pad(explode('=', $part, 2), 2, null);
             $cookies[$name][$attribute[0]] = $attribute[1];
         }
     }
     return new Response($response->getBody(), $response->getStatus(), $response->getHeaders(), $cookies);
 }
예제 #4
0
 /**
  * Headers handed to the constructor as an indexed array of "name:value"
  * strings, with no whitespace after the ':' sign, must be parsed properly.
  * Only the first ':' separates the name from the value.
  *
  * @link  http://framework.zend.com/issues/browse/ZF-10277
  * @group ZF-10277
  */
 public function testConstructorWithHeadersIndexedArrayNoWhitespace()
 {
     $rawHeaders = array('content-type:text/plain', 'x-foo:bar:baz');
     $response = new Zend_Http_Response(200, $rawHeaders);

     // header name => value expected after parsing
     $expectedHeaders = array(
         'content-type' => 'text/plain',
         'x-foo' => 'bar:baz',
     );
     foreach ($expectedHeaders as $headerName => $expectedValue) {
         $this->assertEquals($expectedValue, $response->getHeader($headerName));
     }
 }
예제 #5
0
 /**
  * Builds link-preview data for an HTML page and assigns it to the view.
  *
  * View variables set: charset, title, description, thumb, medium,
  * imageCount and images. images is a de-duplicated list of absolute image
  * URLs (the thumb first, when present), capped at 30 entries.
  *
  * When DOMDocument is not available nothing is parsed: title, description,
  * thumb and medium are null and images is empty.
  *
  * @param string             $uri      URI the response was fetched from; used to
  *                                      resolve relative image paths when the page
  *                                      has no usable <base href>
  * @param Zend_Http_Response $response Response holding the HTML document
  * @return void
  */
 protected function _previewHtml($uri, Zend_Http_Response $response)
 {
     $body = trim($response->getBody());

     // Charset: look in the Content-Type header first, then anywhere in the markup.
     // The header value may be missing or an array (repeated header).
     $contentType = $response->getHeader('content-type');
     if (is_array($contentType)) {
         $contentType = implode('; ', $contentType);
     }
     if (preg_match('/charset=([a-zA-Z0-9-_]+)/i', (string) $contentType, $matches) || preg_match('/charset=([a-zA-Z0-9-_]+)/i', $response->getBody(), $matches)) {
         $this->view->charset = trim($matches[1]);
     } else {
         $this->view->charset = 'UTF-8';
     }

     // NOTE: the body is not re-encoded before parsing (an earlier
     // mb_convert_encoding() step was disabled in this method).

     // Get DOM
     $dom = class_exists('DOMDocument') ? new Zend_Dom_Query($body) : null;
     // DOM text is UTF-8, so truncate by characters when mbstring is available
     // instead of cutting a multi-byte character in half.
     $canMb = function_exists('mb_substr');

     $title = null;
     $description = null;
     $thumb = null;
     $medium = null;
     if ($dom) {
         $titleList = $dom->query('title');
         if (count($titleList) > 0) {
             $title = trim($titleList->current()->textContent);
             $title = $canMb ? mb_substr($title, 0, 255, 'UTF-8') : substr($title, 0, 255);
         }

         $descriptionList = $dom->queryXpath("//meta[@name='description']");
         // Some pages capitalise the name
         if (count($descriptionList) == 0) {
             $descriptionList = $dom->queryXpath("//meta[@name='Description']");
         }
         if (count($descriptionList) > 0) {
             $description = trim($descriptionList->current()->getAttribute('content'));
             $description = $canMb ? mb_substr($description, 0, 255, 'UTF-8') : substr($description, 0, 255);
         }

         $thumbList = $dom->queryXpath("//link[@rel='image_src']");
         if (count($thumbList) > 0) {
             $thumb = $thumbList->current()->getAttribute('href');
         }

         $mediumList = $dom->queryXpath("//meta[@name='medium']");
         if (count($mediumList) > 0) {
             $medium = $mediumList->current()->getAttribute('content');
         }
     }
     $this->view->title = $title;
     $this->view->description = $description;
     $this->view->thumb = $thumb;
     $this->view->medium = $medium;

     // Work out what relative image paths are resolved against: the page's
     // <base href> when it is an absolute URL, the request URI otherwise.
     $baseUrlInfo = parse_url($uri);
     if (!is_array($baseUrlInfo)) {
         $baseUrlInfo = array();
     }
     if ($dom) {
         $baseUrlList = $dom->query('base');
         if ($baseUrlList && count($baseUrlList) > 0) {
             $baseHref = $baseUrlList->current()->getAttribute('href');
             $baseHrefInfo = $baseHref ? parse_url($baseHref) : false;
             // A relative <base href> (no scheme/host) cannot serve as a base on its own
             if (is_array($baseHrefInfo) && !empty($baseHrefInfo['scheme']) && !empty($baseHrefInfo['host'])) {
                 $baseUrlInfo = $baseHrefInfo;
             }
         }
     }
     $baseScheme = empty($baseUrlInfo['scheme']) ? 'http' : $baseUrlInfo['scheme'];
     $baseHost = empty($baseUrlInfo['host']) ? '' : $baseUrlInfo['host'];
     $basePort = empty($baseUrlInfo['port']) ? '' : ':' . $baseUrlInfo['port'];
     // Root of the site, with trailing slash
     $baseHostUrl = $baseScheme . '://' . $baseHost . $basePort . '/';
     // Directory of the base document, with trailing slash
     $baseUrl = $baseHostUrl;
     if (!empty($baseUrlInfo['path'])) {
         $segments = explode('/', $baseUrlInfo['path']);
         // drop the last segment (file name, or '' after a trailing slash)
         array_pop($segments);
         $directory = trim(join('/', $segments), '/');
         if ($directory !== '') {
             $baseUrl = $baseHostUrl . $directory . '/';
         }
     }

     $images = array();
     if ($thumb) {
         $images[] = $thumb;
     }
     if ($dom) {
         foreach ($dom->query('img') as $image) {
             $src = $image->getAttribute('src');
             // Ignore images that don't have a src
             if (!$src) {
                 continue;
             }
             // '@' kept from the original: very old PHP versions warn on malformed URLs
             $srcInfo = @parse_url($src);
             if (false === $srcInfo) {
                 continue;
             }
             if (strpos($src, '//') === 0) {
                 // Protocol-relative: only the scheme is missing
                 $src = $baseScheme . ':' . $src;
             } elseif (empty($srcInfo['scheme'])) {
                 if (strpos($src, '/') === 0) {
                     // Relative to root, add host
                     $src = $baseHostUrl . ltrim($src, '/');
                 } elseif (strpos($src, './') === 0) {
                     // Explicitly relative to current path, add baseUrl
                     $src = $baseUrl . substr($src, 2);
                 } else {
                     // Plain relative path, add baseUrl
                     $src = $baseUrl . $src;
                 }
             }
             // Anything that carries its own scheme (http:, https:, data:, ...) is left untouched
             $images[] = $src;
         }
     }
     // Unique
     $images = array_values(array_unique($images));
     // Truncate if greater than 30
     if (count($images) > 30) {
         $images = array_slice($images, 0, 30);
     }
     $this->view->imageCount = count($images);
     $this->view->images = $images;
 }
예제 #6
0
 /**
  * Try to find the document's language by first looking for Content-Language in the HTTP headers, then in the
  * lang attribute of the <html> tag and last in the content-language meta tag.
  *
  * The header value is returned as found. "_" and "-" are removed from the html lang value, "_" from the
  * meta tag value.
  *
  * @param  Zend_Http_Response $response
  * @return string|null the language, or an empty value when none of the three places provides one
  */
 protected function getLanguageFromResponse($response)
 {
     $l = $response->getHeader("Content-Language");
     if (is_array($l)) {
         //header sent more than once - use the first value
         $l = reset($l);
     }
     if (empty($l)) {
         //try html lang attribute. Only the <html ...> tag itself is searched, so lang/hreflang
         //attributes of later elements are not mistaken for the document language.
         //"[\s:]" accepts both lang="..." and xml:lang="..."
         $match = array();
         if (preg_match('@<html\\b[^>]*?[\\s:]lang=(["\'])(?P<language>[^"\'\\s>]+)\\1[^>]*>@i', $response->getBody(), $match)) {
             $l = str_replace(array("_", "-"), "", $match['language']);
         }
     }
     if (empty($l)) {
         //try meta tag; tolerate extra whitespace and a missing self-closing slash
         $match = array();
         if (preg_match('@<meta\\s+http-equiv="content-language"\\s+content="(?P<language>[^"\\s]+)"\\s*\\/?>@i', $response->getBody(), $match)) {
             //for lucene index remove "_" - this causes tokenization
             $l = str_replace("_", "", $match['language']);
         }
     }
     return $l;
 }
 /**
  * Maps the body of a JSON response onto the model class that belongs to
  * the given service type.
  *
  * @throws OpenSocial_Rest_Exception when the response Content-Type does not
  *         start with application/json, or when the model class for the
  *         service type cannot be found
  * @param  $serviceType service identifier; passed to _getModelTypeForService()
  *         and interpolated into the error message (presumably a string)
  * @param Zend_Http_Response $response
  * @return array result of the JSON mapper's map() call
  */
 protected function _mapResponseToModels($serviceType, Zend_Http_Response $response)
 {
     $contentType = $response->getHeader('Content-Type');
     if (is_array($contentType)) {
         // repeated header - use the first value
         $contentType = reset($contentType);
     }
     // Media types are case-insensitive, so compare without regard to case;
     // the cast covers a missing header.
     if (stripos((string) $contentType, 'application/json') !== 0) {
         throw new OpenSocial_Rest_Exception("Unknown Content-Type for response:<br /> " . var_export($response, true) . ' with body: ' . var_export($response->getBody(), true) . ' for request: ' . $this->_httpClient->getLastRequest());
     }
     $modelClass = 'OpenSocial_Model_' . $this->_getModelTypeForService($serviceType);
     if (!class_exists($modelClass, true)) {
         throw new OpenSocial_Rest_Exception("Model class {$modelClass} not found for service {$serviceType}!");
     }
     /**
      * @var OpenSocial_Rest_Mapper_Interface $mapper
      */
     $mapper = new OpenSocial_Rest_Mapper_Json($modelClass);
     return $mapper->map($response->getBody());
 }
예제 #8
0
파일: CookieJar.php 프로젝트: reoring/sabel
 /**
  * Reads the Set-Cookie header(s) of an HTTP response and adds every cookie
  * found there to the cookie jar.
  *
  * @param Sabel_Http_Response  $response Response to take the cookies from
  * @param Zend_Uri_Http|string $ref_uri  Requested URI
  * @throws Sabel_Exception_Runtime when $response is not a Sabel_Http_Response
  */
 public function addCookiesFromResponse($response, $ref_uri)
 {
     if (($response instanceof Sabel_Http_Response) === false) {
         throw new Sabel_Exception_Runtime(
             __METHOD__ . "() \$response is expected to be a Response object."
         );
     }

     $setCookie = $response->getHeader("Set-Cookie");
     if (is_string($setCookie)) {
         // a single header is handled as a one-element list
         $setCookie = array($setCookie);
     } elseif (!is_array($setCookie)) {
         // neither a string nor an array: nothing to add
         return;
     }

     foreach ($setCookie as $rawCookie) {
         $this->addCookie($rawCookie, $ref_uri);
     }
 }
예제 #9
0
 /**
  * Adds every cookie set by an HTTP response to the cookie jar.
  *
  * The Set-Cookie header value is read from the response: a string is added
  * as one cookie, an array is added element by element, and any other value
  * is ignored.
  *
  * @param Zend_Http_Response $response
  * @param Zend_Uri_Http|string $ref_uri Requested URI
  */
 public function addCookiesFromResponse($response, $ref_uri)
 {
     $headerValue = $response->getHeader('Set-Cookie');

     // single Set-Cookie header
     if (is_string($headerValue)) {
         $this->addCookie($headerValue, $ref_uri);
         return;
     }

     // anything that is not a list of headers carries no cookies
     if (!is_array($headerValue)) {
         return;
     }

     foreach ($headerValue as $rawCookie) {
         $this->addCookie($rawCookie, $ref_uri);
     }
 }
예제 #10
0
 /**
  * Extracts link-preview data (title, description, thumbnail) from an HTML
  * response.
  *
  * The thumbnail is taken from the og:image meta tag, falling back to
  * <link rel="image_src">. Title and description are cut to 255 characters.
  *
  * @param string             $uri      not used by this method
  * @param Zend_Http_Response $response Response holding the HTML document
  * @return array keys 'title', 'description' and 'thumb'; a value is null when
  *               it was not found or when DOMDocument is unavailable
  */
 protected function _previewHtml($uri, Zend_Http_Response $response)
 {
     $arr_return = array();
     $body = trim($response->getBody());

     // Charset: look in the Content-Type header first, then anywhere in the markup.
     // The header value may be missing or an array (repeated header).
     $contentType = $response->getHeader('content-type');
     if (is_array($contentType)) {
         $contentType = implode('; ', $contentType);
     }
     if (preg_match('/charset=([a-zA-Z0-9-_]+)/i', (string) $contentType, $matches) || preg_match('/charset=([a-zA-Z0-9-_]+)/i', $response->getBody(), $matches)) {
         $charset = trim($matches[1]);
     } else {
         $charset = 'UTF-8';
     }

     if (function_exists('mb_convert_encoding')) {
         // The charset label comes from the remote page and may be unknown to
         // mbstring. PHP < 8 then warns and returns false, PHP >= 8 throws a
         // ValueError (an Error). Either way keep the unconverted body rather
         // than handing a non-string to the DOM parser.
         try {
             $converted = @mb_convert_encoding($body, 'HTML-ENTITIES', $charset);
         } catch (Error $e) {
             $converted = false;
         }
         if (is_string($converted)) {
             $body = $converted;
         }
     }

     // Get DOM
     $dom = class_exists('DOMDocument') ? new Zend_Dom_Query($body) : null;
     // DOM text is UTF-8, so truncate by characters when mbstring is available
     // instead of cutting a multi-byte character in half.
     $canMb = function_exists('mb_substr');

     $title = null;
     $description = null;
     $thumb = null;
     if ($dom) {
         $titleList = $dom->query('title');
         if (count($titleList) > 0) {
             $title = trim($titleList->current()->textContent);
             $title = $canMb ? mb_substr($title, 0, 255, 'UTF-8') : substr($title, 0, 255);
         }

         $descriptionList = $dom->queryXpath("//meta[@name='description']");
         // Some pages capitalise the name
         if (count($descriptionList) == 0) {
             $descriptionList = $dom->queryXpath("//meta[@name='Description']");
         }
         if (count($descriptionList) > 0) {
             $description = trim($descriptionList->current()->getAttribute('content'));
             $description = $canMb ? mb_substr($description, 0, 255, 'UTF-8') : substr($description, 0, 255);
         }

         // Prefer the Open Graph image, fall back to <link rel="image_src">
         $ogImageList = $dom->queryXpath("//meta[@property='og:image']");
         if (count($ogImageList) > 0) {
             $thumb = $ogImageList->current()->getAttribute('content');
         }
         if (!$thumb) {
             $imageSrcList = $dom->queryXpath("//link[@rel='image_src']");
             if (count($imageSrcList) > 0) {
                 $thumb = $imageSrcList->current()->getAttribute('href');
             }
         }
     }
     $arr_return['title'] = $title;
     $arr_return['description'] = $description;
     $arr_return['thumb'] = $thumb;
     return $arr_return;
 }