Exemplo n.º 1
0
 /**
  * Records whether robots.txt permits the old and the new request URL.
  *
  * For each request the robots.txt handler is obtained for the
  * "<scheme>://<host>" part of the request URL, and the result of its
  * allow() check is stored as an integer in $this->_columns under the keys
  * "<filter name>_old" and "<filter name>_new".
  *
  * @param Request $old_request
  * @param Request $new_request
  * @return array the updated $this->_columns
  */
 public function execute(Request $old_request, Request $new_request)
 {
     $oldParts = parse_url($old_request->getUrl());
     $newParts = parse_url($new_request->getUrl());

     // One robots.txt handler per origin (scheme + host)
     $oldChecker = Xbb_RobotsTxt::getInstance("{$oldParts['scheme']}://{$oldParts['host']}");
     $newChecker = Xbb_RobotsTxt::getInstance("{$newParts['scheme']}://{$newParts['host']}");

     $columnPrefix = $this->_filter->getName();
     $this->_columns["{$columnPrefix}_old"] = (int) $oldChecker->allow($old_request->getUrl());
     $this->_columns["{$columnPrefix}_new"] = (int) $newChecker->allow($new_request->getUrl());

     return $this->_columns;
 }
Exemplo n.º 2
0
 /**
  * Builds the complete cURL option array for a single request.
  *
  * Starts from the options of this curl object, then sets the URL, the
  * request method, an optional POST payload and the headers, and finally
  * overlays the request's own options, which win on conflicting keys.
  *
  * @param Request $request
  * @return array
  */
 private function prepareRequestOptions(Request $request)
 {
     // Baseline: options shared by every request of this curl object
     $curlOptions = $this->getOptions();
     $curlOptions[CURLOPT_URL] = $request->getUrl();
     $curlOptions[CURLOPT_CUSTOMREQUEST] = $request->getMethod();

     // Attach a POST body only when the request carries one
     $payload = $request->getPostData();
     if ($payload) {
         $curlOptions[CURLOPT_POST] = 1;
         $curlOptions[CURLOPT_POSTFIELDS] = $payload;
     }

     // Request headers are preferred; the global headers are the fallback
     $headers = $request->getHeaders() ?: $this->getHeaders();
     if ($headers) {
         $curlOptions[CURLOPT_HEADER] = 0;
         $curlOptions[CURLOPT_HTTPHEADER] = $headers;
     }

     // Array union keeps the left operand's values, so per-request options win
     $overrides = $request->getOptions();

     return $overrides ? $overrides + $curlOptions : $curlOptions;
 }
Exemplo n.º 3
0
 /**
  * Callback for curl multirequests.
  * Collects the response info, reports error responses, optionally submits
  * form post data found for the page, and then issues another multi request
  * for the URLs extracted from the response body.
  *
  * @param Request $request curl request.
  * @return mixed false when the response URL does not contain the base URL
  *               or no page URLs were found; the stored error value for
  *               responses with an HTTP code of 400 or above; otherwise
  *               nothing.
  */
 public function visitContentUrls(Request $request)
 {
     $info = $request->getResponseInfo();
     $this->allResults[] = $info;

     // Ignore responses whose URL does not contain the base URL
     $containsBaseUrl = strpos($info['url'], $this->baseUrl) !== false;
     if (!$containsBaseUrl) {
         return false;
     }

     // Error responses are recorded and end the processing of this page
     if ($info['http_code'] >= 400) {
         $status = $this->responseHeader($request, 'http_code');
         $this->errors[$request->getUrl()] = $status;

         return $status;
     }

     if ($this->pageCallback) {
         // The errors list is passed by reference so the callback can add to it
         $callbackArgs = [$request->getUrl(), $request->getResponseText(), &$this->errors];
         call_user_func_array($this->pageCallback, $callbackArgs);
     }

     if (is_array($this->formOptions)) {
         $this->formOptions['url'] = $request->getUrl();
         $this->formOptions['content'] = $request->getResponseText();

         $postData = $this->getFormTest($this->formOptions)->postData($info['url']);
         if ($postData) {
             $this->getCurl()->multiRequest($postData, function (Request $formResponse) {
                 $this->visitContentUrls($formResponse);
             });
         }
     }

     $pageUrls = $this->getPageUrls($request->getResponseText(), $info['url']);
     if (!$pageUrls) {
         return false;
     }

     // Recurse into every URL found on this page
     $this->getCurl()->multiRequest($pageUrls, function ($linkedPage) {
         return $this->visitContentUrls($linkedPage);
     });
 }
Exemplo n.º 4
0
 /**
  * The RollingCurl callback function.
  *
  * Adds the requested URL to the web cache, passes responses with an HTTP
  * code in the range 200-399 and a non-empty body to the link parser, and
  * finally calls crawlUrls() regardless of the response outcome.
  *
  * @param  Request     $request     The request object
  * @param  RollingCurl $rollingCurl The current RollingCurl object
  * @return void
  */
 public function theCallback(Request $request, RollingCurl $rollingCurl)
 {
     $url = $request->getUrl();
     $html = $request->getResponseText();
     $httpCode = array_get($request->getResponseInfo(), 'http_code');

     // Add URL to index (or update count)
     $this->getWebCache()->add($url);

     // Only successful/redirect responses that actually carry a body are parsed
     if ($httpCode >= 200 && $httpCode < 400 && !empty($html)) {
         // Parse - Links
         $this->getParser()->parseLinks($request, $rollingCurl);
         // Parse - Emails (currently disabled)
         // $this->parseEmails($html);
     }

     $this->crawlUrls();
 }