/**
 * Runs the robots check for an old/new request pair and records both results.
 *
 * For each request an Xbb_RobotsTxt instance is obtained for the
 * "<scheme>://<host>" part of the request URL, and the integer-cast result of
 * its allow() call for the full URL is written to $this->_columns under the
 * keys "<filter name>_old" and "<filter name>_new".
 *
 * @param Request $old_request Request whose result is stored in the "_old" column.
 * @param Request $new_request Request whose result is stored in the "_new" column.
 * @return array The complete $this->_columns array after both entries are set.
 */
public function execute(Request $old_request, Request $new_request)
{
    $oldParts = parse_url($old_request->getUrl());
    $newParts = parse_url($new_request->getUrl());

    // One robots handler per origin (scheme + host) of each request.
    $oldOrigin = $oldParts['scheme'] . '://' . $oldParts['host'];
    $oldRobots = Xbb_RobotsTxt::getInstance($oldOrigin);

    $newOrigin = $newParts['scheme'] . '://' . $newParts['host'];
    $newRobots = Xbb_RobotsTxt::getInstance($newOrigin);

    // Store each verdict as an integer, keyed by the filter name plus suffix.
    $oldKey = $this->_filter->getName() . '_old';
    $this->_columns[$oldKey] = (int) $oldRobots->allow($old_request->getUrl());

    $newKey = $this->_filter->getName() . '_new';
    $this->_columns[$newKey] = (int) $newRobots->allow($new_request->getUrl());

    return $this->_columns;
}
/**
 * Assembles the full set of curl options for one request.
 *
 * Layers, in order: the options of this curl object, the URL and method of
 * the request, POST fields (only when the request carries post data), headers
 * (the request's own if it has any, otherwise this object's headers if any),
 * and finally the request's own options, which win over everything else.
 *
 * @param Request $request
 * @return array
 */
private function prepareRequestOptions(Request $request)
{
    // Start from the options shared by every request of this curl object.
    $curlOptions = $this->getOptions();

    $curlOptions[CURLOPT_URL] = $request->getUrl();
    $curlOptions[CURLOPT_CUSTOMREQUEST] = $request->getMethod();

    // Attach a body only when the request actually has post data.
    $postData = $request->getPostData();
    if ($postData) {
        $curlOptions[CURLOPT_POST] = 1;
        $curlOptions[CURLOPT_POSTFIELDS] = $postData;
    }

    // Request-level headers are preferred; global headers are the fallback.
    $headers = $request->getHeaders() ?: $this->getHeaders();
    if ($headers) {
        $curlOptions[CURLOPT_HEADER] = 0;
        $curlOptions[CURLOPT_HTTPHEADER] = $headers;
    }

    // Array union keeps the left-hand values, so per-request options take precedence.
    $requestOptions = $request->getOptions();

    return $requestOptions ? $requestOptions + $curlOptions : $curlOptions;
}
/**
 * Callback for curl multi-requests.
 *
 * Records the response info, then for responses whose URL contains the base
 * URL: stores an error entry for HTTP codes >= 400, otherwise invokes the
 * optional page callback, submits any form post data, and finally queues the
 * URLs found in the response body with this method as their callback.
 *
 * @param Request $request curl request.
 * @return mixed false when the URL does not contain the base URL or the page
 *               yields no URLs; the stored error value for HTTP codes >= 400;
 *               null otherwise.
 */
public function visitContentUrls(Request $request)
{
    $info = $request->getResponseInfo();
    $this->allResults[] = $info;

    // Ignore responses whose URL does not contain the base URL at all.
    if (strpos($info['url'], $this->baseUrl) === false) {
        return false;
    }

    // Error responses are recorded under the request URL and not processed further.
    if ($info['http_code'] >= 400) {
        return $this->errors[$request->getUrl()] = $this->responseHeader($request, 'http_code');
    }

    if ($this->pageCallback) {
        // The errors array is handed over by reference so the callback can add to it.
        $callbackArguments = [$request->getUrl(), $request->getResponseText(), &$this->errors];
        call_user_func_array($this->pageCallback, $callbackArguments);
    }

    if (is_array($this->formOptions)) {
        $this->formOptions['url'] = $request->getUrl();
        $this->formOptions['content'] = $request->getResponseText();

        $formPostData = $this->getFormTest($this->formOptions)->postData($info['url']);
        if ($formPostData) {
            $this->getCurl()->multiRequest($formPostData, function (Request $formRequest) {
                $this->visitContentUrls($formRequest);
            });
        }
    }

    $pageUrls = $this->getPageUrls($request->getResponseText(), $info['url']);
    if (!$pageUrls) {
        return false;
    }

    // Queue every discovered URL with this same method as its callback.
    $this->getCurl()->multiRequest($pageUrls, function ($nextRequest) {
        return $this->visitContentUrls($nextRequest);
    });
}
/**
 * The RollingCurl callback function.
 *
 * Adds the request URL to the web cache, passes responses with an HTTP code
 * in [200, 400) and a non-empty body to the link parser, and then calls
 * crawlUrls().
 *
 * @param Request $request The request object
 * @param RollingCurl $rollingCurl The current RollingCurl object
 * @return void
 */
public function theCallback(Request $request, RollingCurl $rollingCurl)
{
    $url = $request->getUrl();
    $html = $request->getResponseText();
    $httpCode = array_get($request->getResponseInfo(), 'http_code');

    // Add URL to index (or update count)
    $this->getWebCache()->add($url);

    // Only successful/redirect responses that actually have a body are parsed.
    if ($httpCode >= 200 && $httpCode < 400 && !empty($html)) {
        // Parse - Links (email parsing is currently disabled)
        $this->getParser()->parseLinks($request, $rollingCurl);
    }

    // NOTE(review): presumably schedules the next batch of URLs — confirm in crawlUrls().
    $this->crawlUrls();
}