/**
 * Completion handler invoked with a finished request.
 *
 * Steps, in order:
 *  - drops the request's URL from the index cache and stores its HTTP status
 *    in the index under `last_http_code`;
 *  - when the status equals 200, runs the parser on the request and queues
 *    any links it returned;
 *  - logs the request, releases the locals, then clears completed requests,
 *    prunes the pending queue and resumes crawling.
 *
 * @param Request     $request     The request object
 * @param RollingCurl $rollingCurl The current RollingCurl object (not used in the body)
 * @return void
 */
public function callback(Request $request, RollingCurl $rollingCurl)
{
    $this->getIndex()->cacheUnsetRequest($request->getUrl());

    // `false` is stored when the response info carries no `http_code` entry.
    $responseInfo = $request->getResponseInfo();
    $statusCode = array_get($responseInfo, 'http_code', false);
    $this->getIndex()->add($request->getUrl(), ['last_http_code' => $statusCode]);

    // Only a 200 response is handed to the parser; anything else yields no links.
    $discovered = ($statusCode == 200)
        ? $this->getParser()->parseHtml($request)
        : null;

    if (is_array($discovered) && count($discovered) > 0) {
        $this->addRequests($discovered);
    }

    // Logging
    $this->logCallback($request, $statusCode);

    // Release the locals before the crawl is resumed below.
    unset($request, $responseInfo, $statusCode, $discovered);

    $this->clearCompleted();
    $this->prunePendingRequestQueue();
    $this->crawlUrls();
}
/**
 * Process the returned HTML with our parsers.
 *
 * The response body is parsed only when the HTTP status is >= 200 and < 400
 * and the body is non-empty. In that case the link parser and then the email
 * parser are run on it. Otherwise nothing is parsed.
 *
 * @param Request $request The request whose response text is parsed
 * @return array The value returned by parseNewLinks() for the body (presumably
 *               a list of links; parseNewLinks() is not visible here, confirm),
 *               or an empty array when the response was not parsed
 */
public function parseHtml(Request $request)
{
    $html = $request->getResponseText();
    $httpCode = array_get($request->getResponseInfo(), 'http_code', false);

    $newLinks = [];

    if ($httpCode >= 200 && $httpCode < 400 && !empty($html)) {
        // Parse - Links
        $newLinks = $this->parseNewLinks($html);

        // Parse - Emails
        $this->getEmails()->run($html);
    }

    return $newLinks;
}