/**
 * Runs the crawling loop of a single (child-)process.
 *
 * After initialising the process, URLs are pulled from the link-cache and
 * handed to processUrl() until either processUrl() requests a stop, the
 * cache runs empty, or an abort is detected. The last two checks are skipped
 * while running in MPMODE_PARENT_EXECUTES_USERCODE.
 *
 * When the loop is left inside a child-process, the method returns
 * immediately (MPMODE_PARENT_EXECUTES_USERCODE) or terminates the process
 * via exit (any other mode). Otherwise the final crawler-status is stored,
 * cleanup() is run and the "crawling_process" benchmark is stopped.
 */
protected function startChildProcessLoop()
{
    $this->initCrawlerProcess();

    // Overridable hook.
    $this->initChildProcess();

    // The benchmark is only taken when this is not a child-process.
    if (!$this->is_chlid_process) {
        PHPCrawlerBenchmark::start("crawling_process");
    }

    $finished = false;

    do {
        $next_url = $this->LinkCache->getNextUrl();

        if ($next_url == null) {
            // Nothing to fetch right now -> wait half a second.
            usleep(500000);
        } else {
            $finished = $this->processUrl($next_url);
        }

        // In this mode the end-of-work and abort checks are not done here.
        if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
            continue;
        }

        // Link-cache is empty -> the crawl passed through completely.
        if (!$this->LinkCache->containsURLs()) {
            $this->CrawlerStatusHandler->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH);
            $finished = true;
        }

        // An abort was flagged (e.g. by another process).
        if ($this->checkForAbort() !== null) {
            $finished = true;
        }
    } while (!$finished);

    // The loop ended here. A child-process does no further work.
    if ($this->is_chlid_process) {
        if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
            exit;
        }

        return;
    }

    $this->crawlerStatus = $this->CrawlerStatusHandler->getCrawlerStatus();

    $this->cleanup();

    // Counterpart of the benchmark start above.
    if (!$this->is_chlid_process) {
        PHPCrawlerBenchmark::stop("crawling_process");
    }
}
/**
 * Runs the loop of the controller-process (main-process).
 *
 * Only meaningful in MPMODE_PARENT_EXECUTES_USERCODE; in every other mode
 * the process exits right away. The loop takes DocumentInfo-objects from the
 * queue, updates the crawler-status and passes each object to the user
 * callbacks handlePageData() (only if that method exists) and
 * handleDocumentInfo(). A negative return value from either callback is
 * recorded as a user-abort. As soon as checkForAbort() reports an abort, the
 * child-processes are killed and the method returns.
 */
protected function startControllerProcessLoop()
{
    // Any mode other than MPMODE_PARENT_EXECUTES_USERCODE -> exit process.
    if ($this->multiprocess_mode != PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        exit;
    }

    $this->initCrawlerProcess();
    $this->initChildProcess();

    for (;;) {
        // Abort flagged -> kill the child-processes and leave.
        if ($this->checkForAbort() !== null) {
            $this->ProcessHandler->killChildProcesses();
            return;
        }

        $document = $this->DocumentInfoQueue->getNextDocumentInfo();

        if ($document == null) {
            // No links left in the cache AND no documents left in the queue
            // -> the crawl passed through completely.
            if ($this->LinkCache->containsURLs() == false) {
                if ($this->DocumentInfoQueue->getDocumentInfoCount() == 0) {
                    $this->CrawlerStatusHandler->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH);
                }
            }

            // Wait 0.1 seconds before polling again.
            usleep(100000);
            continue;
        }

        $this->CrawlerStatusHandler->updateCrawlerStatus($document);

        $abort_requested = false;

        // The old handlePageData-method is only called if it is defined
        // (skipped otherwise because of high memory-usage).
        if (method_exists($this, "handlePageData")) {
            // Kept in a variable so the callback receives a real variable.
            $page_data = $document->toArray();
            $legacy_result = $this->handlePageData($page_data);
            $abort_requested = ($legacy_result < 0);
        }

        // handleDocumentInfo is always called, whatever the result above.
        $result = $this->handleDocumentInfo($document);
        if ($result < 0) {
            $abort_requested = true;
        }

        // Record the user-abort in the crawler-status.
        if ($abort_requested) {
            $this->CrawlerStatusHandler->updateCrawlerStatus(null, PHPCrawlerAbortReasons::ABORTREASON_USERABORT);
        }
    }
}