/**
 * Initiates a crawler-process.
 *
 * Creates the working directory, instantiates the url-cache, the cookie-cache,
 * the process-communication object and (depending on the multiprocess-mode)
 * the document-info-queue, then hands the tmp-file, the link-priorities, the
 * base-URL and the starting-URL over to the respective components.
 */
protected function initCrawlerProcess()
{
    // The working directory has to exist before any file-based component is set up
    $this->createWorkingDirectory();

    $work_dir = $this->working_directory;

    // Url-cache and cookie-cache are both selected by the configured url-cache type
    $use_sqlite = ($this->url_cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE);

    if ($use_sqlite) {
        $this->LinkCache = new PHPCrawlerSQLiteURLCache($work_dir . "urlcache.db3", true);
        $this->CookieCache = new PHPCrawlerSQLiteCookieCache($work_dir . "cookiecache.db3", true);
    } else {
        $this->LinkCache = new PHPCrawlerMemoryURLCache();
        $this->CookieCache = new PHPCrawlerMemoryCookieCache();
    }

    // ProcessCommunication
    $this->ProcessCommunication = new PHPCrawlerProcessCommunication(
        $this->crawler_uniqid,
        $this->multiprocess_mode,
        $work_dir
    );

    // The DocumentInfo-queue is only created for the MPMODE_PARENT_EXECUTES_USERCODE mode
    if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        $this->DocumentInfoQueue = new PHPCrawlerDocumentInfoQueue($work_dir . "doc_queue.db3", true);
    }

    // Tmp-file for the PageRequest, named after the id of the current process
    $tmp_file = $work_dir . "phpcrawl_" . getmypid() . ".tmp";
    $this->PageRequest->setTmpFile($tmp_file);

    // Url-priorities go to the link-cache
    $this->LinkCache->addLinkPriorities($this->link_priority_array);

    // Base-URL goes to the UrlFilter
    $this->UrlFilter->setBaseURL($this->starting_url);

    // Finally the starting-URL is added to the url-cache
    $start_descriptor = new PHPCrawlerURLDescriptor($this->starting_url);
    $this->LinkCache->addUrl($start_descriptor);
}
/**
 * Initiates a crawler-process.
 *
 * Creates the working directory, picks the url-cache and cookie-cache
 * implementation (SQLite, D7 or memory), purges a persistent url-cache once,
 * sets up the process-handler, the status-handler and (depending on the
 * multiprocess-mode) the document-info-queue, then hands the tmp-file, the
 * link-priorities, the base-URL and the starting-URL over to the respective
 * components.
 */
protected function initCrawlerProcess()
{
    // The working directory has to exist before any file-based component is set up
    $this->createWorkingDirectory();

    $work_dir = $this->working_directory;
    $cache_type = $this->url_cache_type;

    // Decide once which cache-backend applies: SQLite is only used if it is
    // available, D7 is only considered if SQLite was not chosen.
    $use_sqlite = $this->SQLiteAvailable && $cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE;
    $use_d7 = !$use_sqlite && $cache_type == PHPCrawlerUrlCacheTypes::URLCACHE_D7;

    // Setup url-cache
    if ($use_sqlite) {
        $this->LinkCache = new PHPCrawlerSQLiteURLCache($work_dir . "urlcache.db3", true);
    } elseif ($use_d7) {
        $this->LinkCache = new PHPCrawlerD7URLCache($this->database_url_cache_table, $this->crawler_uniqid);
    } else {
        $this->LinkCache = new PHPCrawlerMemoryURLCache();
    }

    // Purge/cleanup a persistent (SQLite or D7) url-cache for resumed
    // crawling-processes; the flag makes sure this happens only ONCE.
    if (($use_sqlite || $use_d7) && !$this->urlcache_purged) {
        $this->LinkCache->purgeCache();
        $this->urlcache_purged = true;
    }

    // Setup cookie-cache, using the same backend as the url-cache
    if ($use_sqlite) {
        $this->CookieCache = new PHPCrawlerSQLiteCookieCache($work_dir . "cookiecache.db3", true);
    } elseif ($use_d7) {
        $this->CookieCache = new PHPCrawlerD7CookieCache($this->database_cookie_cache_table, $this->crawler_uniqid);
    } else {
        $this->CookieCache = new PHPCrawlerMemoryCookieCache();
    }

    // ProcessHandler
    $this->ProcessHandler = new PHPCrawlerProcessHandler($this->crawler_uniqid, $work_dir);

    // PHPCrawlerStatusHandler
    $this->CrawlerStatusHandler = new PHPCrawlerStatusHandler($this->crawler_uniqid, $work_dir);
    $this->setupCrawlerStatusHandler();

    // The DocumentInfo-queue is only created for the MPMODE_PARENT_EXECUTES_USERCODE mode
    if ($this->multiprocess_mode == PHPCrawlerMultiProcessModes::MPMODE_PARENT_EXECUTES_USERCODE) {
        $this->DocumentInfoQueue = new PHPCrawlerDocumentInfoQueue($work_dir . "doc_queue.db3", true);
    }

    // Tmp-file for the PageRequest, named after the id of the current process
    $tmp_file = $work_dir . "phpcrawl_" . getmypid() . ".tmp";
    $this->PageRequest->setTmpFile($tmp_file);

    // Url-priorities go to the link-cache
    $this->LinkCache->addLinkPriorities($this->link_priority_array);

    // Base-URL goes to the UrlFilter
    $this->UrlFilter->setBaseURL($this->starting_url);

    // Finally the starting-URL is added to the url-cache
    $start_descriptor = new PHPCrawlerURLDescriptor($this->starting_url);
    $this->LinkCache->addUrl($start_descriptor);
}