/**
 * Create a new crawler record or update an existing one from the given data.
 *
 * When no crawler entity is supplied, a fresh one is instantiated. The entity
 * is populated through updateFromArray(), persisted and flushed.
 *
 * If anything throws along the way, the crawler status is set to
 * RUN_STATUS_INCOMPLETE via updateCrawlerStatus() and the script is
 * terminated with die().
 *
 * @param array                                                $inputArray values handed to the entity's updateFromArray()
 * @param \Cx\Core_Modules\LinkManager\Model\Entity\Crawler    $crawler    entity to update; any empty value means "create a new one"
 *
 * @return integer|null id of the persisted crawler, or null when $inputArray is empty
 */
public function modifyCrawler(array $inputArray = array(), $crawler = '')
{
    // Nothing to store: leave without touching the entity manager.
    if (!$inputArray) {
        return null;
    }

    try {
        $entity = $crawler;
        if (!$entity) {
            $entity = new \Cx\Core_Modules\LinkManager\Model\Entity\Crawler();
        }

        $entity->updateFromArray($inputArray);

        $this->em->persist($entity);
        $this->em->flush();

        $crawlerId = $entity->getId();

        return $crawlerId;
    } catch (\Exception $exception) {
        // Flag the run as incomplete before aborting the whole script.
        $this->updateCrawlerStatus('', self::RUN_STATUS_INCOMPLETE);
        die('Crawler Query ERROR!' . $exception);
    }
}
/**
 * Crawler spider -> crawl all the links present in the sitemap file.
 *
 * Steps performed:
 *  1. A new crawler run entity is persisted with status RUN_STATUS_RUNNING.
 *  2. The sitemap file of the current language is loaded. If the file is
 *     missing or cannot be parsed, the run is marked RUN_STATUS_INCOMPLETE,
 *     a message is logged and the method returns.
 *  3. For every <loc> entry that resolves to a page of type TYPE_CONTENT,
 *     initializeScript() is called with the URL and the page id. After each
 *     such page the memory limit is checked; when the check fails, the run is
 *     marked incomplete and the script is terminated.
 *  4. moveOldLinksToHistory() is called with the run start time, the link
 *     counters are fetched and the run is flagged RUN_STATUS_COMPLETED.
 *
 * Any \Exception marks the run as incomplete and terminates the script.
 *
 * @return null
 */
public function crawlerSpider()
{
    try {
        // Register this run so that its progress/status can be tracked.
        $runStartTime = new \DateTime('now');
        $crawler      = new \Cx\Core_Modules\LinkManager\Model\Entity\Crawler();
        $crawler->setLang($this->langId);
        $crawler->setStartTime($runStartTime);
        $crawler->setEndTime($runStartTime);
        $crawler->setTotalLinks(0);
        $crawler->setTotalBrokenLinks(0);
        $crawler->setRunStatus(self::RUN_STATUS_RUNNING);
        $this->em->persist($crawler);
        $this->em->flush();

        // Without a sitemap for the current language there is nothing to crawl.
        $sitemapPath = ASCMS_DOCUMENT_ROOT . '/sitemap_' . $this->langName . '.xml';
        if (!file_exists($sitemapPath)) {
            $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('No sitemap found for language ' . $this->langName . '. Please save a page so the sitemap can be build.');
            return;
        }

        // Read the sitemap file and get all the static page urls.
        $sitemapXml = simplexml_load_file($sitemapPath);
        if ($sitemapXml === false) {
            // simplexml_load_file() returns false for unreadable or malformed
            // XML. Calling children() on it would raise an error that the
            // \Exception handler below does not catch, leaving the run
            // flagged as "running".
            $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('Sitemap for language ' . $this->langName . ' could not be parsed.');
            return;
        }

        foreach ($sitemapXml->children() as $child) {
            foreach ($child as $value) {
                // Only the <loc> elements carry the page URL.
                if ($value->getName() !== 'loc') {
                    continue;
                }

                $url  = (string) $value;
                $page = $this->getPageByUrl($url);
                if (!$page || $page->getType() !== self::TYPE_CONTENT) {
                    continue;
                }

                $this->initializeScript($url, $page->getId());

                if (!$this->checkMemoryLimit(self::MiB2)) {
                    $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
                    die; // memory limit exceeded
                }
            }
        }

        // Move the uncalled links from link table to history table.
        $this->moveOldLinksToHistory($runStartTime);

        // Get the total links and total broken links.
        $totalLinks       = $this->linkRepo->getLinksCountByLang($runStartTime->format(ASCMS_DATE_FORMAT_INTERNATIONAL_DATETIME), $this->langId);
        $totalBrokenLinks = $this->linkRepo->brokenLinkCountByLang($this->langId);

        $crawler->updateEndTime();
        $crawler->setTotalLinks($totalLinks);
        $crawler->setTotalBrokenLinks($totalBrokenLinks);
        $crawler->setRunStatus(self::RUN_STATUS_COMPLETED);
        $this->em->flush();
    } catch (\Exception $error) {
        // NOTE(review): an empty string is passed here instead of the crawler
        // entity; how updateCrawlerStatus() resolves that is not visible in
        // this section -- confirm it targets the current run.
        $this->updateCrawlerStatus('', self::RUN_STATUS_INCOMPLETE);
        die('Error occurred' . $error);
    }
}