/**
 * Crawler spider -> crawl all the links present in the sitemap file.
 *
 * Creates a crawler run entry with status RUNNING, walks every <loc> entry
 * of the sitemap of the current language, hands each content page to
 * initializeScript() and finally stores the link totals on the run and marks
 * it COMPLETED. If the sitemap is missing or cannot be parsed, the run is
 * marked INCOMPLETE and the method returns early.
 *
 * NOTE: the script is terminated via die() when checkMemoryLimit() reports
 * that the limit is reached or when an exception is caught.
 *
 * @return null
 */
public function crawlerSpider()
{
    // Stays '' (the argument the handler has always passed) until the run is
    // persisted, so the catch block can report against the run once it exists.
    $persistedRun = '';

    try {
        //initialize
        $runStartTime = new \DateTime('now');
        $crawler      = new \Cx\Core_Modules\LinkManager\Model\Entity\Crawler();
        $crawler->setLang($this->langId);
        $crawler->setStartTime($runStartTime);
        $crawler->setEndTime($runStartTime);
        $crawler->setTotalLinks(0);
        $crawler->setTotalBrokenLinks(0);
        $crawler->setRunStatus(self::RUN_STATUS_RUNNING);
        $this->em->persist($crawler);
        $this->em->flush();
        $persistedRun = $crawler;

        //If the sitemap file not exists for the $langName then return
        $sitemapPath = ASCMS_DOCUMENT_ROOT . '/sitemap_' . $this->langName . '.xml';
        if (!file_exists($sitemapPath)) {
            $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('No sitemap found for language ' . $this->langName . '. Please save a page so the sitemap can be build.');
            return;
        }

        //Read the sitemap file and get all the static page urls
        $sitemapXml = simplexml_load_file($sitemapPath);
        if ($sitemapXml === false) {
            // simplexml_load_file() returns false for unreadable or malformed
            // XML; calling children() on it would raise an \Error that the
            // catch block below does not handle, leaving the run in RUNNING.
            $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('Sitemap ' . $sitemapPath . ' could not be parsed.');
            return;
        }

        foreach ($sitemapXml->children() as $child) {
            foreach ($child as $value) {
                if ($value->getName() !== 'loc') {
                    continue;
                }
                $page = $this->getPageByUrl((string) $value);
                if (!$page || $page->getType() !== self::TYPE_CONTENT) {
                    continue;
                }
                $this->initializeScript((string) $value, $page->getId());
                if (!$this->checkMemoryLimit(self::MiB2)) {
                    $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
                    die; // memory limit exceeded
                }
            }
        }

        //move the uncalled links from link table to history table
        $this->moveOldLinksToHistory($runStartTime);

        //get the total links and total broken links
        $totalLinks       = $this->linkRepo->getLinksCountByLang(
            $runStartTime->format(ASCMS_DATE_FORMAT_INTERNATIONAL_DATETIME),
            $this->langId
        );
        $totalBrokenLinks = $this->linkRepo->brokenLinkCountByLang($this->langId);

        //save the run details
        $crawler->updateEndTime();
        $crawler->setTotalLinks($totalLinks);
        $crawler->setTotalBrokenLinks($totalBrokenLinks);
        $crawler->setRunStatus(self::RUN_STATUS_COMPLETED);
        $this->em->flush();
    } catch (\Exception $error) {
        $this->updateCrawlerStatus($persistedRun, self::RUN_STATUS_INCOMPLETE);
        die('Error occurred' . $error);
    }
}
/**
 * Recheck the status of the links selected by the user.
 *
 * The selection is read from $_POST['selected']. For every referer page the
 * links it contains are collected once through the LinkCrawler controller
 * and handed to recheckPage() together with the first selected link of that
 * page. A selected link that is not among the links of its referer page is
 * removed; any other selected link gets its status code and flag refreshed
 * and is marked as rechecked. Afterwards the broken link count of the last
 * crawler run of each active frontend language is updated.
 *
 * @global array $_ARRAYLANG
 *
 * @return null
 */
public function recheckSelectedLinks()
{
    global $_ARRAYLANG;

    //Get the post values
    $postedIds = isset($_POST['selected']) ? $_POST['selected'] : '';
    $selection = $this->linkRepository->getSelectedLinks($postedIds);
    if (!$selection) {
        $selection = array();
    }

    // referer path => links found on that page
    $linksByReferer = array();
    foreach ($selection as $link) {
        $referer = $link->getRefererPath();
        $target  = $link->getRequestedPath();

        if (array_key_exists($referer, $linksByReferer)) {
            $linksOnPage = $linksByReferer[$referer];
        } else {
            // First time this referer shows up: load its links and recheck it
            $linksOnPage              = $this->getController('LinkCrawler')->getPageLinks($referer);
            $linksByReferer[$referer] = $linksOnPage;
            $this->recheckPage($link, $linksOnPage);
        }

        // The requested path is not on the referer page -> drop the link
        if (!array_key_exists($target, $linksOnPage)) {
            $this->em->remove($link);
            continue;
        }

        $statusCode = $this->getUrlStatus($link->getRequestedPath());
        $link->setLinkStatusCode($statusCode);
        if ($statusCode == 200) {
            $link->setFlagStatus(1);
        } else {
            $link->setFlagStatus(0);
        }
        $link->setLinkRecheck(true);
    }

    //update the broken links count in crawler table
    foreach (\FWLanguage::getActiveFrontendLanguages() as $language) {
        $languageId  = $language['id'];
        $lastRun     = $this->crawlerRepository->getLastRunByLang($languageId);
        $brokenCount = $this->linkRepository->brokenLinkCountByLang($languageId);
        if (!$lastRun) {
            continue;
        }
        $lastRun->setTotalBrokenLinks($brokenCount);
    }

    $this->em->flush();
    \Message::ok($_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_SUCCESS_MSG']);
}
/**
 * Crawler spider -> crawl all the links present in the sitemap file.
 *
 * Registers a crawler run with status RUNNING through modifyCrawler(), walks
 * every <loc> entry of the sitemap of the current language and hands each
 * content page to initializeScript(). Afterwards the link history is updated
 * and the link totals are written to the run, which is marked COMPLETED. If
 * the sitemap is missing or cannot be parsed, the run is marked INCOMPLETE
 * and the method returns early.
 *
 * NOTE: the script is terminated via die() when an exception is caught.
 *
 * @return null
 */
public function crawlerSpider()
{
    // Stays '' (the argument the handler has always passed) until the run is
    // created, so the catch block can report against the run once it exists.
    $lastInsertedRunId = '';

    try {
        //initialize
        $runStartTime = new \DateTime('now');
        $inputArray   = array(
            'lang'             => contrexx_raw2db($this->langId),
            'startTime'        => $runStartTime,
            'endTime'          => $runStartTime,
            'totalLinks'       => 0,
            'totalBrokenLinks' => 0,
            'runStatus'        => contrexx_raw2db(self::RUN_STATUS_RUNNING),
        );
        $lastInsertedRunId = $this->modifyCrawler($inputArray);
        $request           = new \HTTP_Request2();

        $sitemapPath = ASCMS_DOCUMENT_ROOT . '/sitemap_' . $this->langName . '.xml';
        if (!file_exists($sitemapPath)) {
            $this->updateCrawlerStatus($lastInsertedRunId, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('No sitemap found for language ' . $this->langName . '. Please save a page so the sitemap can be build.');
            return;
        }

        $sitemapXml = simplexml_load_file($sitemapPath);
        if ($sitemapXml === false) {
            // simplexml_load_file() returns false for unreadable or malformed
            // XML; calling children() on it would raise an \Error that the
            // catch block below does not handle, leaving the run in RUNNING.
            $this->updateCrawlerStatus($lastInsertedRunId, self::RUN_STATUS_INCOMPLETE);
            \DBG::log('Sitemap ' . $sitemapPath . ' could not be parsed.');
            return;
        }

        foreach ($sitemapXml->children() as $child) {
            foreach ($child as $value) {
                if ($value->getName() != 'loc') {
                    continue;
                }
                $page = $this->isModulePage((string) $value);
                if (!$page || $page->getType() != self::TYPE_CONTENT) {
                    continue;
                }
                $this->initializeScript((string) $value, $request, $page->getId());
                $this->checkMemoryLimit($lastInsertedRunId);
            }
        }

        //move the uncalled links from link table to history table
        $this->updateHistory($this->langId, $lastInsertedRunId);

        //get the total links and total broken links
        $totalLinks       = $this->linkRepo->getLinksCountByLang(
            $runStartTime->format(ASCMS_DATE_FORMAT_INTERNATIONAL_DATETIME),
            $this->langId
        );
        $totalBrokenLinks = $this->linkRepo->brokenLinkCountByLang($this->langId);

        //save the run details
        $crawlerRuns = $this->crawlerRepo->findOneBy(array('id' => $lastInsertedRunId));
        if ($crawlerRuns) {
            $inputArray = array(
                'lang'             => contrexx_raw2db($this->langId),
                'startTime'        => $runStartTime,
                'totalLinks'       => contrexx_raw2db($totalLinks),
                'totalBrokenLinks' => contrexx_raw2db($totalBrokenLinks),
                'runStatus'        => contrexx_raw2db(self::RUN_STATUS_COMPLETED),
            );
            $crawlerRuns->updateEndTime();
            $this->modifyCrawler($inputArray, $crawlerRuns);
        }
    } catch (\Exception $error) {
        $this->updateCrawlerStatus($lastInsertedRunId, self::RUN_STATUS_INCOMPLETE);
        die('Error occurred' . $error);
    }
}
/**
 * Recheck the selected links status
 *
 * The selection is read from $_POST['selected']. The referer page of each
 * entry title is fetched once and the links and images found on it are
 * collected. A selected link that is no longer present on its referer page
 * is handed to recheckPage() and removed. A link that is still present is
 * requested again: it is removed if it answers with status 200, otherwise
 * its status code is stored and it is marked as rechecked. Links whose
 * referer page could not be loaded are left untouched. Finally the broken
 * link count of the last crawler run of each active frontend language is
 * updated.
 *
 * @global array $_ARRAYLANG
 *
 * @return null
 */
public function recheckSelectedLinks()
{
    global $_ARRAYLANG;

    $selectedIds = isset($_POST['selected']) ? $_POST['selected'] : '';
    $links       = $this->linkRepository->getSelectedLinks($selectedIds);
    if (!$links) {
        $links = array();
    }

    $request = new \HTTP_Request2();

    // entry title => (target => label) of the referer page, or false when the
    // page could not be loaded. A plain map is used instead of variables named
    // after the entry title, which could overwrite locals of this method.
    $pageLinks = array();
    foreach ($links as $link) {
        $entryTitle = (string) $link->getEntryTitle();
        if (!array_key_exists($entryTitle, $pageLinks)) {
            $pageLinks[$entryTitle] = $this->collectRefererPageLinks($request, $link);
        }
        if ($pageLinks[$entryTitle] === false) {
            // Without the page content it is unknown whether the link still
            // exists, so none of the links of this page is modified.
            continue;
        }

        if (!array_key_exists($link->getRequestedPath(), $pageLinks[$entryTitle])) {
            $linkInputValues = array(
                'lang'         => $link->getLang(),
                'refererPath'  => $link->getRefererPath(),
                'leadPath'     => $link->getLeadPath(),
                'entryTitle'   => $link->getEntryTitle(),
                'detectedTime' => $link->getDetectedTime(),
                'updatedBy'    => 0,
            );
            $this->recheckPage($pageLinks[$entryTitle], $linkInputValues, $request);
            $this->em->remove($link);
        } else {
            try {
                $request->setUrl($link->getRequestedPath());
                $response  = $request->send();
                $urlStatus = $response->getStatus();
            } catch (\Exception $e) {
                $urlStatus = 0;
            }
            if ($urlStatus == '200') {
                $this->em->remove($link);
            } else {
                $link->setLinkStatusCode($urlStatus);
                $link->setLinkRecheck(true);
                // persist() is only called for links that are kept: calling
                // it on an entity scheduled for removal makes Doctrine manage
                // it again and thereby cancels the removal.
                $this->em->persist($link);
            }
        }
        $this->em->flush();
    }

    //update the broken links count in crawler table
    foreach (\FWLanguage::getActiveFrontendLanguages() as $lang) {
        $lastRunByLang = $this->crawlerRepository->getLastRunByLang($lang['id']);
        $brokenLinkCnt = $this->linkRepository->brokenLinkCountByLang($lang['id']);
        if ($lastRunByLang) {
            $lastRunByLang->setTotalBrokenLinks($brokenLinkCnt);
            $this->em->persist($lastRunByLang);
        }
    }
    $this->em->flush();

    \Message::ok($_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_SUCCESS_MSG']);
}

/**
 * Fetch the referer page of a link and collect the images and links on it.
 *
 * @param \HTTP_Request2 $request request object reused for all lookups
 * @param object         $link    link entity providing getRefererPath()
 *
 * @global array $_ARRAYLANG
 *
 * @return array|false map of target path => label, false if the page could
 *                     not be fetched or parsed
 */
private function collectRefererPageLinks($request, $link)
{
    global $_ARRAYLANG;

    try {
        $request->setUrl($link->getRefererPath());
        $request->setConfig(array(
            'ssl_verify_peer'  => false,
            'ssl_verify_host'  => false,
            'follow_redirects' => true,
        ));
        $response = $request->send();
        $html     = \str_get_html($response->getBody());
    } catch (\Exception $e) {
        $html = false;
    }
    if (!$html) {
        return false;
    }

    //remove the navigation menu (not every page has one)
    $objNavigation = $html->find('ul#navigation, ul.navigation', 0);
    if ($objNavigation) {
        $objNavigation->outertext = '';
        $html = \str_get_html($html->outertext);
        if (!$html) {
            return false;
        }
    }

    $found = array();

    // Find all images
    foreach ($html->find('img') as $element) {
        if (!preg_match('#\\.(jpg|jpeg|gif|png)$# i', (string) $element->src)) {
            continue;
        }
        $imgSrc = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->src, null);
        if (!empty($imgSrc)) {
            $found[$imgSrc] = $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_IMAGE'];
        }
    }

    // Find all links
    foreach ($html->find('a') as $element) {
        $aHref = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->href, $link->getRefererPath());
        if (empty($aHref)) {
            continue;
        }
        $found[$aHref] = $element->plaintext
            ? $element->plaintext
            : $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK'];
    }

    return $found;
}