コード例 #1
0
 /**
  * Crawler spider -> crawl all the links present in the sitemap file.
  *
  * A crawler run entry is persisted with status "running" before the sitemap
  * of the current language is read. Every <loc> entry that resolves to a
  * content page is handed to initializeScript(). When the sitemap is missing
  * or cannot be parsed, or when the memory check fails, the run is marked as
  * incomplete. Otherwise the link totals are stored on the run entry and the
  * run is marked as completed.
  *
  * Note: the script is terminated (die) when the memory check fails or when
  * an exception is caught.
  *
  * @return void
  */
 public function crawlerSpider()
 {
     try {
         //initialize the crawler run entry
         $runStartTime = new \DateTime('now');
         $crawler = new \Cx\Core_Modules\LinkManager\Model\Entity\Crawler();
         $crawler->setLang($this->langId);
         $crawler->setStartTime($runStartTime);
         $crawler->setEndTime($runStartTime);
         $crawler->setTotalLinks(0);
         $crawler->setTotalBrokenLinks(0);
         $crawler->setRunStatus(self::RUN_STATUS_RUNNING);
         $this->em->persist($crawler);
         $this->em->flush();

         //If the sitemap file does not exist for the $langName then return
         $sitemapPath = ASCMS_DOCUMENT_ROOT . '/sitemap_' . $this->langName . '.xml';
         if (!file_exists($sitemapPath)) {
             $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
             \DBG::log('No sitemap found for language ' . $this->langName . '. Please save a page so the sitemap can be build.');
             return;
         }

         //Read the sitemap file and get all the static page urls
         $sitemapXml = simplexml_load_file($sitemapPath);
         if ($sitemapXml === false) {
             // simplexml_load_file() returns false for unreadable or malformed
             // XML. Without this check the following children() call would
             // abort the script and leave the run in status "running".
             $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
             \DBG::log('Sitemap for language ' . $this->langName . ' could not be parsed.');
             return;
         }
         foreach ($sitemapXml->children() as $child) {
             foreach ($child as $value) {
                 // only the <loc> elements carry the page url
                 if ($value->getName() !== 'loc') {
                     continue;
                 }
                 $pageUrl = (string) $value;
                 $page = $this->getPageByUrl($pageUrl);
                 // only content pages are crawled
                 if (!$page || $page->getType() !== self::TYPE_CONTENT) {
                     continue;
                 }
                 $this->initializeScript($pageUrl, $page->getId());
                 if (!$this->checkMemoryLimit(self::MiB2)) {
                     // memory limit exceeded
                     $this->updateCrawlerStatus($crawler, self::RUN_STATUS_INCOMPLETE);
                     die;
                 }
             }
         }

         //move the uncalled links from link table to history table
         $this->moveOldLinksToHistory($runStartTime);

         //get the total links and total broken links
         $totalLinks = $this->linkRepo->getLinksCountByLang($runStartTime->format(ASCMS_DATE_FORMAT_INTERNATIONAL_DATETIME), $this->langId);
         $totalBrokenLinks = $this->linkRepo->brokenLinkCountByLang($this->langId);

         //save the run details
         $crawler->updateEndTime();
         $crawler->setTotalLinks($totalLinks);
         $crawler->setTotalBrokenLinks($totalBrokenLinks);
         $crawler->setRunStatus(self::RUN_STATUS_COMPLETED);
         $this->em->flush();
     } catch (\Exception $error) {
         // NOTE(review): an empty string is passed instead of $crawler here;
         // presumably updateCrawlerStatus() then resolves the run itself - confirm.
         $this->updateCrawlerStatus('', self::RUN_STATUS_INCOMPLETE);
         die('Error occurred' . $error);
     }
 }
コード例 #2
0
 /**
  * Recheck the status of the links selected in the request.
  *
  * The ids are read from $_POST['selected']. For every selected link the
  * links of its referer page are fetched once (and the page is rechecked on
  * that first fetch). A link that is no longer listed on its referer page is
  * removed; otherwise its status code, flag status and recheck marker are
  * refreshed. Finally the broken link count of the last crawler run of each
  * active frontend language is updated and a success message is set.
  *
  * @return null
  */
 public function recheckSelectedLinks()
 {
     global $_ARRAYLANG;

     // Fetch the selected links, falling back to an empty list
     $selectedIds = '';
     if (isset($_POST['selected'])) {
         $selectedIds = $_POST['selected'];
     }
     $selectedLinks = $this->linkRepository->getSelectedLinks($selectedIds);
     if (!$selectedLinks) {
         $selectedLinks = array();
     }

     // Links found on each referer page, keyed by the referer path
     $linksByReferer = array();
     foreach ($selectedLinks as $link) {
         $refererPath = $link->getRefererPath();
         $requestPath = $link->getRequestedPath();

         if (array_key_exists($refererPath, $linksByReferer)) {
             $linksOnPage = $linksByReferer[$refererPath];
         } else {
             // First time this referer is seen: load its links and recheck the page
             $linksOnPage = $this->getController('LinkCrawler')->getPageLinks($refererPath);
             $linksByReferer[$refererPath] = $linksOnPage;
             $this->recheckPage($link, $linksOnPage);
         }

         // The link is gone from the referer page: drop it
         if (!array_key_exists($requestPath, $linksOnPage)) {
             $this->em->remove($link);
             continue;
         }

         // The link is still present: refresh its status
         $statusCode = $this->getUrlStatus($requestPath);
         $link->setLinkStatusCode($statusCode);
         $link->setFlagStatus($statusCode == 200 ? 1 : 0);
         $link->setLinkRecheck(true);
     }

     // Update the broken links count of the last crawler run per language
     foreach (\FWLanguage::getActiveFrontendLanguages() as $language) {
         $languageId = $language['id'];
         $latestRun = $this->crawlerRepository->getLastRunByLang($languageId);
         $brokenCount = $this->linkRepository->brokenLinkCountByLang($languageId);
         if (!$latestRun) {
             continue;
         }
         $latestRun->setTotalBrokenLinks($brokenCount);
     }

     $this->em->flush();
     \Message::ok($_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_SUCCESS_MSG']);
 }
コード例 #3
0
 /**
  * Crawler spider -> crawl all the links present in the sitemap file.
  *
  * A crawler run entry is created through modifyCrawler() with status
  * "running". Every <loc> entry of the sitemap of the current language that
  * resolves to a content page is handed to initializeScript(). When the
  * sitemap is missing or cannot be parsed the run is marked as incomplete.
  * Otherwise the history is updated and the link totals are stored on the
  * run entry together with the status "completed".
  *
  * Note: the script is terminated (die) when an exception is caught.
  *
  * @return void
  */
 public function crawlerSpider()
 {
     try {
         //initialize the crawler run entry
         $runStartTime = new \DateTime('now');
         $inputArray = array('lang' => contrexx_raw2db($this->langId), 'startTime' => $runStartTime, 'endTime' => $runStartTime, 'totalLinks' => 0, 'totalBrokenLinks' => 0, 'runStatus' => contrexx_raw2db(self::RUN_STATUS_RUNNING));
         $lastInsertedRunId = $this->modifyCrawler($inputArray);
         $request = new \HTTP_Request2();

         //If the sitemap file does not exist for the $langName then return
         $sitemapPath = ASCMS_DOCUMENT_ROOT . '/sitemap_' . $this->langName . '.xml';
         if (!file_exists($sitemapPath)) {
             $this->updateCrawlerStatus($lastInsertedRunId, self::RUN_STATUS_INCOMPLETE);
             \DBG::log('No sitemap found for language ' . $this->langName . '. Please save a page so the sitemap can be build.');
             return;
         }

         //Read the sitemap file and get all the static page urls
         $sitemapXml = simplexml_load_file($sitemapPath);
         if ($sitemapXml === false) {
             // simplexml_load_file() returns false for unreadable or malformed
             // XML. Without this check the following children() call would
             // abort the script and leave the run in status "running".
             $this->updateCrawlerStatus($lastInsertedRunId, self::RUN_STATUS_INCOMPLETE);
             \DBG::log('Sitemap for language ' . $this->langName . ' could not be parsed.');
             return;
         }
         foreach ($sitemapXml->children() as $child) {
             foreach ($child as $value) {
                 // only the <loc> elements carry the page url
                 if ($value->getName() !== 'loc') {
                     continue;
                 }
                 $pageUrl = (string) $value;
                 $page = $this->isModulePage($pageUrl);
                 // only content pages are crawled; the loose comparison is kept
                 // because the return type of getType() is not visible here
                 if (!$page || $page->getType() != self::TYPE_CONTENT) {
                     continue;
                 }
                 $this->initializeScript($pageUrl, $request, $page->getId());
                 $this->checkMemoryLimit($lastInsertedRunId);
                 //$this->checkTimeoutLimit($lastInsertedRunId);
             }
         }

         //move the uncalled links from link table to history table
         $this->updateHistory($this->langId, $lastInsertedRunId);

         //get the total links and total broken links
         $totalLinks = $this->linkRepo->getLinksCountByLang($runStartTime->format(ASCMS_DATE_FORMAT_INTERNATIONAL_DATETIME), $this->langId);
         $totalBrokenLinks = $this->linkRepo->brokenLinkCountByLang($this->langId);

         //save the run details
         $crawlerRuns = $this->crawlerRepo->findOneBy(array('id' => $lastInsertedRunId));
         if ($crawlerRuns) {
             $inputArray = array('lang' => contrexx_raw2db($this->langId), 'startTime' => $runStartTime, 'totalLinks' => contrexx_raw2db($totalLinks), 'totalBrokenLinks' => contrexx_raw2db($totalBrokenLinks), 'runStatus' => contrexx_raw2db(self::RUN_STATUS_COMPLETED));
             $crawlerRuns->updateEndTime();
             $this->modifyCrawler($inputArray, $crawlerRuns);
         }
     } catch (\Exception $error) {
         // NOTE(review): an empty string is passed instead of the run id here;
         // presumably updateCrawlerStatus() then resolves the run itself - confirm.
         $this->updateCrawlerStatus('', self::RUN_STATUS_INCOMPLETE);
         die('Error occurred' . $error);
     }
 }
コード例 #4
0
 /**
  * Recheck the status of the links selected in the request.
  *
  * The ids are read from $_POST['selected']. For every selected link the
  * referer page is requested once per entry title and the image sources and
  * anchor targets found on it (outside the navigation list) are collected.
  *
  * - If the referer page cannot be loaded or parsed, the link is left
  *   untouched, because nothing can be said about it.
  * - If the requested path is no longer listed on the referer page, the page
  *   is rechecked through recheckPage() and the link is removed.
  * - Otherwise the requested path is requested again: the link is removed
  *   when it answers with status 200, else its status code is updated and it
  *   is marked as rechecked.
  *
  * Finally the broken link count of the last crawler run of each active
  * frontend language is updated and a success message is set.
  *
  * @global array $_ARRAYLANG
  *
  * @return void
  */
 public function recheckSelectedLinks()
 {
     global $_ARRAYLANG;

     $selectedIds = isset($_POST['selected']) ? $_POST['selected'] : '';
     $links = $this->linkRepository->getSelectedLinks($selectedIds);
     if (!$links) {
         $links = array();
     }
     $request = new \HTTP_Request2();

     // Links found on each referer page, keyed by the entry title. The value
     // is false when the page could not be loaded or parsed. An array is used
     // instead of variable variables named after the entry title, which could
     // overwrite locals such as $links, $request or $html.
     $pageLinks = array();
     foreach ($links as $link) {
         $pageKey = (string) $link->getEntryTitle();

         if (!array_key_exists($pageKey, $pageLinks)) {
             $pageLinks[$pageKey] = array();
             try {
                 $request->setUrl($link->getRefererPath());
                 $request->setConfig(array('ssl_verify_peer' => false, 'ssl_verify_host' => false, 'follow_redirects' => true));
                 $response = $request->send();
                 $html = \str_get_html($response->getBody());
             } catch (\Exception $e) {
                 $html = false;
             }

             if ($html) {
                 //remove the navigation menu, if the page has one
                 $objNavigation = $html->find('ul#navigation, ul.navigation', 0);
                 if ($objNavigation) {
                     $objNavigation->outertext = '';
                     // re-parse so the removed menu is gone from the DOM
                     $html = \str_get_html($html->outertext);
                 }
             }

             if (!$html) {
                 // remember the failure so that no link of this page is
                 // mistaken for a removed one
                 $pageLinks[$pageKey] = false;
             } else {
                 // Find all images
                 foreach ($html->find('img') as $element) {
                     if (preg_match('#\\.(jpg|jpeg|gif|png)$# i', $element->src)) {
                         $imgSrc = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->src, null);
                         if (!empty($imgSrc)) {
                             $pageLinks[$pageKey][$imgSrc] = $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_IMAGE'];
                         }
                     }
                 }
                 // Find all links
                 foreach ($html->find('a') as $element) {
                     $aHref = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->href, $link->getRefererPath());
                     if (!empty($aHref)) {
                         $linkText = $element->plaintext ? $element->plaintext : $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK'];
                         $pageLinks[$pageKey][$aHref] = $linkText;
                     }
                 }
             }
         }

         // The referer page is not available: leave the link as it is
         if ($pageLinks[$pageKey] === false) {
             continue;
         }

         if (!array_key_exists($link->getRequestedPath(), $pageLinks[$pageKey])) {
             // the link is gone from the referer page: recheck the page and
             // drop the outdated link
             $linkInputValues = array('lang' => $link->getLang(), 'refererPath' => $link->getRefererPath(), 'leadPath' => $link->getLeadPath(), 'entryTitle' => $link->getEntryTitle(), 'detectedTime' => $link->getDetectedTime(), 'updatedBy' => 0);
             $this->recheckPage($pageLinks[$pageKey], $linkInputValues, $request);
             $this->em->remove($link);
         } else {
             try {
                 $request->setUrl($link->getRequestedPath());
                 $response = $request->send();
                 $urlStatus = $response->getStatus();
             } catch (\Exception $e) {
                 $urlStatus = 0;
             }
             if ((int) $urlStatus === 200) {
                 // the link works again
                 $this->em->remove($link);
             } else {
                 $link->setLinkStatusCode($urlStatus);
                 $link->setLinkRecheck(true);
             }
         }

         // No persist() here: the link is already managed, and (assuming
         // $this->em is a Doctrine entity manager) persisting an entity that
         // was just removed would make it managed again and undo the removal.
         $this->em->flush();
     }

     //update the broken links count in crawler table
     foreach (\FWLanguage::getActiveFrontendLanguages() as $lang) {
         $lastRunByLang = $this->crawlerRepository->getLastRunByLang($lang['id']);
         $brokenLinkCnt = $this->linkRepository->brokenLinkCountByLang($lang['id']);
         if ($lastRunByLang) {
             $lastRunByLang->setTotalBrokenLinks($brokenLinkCnt);
             $this->em->persist($lastRunByLang);
         }
     }
     $this->em->flush();
     \Message::ok($_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_SUCCESS_MSG']);
 }