/**
 * Crawling initialization script
 *
 * Stores the lead url itself, then loads its HTML and stores every link and
 * image found on the page. The page content area is scanned first (using the
 * dedicated content queries); afterwards the content area and the navigation
 * menu are stripped from the document and the remaining markup is scanned.
 *
 * @global object $objInit
 *
 * @param string         $url         lead url
 * @param \HTTP_Request2 $request     http_request object
 * @param integer        $referPageId lead page id
 *
 * @return null
 */
public function initializeScript($url, \HTTP_Request2 $request, $referPageId)
{
    global $objInit;

    $_ARRAYLANG = $objInit->loadLanguageData('LinkManager');

    $refererUrlResponse = $this->checkUrlStatus($url, $request);
    // The lead url is always recorded, even if it could not be loaded.
    $this->storeUrlInfos($request, $url, $url, 0, $referPageId, $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK']);

    if (!$refererUrlResponse) {
        return;
    }

    $html = \str_get_html($refererUrlResponse->getBody());
    if (!$html) {
        return;
    }

    // First check the page content href and src
    foreach ($html->find(ASCMS_LINKMANAGER_CONTENT_HREF_QUERY) as $element) {
        $aHref = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->href, $url);
        if (!empty($aHref) && $this->isLinkExists($aHref, true)) {
            $linkText = $element->plaintext ? $element->plaintext : $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK'];
            $this->storeUrlInfos($request, $aHref, $url, 0, $referPageId, $linkText);
        }
    }
    foreach ($html->find(ASCMS_LINKMANAGER_CONTENT_IMG_QUERY) as $element) {
        if (preg_match('#\\.(jpg|jpeg|gif|png)$# i', $element->src)) {
            $imgSrc = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->src, null);
            if (!empty($imgSrc) && $this->isLinkExists($imgSrc, true)) {
                $this->storeUrlInfos($request, $imgSrc, $url, 1, $referPageId, $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_IMAGE']);
            }
        }
    }

    // Remove the page content and then the navigation menu, re-parsing the
    // document after each removal so the stripped markup is really gone.
    $queriesToStrip = array(
        ASCMS_LINKMANAGER_CONTENT_PAGE_QUERY,
        ASCMS_LINKMANAGER_NAVIGATION_QUERY,
    );
    foreach ($queriesToStrip as $query) {
        $node = $html->find($query, 0);
        if (!is_object($node)) {
            // Nothing matched this query on the page; writing to
            // "outertext" of a missing node would fail, so skip it.
            continue;
        }
        $node->outertext = '';
        $html = \str_get_html($html->outertext);
        if (!$html) {
            // The remaining markup could not be parsed; nothing left to scan.
            return;
        }
    }

    // Find all images
    foreach ($html->find('img') as $element) {
        if (preg_match('#\\.(jpg|jpeg|gif|png)$# i', $element->src)) {
            $imgSrc = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->src, null);
            if (!empty($imgSrc) && $this->isLinkExists($imgSrc)) {
                $this->storeUrlInfos($request, $imgSrc, $url, 1, $referPageId, $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_IMAGE']);
            }
        }
    }

    // Find all links
    foreach ($html->find('a') as $element) {
        $aHref = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->href, $url);
        if (!empty($aHref) && $this->isLinkExists($aHref)) {
            $linkText = $element->plaintext ? $element->plaintext : $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK'];
            $this->storeUrlInfos($request, $aHref, $url, 0, $referPageId, $linkText);
        }
    }
}
/**
 * Recheck the selected links status
 *
 * For every selected link the referer page is loaded once (cached per entry
 * title) and all image and anchor targets outside the navigation menu are
 * collected. A link that no longer appears on its page is removed after the
 * page has been handed to recheckPage(); a link that is still present is
 * requested again and either removed (status 200) or updated with the new
 * status code. Finally the broken link count of the last crawler run is
 * refreshed for every active frontend language.
 *
 * @global array $_ARRAYLANG
 *
 * @return null
 */
public function recheckSelectedLinks()
{
    global $_ARRAYLANG;

    $selectedIds = isset($_POST['selected']) ? $_POST['selected'] : '';
    $links       = $this->linkRepository->getSelectedLinks($selectedIds);
    if (!$links) {
        $links = array();
    }

    $request = new \HTTP_Request2();
    // The configuration is the same for every request, so set it once.
    $request->setConfig(array('ssl_verify_peer' => false, 'ssl_verify_host' => false, 'follow_redirects' => true));

    // Links found per page, keyed by entry title: array(url => link text).
    // A plain array is used instead of variable variables named after the
    // entry title, which could overwrite locals such as $link or $request.
    $linksByPage = array();

    foreach ($links as $link) {
        $entryTitle = $link->getEntryTitle();

        if (!array_key_exists($entryTitle, $linksByPage)) {
            $linksByPage[$entryTitle] = array();
            try {
                $request->setUrl($link->getRefererPath());
                $response = $request->send();
                $html     = \str_get_html($response->getBody());
            } catch (\Exception $e) {
                $html = false;
            }
            if (!$html) {
                // NOTE(review): only this first link of the page is skipped;
                // later links of the same page see an empty link list and are
                // removed below. Behaviour kept as is - confirm the intent.
                continue;
            }

            // Remove the navigation menu (if the page has one)
            $objNavigation = $html->find('ul#navigation, ul.navigation', 0);
            if (is_object($objNavigation)) {
                $objNavigation->outertext = '';
                $html = \str_get_html($html->outertext);
                if (!$html) {
                    continue;
                }
            }

            // Find all images
            foreach ($html->find('img') as $element) {
                if (preg_match('#\\.(jpg|jpeg|gif|png)$# i', $element->src)) {
                    $imgSrc = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->src, null);
                    if (!empty($imgSrc)) {
                        $linksByPage[$entryTitle][$imgSrc] = $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_IMAGE'];
                    }
                }
            }

            // Find all links
            foreach ($html->find('a') as $element) {
                $aHref = \Cx\Core_Modules\LinkManager\Controller\Url::checkPath($element->href, $link->getRefererPath());
                if (!empty($aHref)) {
                    $linkText = $element->plaintext ? $element->plaintext : $_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_NO_LINK'];
                    $linksByPage[$entryTitle][$aHref] = $linkText;
                }
            }
        }

        $isRemoved = false;
        if (!array_key_exists($link->getRequestedPath(), $linksByPage[$entryTitle])) {
            // The link is no longer part of its page
            $linkInputValues = array(
                'lang'         => $link->getLang(),
                'refererPath'  => $link->getRefererPath(),
                'leadPath'     => $link->getLeadPath(),
                'entryTitle'   => $link->getEntryTitle(),
                'detectedTime' => $link->getDetectedTime(),
                'updatedBy'    => 0,
            );
            $this->recheckPage($linksByPage[$entryTitle], $linkInputValues, $request);
            $this->em->remove($link);
            $isRemoved = true;
        } else {
            try {
                $request->setUrl($link->getRequestedPath());
                $response  = $request->send();
                $urlStatus = $response->getStatus();
            } catch (\Exception $e) {
                $urlStatus = 0;
            }
            if ((int) $urlStatus === 200) {
                // The link works again
                $this->em->remove($link);
                $isRemoved = true;
            } else {
                $link->setLinkStatusCode($urlStatus);
                $link->setLinkRecheck(true);
            }
        }

        // Only persist links that are kept. Assuming $this->em is a Doctrine
        // entity manager (TODO confirm), persisting a removed entity would
        // make it managed again and cancel the removal.
        if (!$isRemoved) {
            $this->em->persist($link);
        }
        $this->em->flush();
    }

    // Update the broken links count in crawler table
    foreach (\FWLanguage::getActiveFrontendLanguages() as $lang) {
        $lastRunByLang = $this->crawlerRepository->getLastRunByLang($lang['id']);
        $brokenLinkCnt = $this->linkRepository->brokenLinkCountByLang($lang['id']);
        if ($lastRunByLang) {
            $lastRunByLang->setTotalBrokenLinks($brokenLinkCnt);
            $this->em->persist($lastRunByLang);
        }
    }
    $this->em->flush();

    \Message::ok($_ARRAYLANG['TXT_CORE_MODULE_LINKMANAGER_SUCCESS_MSG']);
}