예제 #1
0
 /**
  * Fetch a page and collect its meta data together with statistics about its links.
  *
  * The page is downloaded with getContent() and its meta data is taken from
  * WebsiteController::crawlMetaData(). Every <a> element is then inspected:
  * anchors without href and without content, and hrefs matching "mailto:",
  * "javascript:" or containing ";" are ignored. Links containing "http://" or
  * "https://" are external when their formatUrl() form does not start with the
  * formatUrl() form of $domainUrl. Other links are turned into absolute urls using,
  * in this order: the site host (href starts with "/"), the <base href> of the page,
  * $domainUrl (when $url is the domain url itself) or the relative url of the page
  * (only if SP_RELATIVE_LINK_CRAWL is enabled); if none applies the link is not counted.
  *
  * @param string $url        url of the page to inspect
  * @param string $domainUrl  url of the site the page belongs to
  * @param bool   $returnUrls when true, details of each link are returned as well
  *
  * @return array empty when the page has no content; otherwise the crawlMetaData()
  *               result extended with 'total_links' and 'external' counters (set as
  *               soon as a link is counted) and, if $returnUrls is true,
  *               'site_links' / 'external_links' lists whose entries hold
  *               'link_url', 'link_anchor', 'nofollow' and 'link_title'
  */
 function getPageInfo($url, $domainUrl, $returnUrls = false)
 {
     $urlWithTrailingSlash = Spider::addTrailingSlash($url);
     $ret = $this->getContent($urlWithTrailingSlash);
     $pageInfo = array();
     $checkUrl = formatUrl($domainUrl);

     // if relative links of a page needs to be checked
     if (SP_RELATIVE_LINK_CRAWL) {
         $relativeUrl = $domainUrl . $this->getRelativeUrl($url);
     }

     // find main domain host link
     // NOTE(review): assumes $domainUrl always carries a scheme and a host - confirm with callers
     $domainHostInfo = parse_url($domainUrl);
     $domainHostLink = $domainHostInfo['scheme'] . "://" . $domainHostInfo['host'] . "/";

     if (empty($ret['page'])) {
         return $pageInfo;
     }

     $string = str_replace(array("\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
     $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

     // check whether base url tag is there
     $baseTagUrl = "";
     if (preg_match("/<base (.*?)>/is", $string, $match)) {
         $baseTagUrl = $this->__getTagParam("href", $match[1]);
         $baseTagUrl = $this->addTrailingSlash($baseTagUrl);
     }

     $pattern = "/<a(.*?)>(.*?)<\\/a>/is";
     preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

     // loop through matches; the count is fixed, so evaluate it only once
     $anchorCount = count($matches[1]);
     for ($i = 0; $i < $anchorCount; $i++) {
         $tagAttributes = $matches[1][$i];
         $anchorHtml = $matches[2][$i];
         $href = $this->__getTagParam("href", $tagAttributes);

         // an anchor needs at least a target or some content to be considered
         if (empty($href) && empty($anchorHtml)) {
             continue;
         }

         // mail links, script links and any href containing ';' are not counted
         if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
             continue;
         }

         // the counters may not exist yet, so create them on first use instead of
         // incrementing an undefined array key
         $pageInfo['total_links'] = (isset($pageInfo['total_links']) ? $pageInfo['total_links'] : 0) + 1;
         $external = 0;

         // NOTE(review): the scheme is searched anywhere in the href, not only at its start
         if (stristr($href, 'http://') || stristr($href, 'https://')) {
             // find external links
             if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                 $external = 1;
                 $pageInfo['external'] = (isset($pageInfo['external']) ? $pageInfo['external'] : 0) + 1;
             }
         } else {
             if (preg_match('/^\\//', $href)) {
                 // if url starts with / then append with base url of site
                 $href = $domainHostLink . $href;
             } elseif (!empty($baseTagUrl)) {
                 $href = $baseTagUrl . $href;
             } elseif ($url == $domainUrl) {
                 $href = $domainUrl . "/" . $href;
             } elseif (SP_RELATIVE_LINK_CRAWL) {
                 $href = $relativeUrl . "/" . $href;
             } else {
                 // the link can not be resolved, so take it out of the count again
                 $pageInfo['total_links'] -= 1;
                 continue;
             }

             // resolve every back directory operator, not only the first one;
             // each pass shortens $href, so the loop always terminates
             while (strpos($href, '/../') !== false) {
                 list($head, $tail) = explode('/../', $href, 2);
                 if (preg_match('#^[a-z][a-z0-9+.-]*://[^/]*$#i', $head)) {
                     // already at the site root: ignore the surplus ".." instead
                     // of cutting the host name out of the url
                     $href = $head . '/' . $tail;
                     continue;
                 }
                 $lastSlash = strrpos($head, '/');
                 if ($lastSlash === false) {
                     // no parent directory left to remove
                     break;
                 }
                 $href = substr($head, 0, $lastSlash + 1) . $tail;
             }
         }

         // if details of urls to be checked
         if ($returnUrls) {
             // for image links the alt text of the image is used as anchor text
             if (stristr($anchorHtml, '<img')) {
                 $linkAnchor = $this->__getTagParam("alt", $anchorHtml);
             } else {
                 $linkAnchor = strip_tags($anchorHtml);
             }

             $linkInfo = array(
                 'link_url' => $href,
                 'link_anchor' => $linkAnchor,
                 'nofollow' => stristr($tagAttributes, 'nofollow') ? 1 : 0,
                 'link_title' => $this->__getTagParam("title", $tagAttributes),
             );

             if ($external) {
                 $pageInfo['external_links'][] = $linkInfo;
             } else {
                 $pageInfo['site_links'][] = $linkInfo;
             }
         }
     }

     return $pageInfo;
 }
예제 #2
0
 /**
  * Crawl one page of an audit project and store the collected report data.
  *
  * Nothing is stored unless a report row for $reportUrl already exists in the
  * project. For an existing row the meta data and link counters of the page are
  * saved, optional checks (pagerank, backlinks, indexed pages, broken link) are
  * run according to the project settings, the links found in the page are stored
  * and new site links are registered as further report pages until
  * $projectInfo['max_links'] is reached. Finally the page score and the project
  * score are recalculated.
  *
  * @param string $reportUrl   url of the page to crawl
  * @param array  $projectInfo project settings; the keys read here are 'id', 'url',
  *                            'check_pr', 'check_backlinks', 'check_indexed',
  *                            'check_brocken', 'store_links_in_page', 'max_links'
  *                            and 'exclude_links'
  * @param int    $totalLinks  number of links already saved for the project; it is
  *                            passed by value, so the caller does not see the
  *                            increments made here
  *
  * @return void
  */
 function runReport($reportUrl, $projectInfo, $totalLinks)
 {
     $spider = new Spider();
     $pageInfo = $spider->getPageInfo($reportUrl, $projectInfo['url'], true);

     // urls originate from crawled pages, so escape them before they become part
     // of a query condition
     $rInfo = $this->getReportInfo(" and project_id={$projectInfo['id']} and page_url='" . addslashes($reportUrl) . "'");
     if (!$rInfo) {
         return;
     }

     $reportInfo = array();
     $reportInfo['id'] = $rInfo['id'];
     $reportInfo['page_title'] = addslashes($pageInfo['page_title']);
     $reportInfo['page_description'] = addslashes($pageInfo['page_description']);
     $reportInfo['page_keywords'] = addslashes($pageInfo['page_keywords']);
     $reportInfo['total_links'] = $pageInfo['total_links'];
     $reportInfo['external_links'] = $pageInfo['external'];
     $reportInfo['crawled'] = 1;

     // google pagerank check
     if ($projectInfo['check_pr']) {
         $rankCtrler = $this->createController('Rank');
         $reportInfo['pagerank'] = $rankCtrler->__getGooglePageRank(Spider::addTrailingSlash($reportUrl));
     }

     // backlinks page check
     if ($projectInfo['check_backlinks']) {
         $backlinkCtrler = $this->createController('Backlink');
         $backlinkCtrler->url = Spider::addTrailingSlash($reportUrl);
         $reportInfo['bing_backlinks'] = $backlinkCtrler->__getBacklinks('msn');
         $reportInfo['google_backlinks'] = $backlinkCtrler->__getBacklinks('google');
     }

     // indexed page check
     if ($projectInfo['check_indexed']) {
         $saturationCtrler = $this->createController('SaturationChecker');
         $saturationCtrler->url = Spider::addTrailingSlash($reportUrl);
         $reportInfo['bing_indexed'] = $saturationCtrler->__getSaturationRank('msn');
         $reportInfo['google_indexed'] = $saturationCtrler->__getSaturationRank('google');
     }

     // broken link check of the report page itself; no link of the page has been
     // selected at this point, so the page url is the one to test
     if ($projectInfo['check_brocken']) {
         $reportInfo['brocken'] = Spider::isLInkBrocken($reportUrl);
     }

     $this->saveReportInfo($reportInfo, 'update');

     // number of links handed to storePagelLinks(); only the very first call is
     // made with the delete flag set
     $i = 0;

     // to store sitelinks in page and links reports; the list is missing when the
     // crawler collected no site links
     $siteLinks = !empty($pageInfo['site_links']) ? $pageInfo['site_links'] : array();
     foreach ($siteLinks as $linkInfo) {
         // if store links
         if ($projectInfo['store_links_in_page']) {
             $delete = $i++ ? false : true;
             $linkInfo['report_id'] = $rInfo['id'];
             $this->storePagelLinks($linkInfo, $delete);
         }

         // only while total links saved is less than max links allowed for a project
         if ($totalLinks >= $projectInfo['max_links']) {
             continue;
         }

         // skip links to archives, media files, pdf documents and bare fragments
         if (preg_match('/\\.zip$|\\.gz$|\\.tar$|\\.png$|\\.jpg$|\\.jpeg$|\\.gif$|\\.mp3$|\\.flv$|\\.pdf$|\\.m4a$|#$/i', $linkInfo['link_url'])) {
             continue;
         }

         // skip links that contain nothing but whitespace after formatting
         $linkInfo['link_url'] = Spider::formatUrl($linkInfo['link_url']);
         if (!preg_match('/\\S+/', $linkInfo['link_url'])) {
             continue;
         }

         // check whether url needs to be excluded
         if ($this->isExcludeLink($linkInfo['link_url'], $projectInfo['exclude_links'])) {
             continue;
         }

         // save links for the project report, unless the page is already registered
         $existsCond = " and project_id={$projectInfo['id']} and page_url='" . addslashes($linkInfo['link_url']) . "'";
         if (!$this->getReportInfo($existsCond)) {
             // NOTE(review): page_url is passed on unescaped as before - confirm
             // whether saveReportInfo() escapes its values
             $repInfo['page_url'] = $linkInfo['link_url'];
             $repInfo['project_id'] = $projectInfo['id'];
             $this->saveReportInfo($repInfo);
             $totalLinks++;
         }
     }

     // to store external links in page
     if ($projectInfo['store_links_in_page'] && !empty($pageInfo['external_links'])) {
         foreach ($pageInfo['external_links'] as $linkInfo) {
             $delete = $i++ ? false : true;
             $linkInfo['report_id'] = $rInfo['id'];
             $linkInfo['extrenal'] = 1;
             $this->storePagelLinks($linkInfo, $delete);
         }
     }

     // calculate score of the page and update it
     $this->updateReportPageScore($rInfo['id']);

     // recalculate the score of the whole project
     $this->updateProjectPageScore($projectInfo['id']);
 }
예제 #3
0
 /**
  * Fetch a page and collect its meta data together with statistics about its links.
  *
  * The page is downloaded with getContent() and its meta data is taken from
  * WebsiteController::crawlMetaData(). Every <a> element is then inspected:
  * anchors without href and without content, and hrefs matching "mailto:",
  * "javascript:" or containing ";" are ignored. Links containing "http://" or
  * "https://" are external when their formatUrl() form does not start with the
  * formatUrl() form of $domainUrl. Other links are only counted when they start
  * with "/" or when $url is the domain url itself; they are made absolute by
  * prefixing $domainUrl.
  *
  * @param string $url        url of the page to inspect
  * @param string $domainUrl  url of the site the page belongs to
  * @param bool   $returnUrls when true, details of each link are returned as well
  *
  * @return array empty when the page has no content; otherwise the crawlMetaData()
  *               result extended with 'total_links' and 'external' counters (set as
  *               soon as a link is counted) and, if $returnUrls is true,
  *               'site_links' / 'external_links' lists whose entries hold
  *               'link_url', 'link_anchor', 'nofollow' and 'link_title'
  */
 function getPageInfo($url, $domainUrl, $returnUrls = false)
 {
     $urlWithTrailingSlash = Spider::addTrailingSlash($url);
     $ret = $this->getContent($urlWithTrailingSlash);
     $pageInfo = array();
     $checkUrl = formatUrl($domainUrl);

     if (empty($ret['page'])) {
         return $pageInfo;
     }

     $string = str_replace(array("\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
     $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

     $pattern = "/<a(.*?)>(.*?)<\\/a>/is";
     preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

     // the number of anchors is fixed, so evaluate it only once
     $anchorCount = count($matches[1]);
     for ($i = 0; $i < $anchorCount; $i++) {
         $tagAttributes = $matches[1][$i];
         $anchorHtml = $matches[2][$i];
         $href = $this->__getTagParam("href", $tagAttributes);

         // an anchor needs at least a target or some content to be considered
         if (empty($href) && empty($anchorHtml)) {
             continue;
         }

         // mail links, script links and any href containing ';' are not counted
         if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
             continue;
         }

         // the counters may not exist yet, so create them on first use instead of
         // incrementing an undefined array key
         $pageInfo['total_links'] = (isset($pageInfo['total_links']) ? $pageInfo['total_links'] : 0) + 1;
         $external = 0;

         // NOTE(review): the scheme is searched anywhere in the href, not only at its start
         if (stristr($href, 'http://') || stristr($href, 'https://')) {
             if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                 $external = 1;
                 $pageInfo['external'] = (isset($pageInfo['external']) ? $pageInfo['external'] : 0) + 1;
             }
         } else {
             if (preg_match('/^\\//', $href)) {
                 // if url starts with / then append with base url of site
                 $href = $domainUrl . $href;
             } elseif ($url == $domainUrl) {
                 $href = $domainUrl . "/" . $href;
             } else {
                 // the link can not be resolved, so take it out of the count again
                 $pageInfo['total_links'] -= 1;
                 continue;
             }

             // resolve every back directory operator, not only the first one;
             // each pass shortens $href, so the loop always terminates
             while (strpos($href, '/../') !== false) {
                 list($head, $tail) = explode('/../', $href, 2);
                 if (preg_match('#^[a-z][a-z0-9+.-]*://[^/]*$#i', $head)) {
                     // already at the site root: ignore the surplus ".." instead
                     // of cutting the host name out of the url
                     $href = $head . '/' . $tail;
                     continue;
                 }
                 $lastSlash = strrpos($head, '/');
                 if ($lastSlash === false) {
                     // no parent directory left to remove
                     break;
                 }
                 $href = substr($head, 0, $lastSlash + 1) . $tail;
             }
         }

         // if details of urls to be checked
         if ($returnUrls) {
             // for image links the alt text of the image is used as anchor text
             if (stristr($anchorHtml, '<img')) {
                 $linkAnchor = $this->__getTagParam("alt", $anchorHtml);
             } else {
                 $linkAnchor = strip_tags($anchorHtml);
             }

             $linkInfo = array(
                 'link_url' => $href,
                 'link_anchor' => $linkAnchor,
                 'nofollow' => stristr($tagAttributes, 'nofollow') ? 1 : 0,
                 'link_title' => $this->__getTagParam("title", $tagAttributes),
             );

             if ($external) {
                 $pageInfo['external_links'][] = $linkInfo;
             } else {
                 $pageInfo['site_links'][] = $linkInfo;
             }
         }
     }

     return $pageInfo;
 }
예제 #4
0
 /**
  * Create the sitemap for the pages recorded in a site auditor project.
  *
  * An error message is shown when no project id is given. Otherwise the sitemap
  * settings are copied to the object, all report pages of the project are loaded,
  * pages matching the exclude list are dropped and the remaining urls are handed
  * to createSitemap().
  *
  * @param array $sitemapInfo sitemap options; the keys read here are 'project_id',
  *                           'sm_type', 'exclude_url', 'freq' and 'priority'
  *
  * @return void
  */
 function generateSitemapFile($sitemapInfo)
 {
     $sitemapInfo['project_id'] = intval($sitemapInfo['project_id']);

     // nothing can be generated without a project
     if (empty($sitemapInfo['project_id'])) {
         hideDiv('message');
         showErrorMsg("No Website Found!");
         return;
     }

     # check whether the sitemap directory is writable
     # NOTE(review): processing goes on after the message unless showErrorMsg()
     # itself stops the request - not visible from here
     $targetDir = SP_TMPPATH . "/" . $this->sitemapDir;
     if (!is_writable($targetDir)) {
         hideDiv('message');
         showErrorMsg("Directory '<b>" . $targetDir . "</b>' is not <b>writable</b>. Please change its <b>permission</b> !");
     }

     $auditorCtrler = $this->createController('SiteAuditor');
     $projectInfo = $auditorCtrler->__getProjectInfo($sitemapInfo['project_id']);

     # take over the sitemap settings; frequency and priority keep their current
     # values when none is supplied
     $this->section = formatFileName($projectInfo['name']);
     $this->smType = $sitemapInfo['sm_type'];
     $this->excludeUrl = $sitemapInfo['exclude_url'];
     if (!empty($sitemapInfo['freq'])) {
         $this->changefreq = $sitemapInfo['freq'];
     }
     if (!empty($sitemapInfo['priority'])) {
         $this->priority = $sitemapInfo['priority'];
     }

     # collect the urls of all report pages that are not excluded
     $auditorComp = $this->createComponent('AuditorComponent');
     $reportPages = $auditorComp->getAllreportPages(" and project_id={$sitemapInfo['project_id']}");
     $urlList = array();
     foreach ($reportPages as $reportPage) {
         $pageUrl = Spider::addTrailingSlash($reportPage['page_url']);
         if (!$auditorComp->isExcludeLink($pageUrl, trim($sitemapInfo['exclude_url']))) {
             $urlList[] = $pageUrl;
         }
     }

     $this->createSitemap($this->smType, $urlList);
 }