/**
 * Fetch a page and collect its meta data plus statistics about the links it contains.
 *
 * The page is downloaded through $this->getContent(); meta data comes from
 * WebsiteController::crawlMetaData(). On top of whatever that call returns, this
 * method maintains the following keys:
 *  - total_links    number of usable anchors found on the page
 *  - external       number of absolute links that do not start with $domainUrl
 *  - site_links     (only when $returnUrls) list of internal link details
 *  - external_links (only when $returnUrls) list of external link details
 *
 * Each link detail holds link_url, link_anchor, nofollow (1/0) and link_title.
 *
 * @param string $url        Url of the page to crawl
 * @param string $domainUrl  Url of the project / website the page belongs to
 * @param bool   $returnUrls Whether the details of every link should be returned as well
 *
 * @return array Page information; an empty array when no page content could be fetched
 */
function getPageInfo($url, $domainUrl, $returnUrls = false)
{
    $urlWithTrailingSlash = Spider::addTrailingSlash($url);
    $ret = $this->getContent($urlWithTrailingSlash);
    $pageInfo = array();
    $checkUrl = formatUrl($domainUrl);

    // if relative links of a page needs to be checked
    $relativeUrl = "";
    if (SP_RELATIVE_LINK_CRAWL) {
        $relativeUrl = $domainUrl . $this->getRelativeUrl($url);
    }

    // find main domain host link, WITHOUT trailing slash: it is only ever joined with
    // hrefs that start with "/" so a trailing slash would produce "host//path".
    // The port is kept so sites running on a non default port resolve correctly.
    $domainHostInfo = parse_url($domainUrl);
    if (is_array($domainHostInfo) && !empty($domainHostInfo['scheme']) && !empty($domainHostInfo['host'])) {
        $domainHostLink = $domainHostInfo['scheme'] . "://" . $domainHostInfo['host'];
        if (!empty($domainHostInfo['port'])) {
            $domainHostLink .= ":" . $domainHostInfo['port'];
        }
    } else {
        // domain url without scheme / host: fall back to the url as given
        $domainHostLink = rtrim($domainUrl, '/');
    }

    if (empty($ret['page'])) {
        return $pageInfo;
    }

    // strip real line breaks (double quotes are required for "\r" to be a carriage return)
    $string = str_replace(array("\r\n", "\n", "\r"), "", $ret['page']);
    $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);
    if (!is_array($pageInfo)) {
        $pageInfo = array();
    }

    // make sure the counters exist before they are incremented
    foreach (array('total_links', 'external') as $counter) {
        if (!isset($pageInfo[$counter])) {
            $pageInfo[$counter] = 0;
        }
    }

    // check whether base url tag is there
    $baseTagUrl = "";
    if (preg_match("/<base (.*?)>/is", $string, $match)) {
        $baseTagUrl = $this->__getTagParam("href", $match[1]);

        // a base tag without href must not turn into a bare "/" prefix
        if (!empty($baseTagUrl)) {
            $baseTagUrl = $this->addTrailingSlash($baseTagUrl);
        }
    }

    // "\b" keeps tags like <abbr>, <article> or <aside> from being treated as anchors
    preg_match_all('/<a\b(.*?)>(.*?)<\/a>/is', $string, $matches, PREG_PATTERN_ORDER);
    $anchorCount = count($matches[1]);

    // loop through matches
    for ($i = 0; $i < $anchorCount; $i++) {
        $href = $this->__getTagParam("href", $matches[1][$i]);

        // skip anchors having neither a target nor a content
        if (empty($href) && empty($matches[2][$i])) {
            continue;
        }

        // skip mail and javascript links
        if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
            continue;
        }

        $pageInfo['total_links'] += 1;
        $external = 0;

        // only a link STARTING with a http(s) scheme is absolute; a url that merely
        // appears inside the href (e.g. as redirect parameter) does not count
        if (preg_match('/^\s*https?:\/\//i', $href)) {
            if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                $external = 1;
                $pageInfo['external'] += 1;
            }
        } else {
            if (preg_match('/^\//', $href)) {
                // url starts with / => relative to the root of the host
                $href = $domainHostLink . $href;
            } elseif (!empty($baseTagUrl)) {
                $href = $baseTagUrl . $href;
            } elseif ($url == $domainUrl) {
                $href = $domainUrl . "/" . $href;
            } elseif (SP_RELATIVE_LINK_CRAWL) {
                $href = $relativeUrl . "/" . $href;
            } else {
                // relative link which can not be resolved => do not count it
                $pageInfo['total_links'] -= 1;
                continue;
            }

            // resolve every back directory operator, one "/../" per pass. The string
            // gets shorter on each pass so the loop always terminates.
            while (strpos($href, '/../') !== false) {
                list($head, $tail) = explode('/../', $href, 2);
                $schemePos = strpos($head, '://');
                $hostStart = ($schemePos === false) ? 0 : $schemePos + 3;
                $lastSlash = strrpos($head, '/');

                // drop the last path segment, but never climb above the host itself
                if ($lastSlash !== false && $lastSlash >= $hostStart) {
                    $head = substr($head, 0, $lastSlash);
                }

                $href = $head . '/' . $tail;
            }
        }

        // if details of urls to be checked
        if ($returnUrls) {
            $linkInfo = array();
            $linkInfo['link_url'] = $href;

            // image links use the alt text of the image as anchor
            if (stristr($matches[2][$i], '<img')) {
                $linkInfo['link_anchor'] = $this->__getTagParam("alt", $matches[2][$i]);
            } else {
                $linkInfo['link_anchor'] = strip_tags($matches[2][$i]);
            }

            $linkInfo['nofollow'] = stristr($matches[1][$i], 'nofollow') ? 1 : 0;
            $linkInfo['link_title'] = $this->__getTagParam("title", $matches[1][$i]);

            if ($external) {
                $pageInfo['external_links'][] = $linkInfo;
            } else {
                $pageInfo['site_links'][] = $linkInfo;
            }
        }
    }

    return $pageInfo;
}
/**
 * Crawl one page of an audit project, update its report and register the links found on it.
 *
 * Nothing is done when no report row exists for the page in the project. Otherwise:
 *  - the report row is updated with the meta data and link counts of the page, plus the
 *    optional rank / backlink / index / broken link checks enabled in the project
 *  - the links of the page are stored when 'store_links_in_page' is enabled
 *  - internal links not yet known to the project are saved as new report rows, as long
 *    as $totalLinks stays below the 'max_links' value of the project
 *  - the page score and the project score are recalculated
 *
 * @param string $reportUrl   Url of the page to be crawled
 * @param array  $projectInfo Project settings (id, url, max_links, exclude_links, check_* flags ...)
 * @param int    $totalLinks  Number of links already saved for the project; passed by value,
 *                            so the caller does not see the updated count
 *
 * @return void
 */
function runReport($reportUrl, $projectInfo, $totalLinks)
{
    $spider = new Spider();
    $pageInfo = $spider->getPageInfo($reportUrl, $projectInfo['url'], true);
    if (!is_array($pageInfo)) {
        $pageInfo = array();
    }

    // values placed inside the sql conditions are escaped, since crawled urls are untrusted
    $projectId = intval($projectInfo['id']);
    $rInfo = $this->getReportInfo(" and project_id={$projectId} and page_url='" . addslashes($reportUrl) . "'");
    if (!$rInfo) {
        return;
    }

    // a page which could not be fetched returns no meta data at all
    $reportInfo = array();
    $reportInfo['id'] = $rInfo['id'];
    $reportInfo['page_title'] = addslashes(isset($pageInfo['page_title']) ? $pageInfo['page_title'] : '');
    $reportInfo['page_description'] = addslashes(isset($pageInfo['page_description']) ? $pageInfo['page_description'] : '');
    $reportInfo['page_keywords'] = addslashes(isset($pageInfo['page_keywords']) ? $pageInfo['page_keywords'] : '');
    $reportInfo['total_links'] = isset($pageInfo['total_links']) ? $pageInfo['total_links'] : 0;
    $reportInfo['external_links'] = isset($pageInfo['external']) ? $pageInfo['external'] : 0;
    $reportInfo['crawled'] = 1;

    // google pagerank check
    if ($projectInfo['check_pr']) {
        $rankCtrler = $this->createController('Rank');
        $reportInfo['pagerank'] = $rankCtrler->__getGooglePageRank(Spider::addTrailingSlash($reportUrl));
    }

    // backlinks page check
    if ($projectInfo['check_backlinks']) {
        $backlinkCtrler = $this->createController('Backlink');
        $backlinkCtrler->url = Spider::addTrailingSlash($reportUrl);
        $reportInfo['bing_backlinks'] = $backlinkCtrler->__getBacklinks('msn');
        $reportInfo['google_backlinks'] = $backlinkCtrler->__getBacklinks('google');
    }

    // indexed page check
    if ($projectInfo['check_indexed']) {
        $saturationCtrler = $this->createController('SaturationChecker');
        $saturationCtrler->url = Spider::addTrailingSlash($reportUrl);
        $reportInfo['bing_indexed'] = $saturationCtrler->__getSaturationRank('msn');
        $reportInfo['google_indexed'] = $saturationCtrler->__getSaturationRank('google');
    }

    // broken link check of the report page itself; no link of the page has been
    // looked at yet at this point, so the page url is the one to be verified
    if ($projectInfo['check_brocken']) {
        $reportInfo['brocken'] = Spider::isLInkBrocken($reportUrl);
    }

    $this->saveReportInfo($reportInfo, 'update');

    // the link lists are only present when the page actually contained such links
    $siteLinks = (isset($pageInfo['site_links']) && is_array($pageInfo['site_links'])) ? $pageInfo['site_links'] : array();
    $externalLinks = (isset($pageInfo['external_links']) && is_array($pageInfo['external_links'])) ? $pageInfo['external_links'] : array();

    // number of links stored for the page so far; the first stored link passes
    // $delete = true to storePagelLinks()
    $i = 0;

    // to store sitelinks in page and links reports
    foreach ($siteLinks as $linkInfo) {

        // if store links
        if ($projectInfo['store_links_in_page']) {
            $delete = $i++ ? false : true;
            $linkInfo['report_id'] = $rInfo['id'];
            $this->storePagelLinks($linkInfo, $delete);
        }

        // if total links saved reached max links allowed for a project
        if ($totalLinks >= $projectInfo['max_links']) {
            continue;
        }

        // check whether valid html serving link
        if (preg_match('/\\.zip$|\\.gz$|\\.tar$|\\.png$|\\.jpg$|\\.jpeg$|\\.gif$|\\.mp3$|\\.flv$|\\.pdf$|\\.m4a$|#$/i', $linkInfo['link_url'])) {
            continue;
        }

        // skip links having nothing but spaces after formatting
        $linkInfo['link_url'] = Spider::formatUrl($linkInfo['link_url']);
        if (!preg_match('/\\S+/', $linkInfo['link_url'])) {
            continue;
        }

        // check whether url needs to be excluded
        if ($this->isExcludeLink($linkInfo['link_url'], $projectInfo['exclude_links'])) {
            continue;
        }

        // save links for the project report
        if (!$this->getReportInfo(" and project_id={$projectId} and page_url='" . addslashes($linkInfo['link_url']) . "'")) {
            // NOTE(review): the url is handed over unescaped as before - confirm
            // whether saveReportInfo() escapes the values of new rows itself
            $repInfo = array();
            $repInfo['page_url'] = $linkInfo['link_url'];
            $repInfo['project_id'] = $projectInfo['id'];
            $this->saveReportInfo($repInfo);
            $totalLinks++;
        }
    }

    // to store external links in page
    if ($projectInfo['store_links_in_page']) {
        foreach ($externalLinks as $linkInfo) {
            $delete = $i++ ? false : true;
            $linkInfo['report_id'] = $rInfo['id'];
            $linkInfo['extrenal'] = 1;
            $this->storePagelLinks($linkInfo, $delete);
        }
    }

    // calculate score of the page and update it
    $this->updateReportPageScore($rInfo['id']);

    // calculate score of the project and update it
    $this->updateProjectPageScore($projectInfo['id']);
}
/**
 * Fetch a page and collect its meta data plus statistics about the links it contains.
 *
 * The page is downloaded through $this->getContent(); meta data comes from
 * WebsiteController::crawlMetaData(). On top of whatever that call returns, this
 * method maintains the following keys:
 *  - total_links    number of usable anchors found on the page
 *  - external       number of absolute links that do not start with $domainUrl
 *  - site_links     (only when $returnUrls) list of internal link details
 *  - external_links (only when $returnUrls) list of external link details
 *
 * Relative links are only resolved when they start with "/" or when the crawled
 * page is the domain url itself; all other relative links are ignored.
 *
 * @param string $url        Url of the page to crawl
 * @param string $domainUrl  Url of the project / website the page belongs to
 * @param bool   $returnUrls Whether the details of every link should be returned as well
 *
 * @return array Page information; an empty array when no page content could be fetched
 */
function getPageInfo($url, $domainUrl, $returnUrls = false)
{
    $urlWithTrailingSlash = Spider::addTrailingSlash($url);
    $ret = $this->getContent($urlWithTrailingSlash);
    $pageInfo = array();
    $checkUrl = formatUrl($domainUrl);

    if (empty($ret['page'])) {
        return $pageInfo;
    }

    // root of the host (no trailing slash) used for links starting with "/". Such links
    // are relative to the host, not to a sub directory the domain url may point to.
    $hostInfo = parse_url($domainUrl);
    if (is_array($hostInfo) && !empty($hostInfo['scheme']) && !empty($hostInfo['host'])) {
        $siteRoot = $hostInfo['scheme'] . "://" . $hostInfo['host'];
        if (!empty($hostInfo['port'])) {
            $siteRoot .= ":" . $hostInfo['port'];
        }
    } else {
        // domain url without scheme / host: fall back to the url as given
        $siteRoot = rtrim($domainUrl, '/');
    }

    // strip real line breaks (double quotes are required for "\r" to be a carriage return)
    $string = str_replace(array("\r\n", "\n", "\r"), "", $ret['page']);
    $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);
    if (!is_array($pageInfo)) {
        $pageInfo = array();
    }

    // make sure the counters exist before they are incremented
    foreach (array('total_links', 'external') as $counter) {
        if (!isset($pageInfo[$counter])) {
            $pageInfo[$counter] = 0;
        }
    }

    // "\b" keeps tags like <abbr>, <article> or <aside> from being treated as anchors
    preg_match_all('/<a\b(.*?)>(.*?)<\/a>/is', $string, $matches, PREG_PATTERN_ORDER);
    $anchorCount = count($matches[1]);

    for ($i = 0; $i < $anchorCount; $i++) {
        $href = $this->__getTagParam("href", $matches[1][$i]);

        // skip anchors having neither a target nor a content
        if (empty($href) && empty($matches[2][$i])) {
            continue;
        }

        // skip mail and javascript links
        if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
            continue;
        }

        $pageInfo['total_links'] += 1;
        $external = 0;

        // only a link STARTING with a http(s) scheme is absolute; a url that merely
        // appears inside the href (e.g. as redirect parameter) does not count
        if (preg_match('/^\s*https?:\/\//i', $href)) {
            if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                $external = 1;
                $pageInfo['external'] += 1;
            }
        } else {
            if (preg_match('/^\//', $href)) {
                // url starts with / => relative to the root of the host
                $href = $siteRoot . $href;
            } elseif ($url == $domainUrl) {
                $href = $domainUrl . "/" . $href;
            } else {
                // relative link which can not be resolved => do not count it
                $pageInfo['total_links'] -= 1;
                continue;
            }

            // resolve every back directory operator, one "/../" per pass. The string
            // gets shorter on each pass so the loop always terminates.
            while (strpos($href, '/../') !== false) {
                list($head, $tail) = explode('/../', $href, 2);
                $schemePos = strpos($head, '://');
                $hostStart = ($schemePos === false) ? 0 : $schemePos + 3;
                $lastSlash = strrpos($head, '/');

                // drop the last path segment, but never climb above the host itself
                if ($lastSlash !== false && $lastSlash >= $hostStart) {
                    $head = substr($head, 0, $lastSlash);
                }

                $href = $head . '/' . $tail;
            }
        }

        // if details of urls to be checked
        if ($returnUrls) {
            $linkInfo = array();
            $linkInfo['link_url'] = $href;

            // image links use the alt text of the image as anchor
            if (stristr($matches[2][$i], '<img')) {
                $linkInfo['link_anchor'] = $this->__getTagParam("alt", $matches[2][$i]);
            } else {
                $linkInfo['link_anchor'] = strip_tags($matches[2][$i]);
            }

            $linkInfo['nofollow'] = stristr($matches[1][$i], 'nofollow') ? 1 : 0;
            $linkInfo['link_title'] = $this->__getTagParam("title", $matches[1][$i]);

            if ($external) {
                $pageInfo['external_links'][] = $linkInfo;
            } else {
                $pageInfo['site_links'][] = $linkInfo;
            }
        }
    }

    return $pageInfo;
}
/**
 * Build the sitemap of a site auditor project from the pages found in its reports.
 *
 * Reads 'project_id', 'sm_type', 'exclude_url' and the optional 'freq' / 'priority'
 * values from $sitemapInfo, stores them on the object, collects the report page urls
 * of the project which are not excluded and passes them to $this->createSitemap().
 * An error message is shown when no project id is given or when the sitemap
 * directory is not writable.
 *
 * @param array $sitemapInfo Sitemap settings as described above
 *
 * @return void
 */
function generateSitemapFile($sitemapInfo)
{
    $projectId = intval($sitemapInfo['project_id']);

    // nothing to generate without a project
    if (empty($projectId)) {
        hideDiv('message');
        showErrorMsg("No Website Found!");
        return;
    }

    // check whether the sitemap directory is writable
    $sitemapPath = SP_TMPPATH . "/" . $this->sitemapDir;
    if (!is_writable($sitemapPath)) {
        hideDiv('message');
        showErrorMsg("Directory '<b>" . $sitemapPath . "</b>' is not <b>writable</b>. Please change its <b>permission</b> !");
    }

    // the sitemap section is named after the project
    $auditorCtrler = $this->createController('SiteAuditor');
    $project = $auditorCtrler->__getProjectInfo($projectId);
    $this->section = formatFileName($project['name']);

    // settings of the sitemap; frequency and priority are only set when given
    $this->smType = $sitemapInfo['sm_type'];
    $this->excludeUrl = $sitemapInfo['exclude_url'];

    if (!empty($sitemapInfo['freq'])) {
        $this->changefreq = $sitemapInfo['freq'];
    }

    if (!empty($sitemapInfo['priority'])) {
        $this->priority = $sitemapInfo['priority'];
    }

    // collect the urls of all report pages which are not excluded
    $auditor = $this->createComponent('AuditorComponent');
    $reportPages = $auditor->getAllreportPages(" and project_id=" . $projectId);
    $urlList = array();

    foreach ($reportPages as $reportPage) {
        $pageUrl = Spider::addTrailingSlash($reportPage['page_url']);

        if (!$auditor->isExcludeLink($pageUrl, trim($sitemapInfo['exclude_url']))) {
            $urlList[] = $pageUrl;
        }
    }

    $this->createSitemap($this->smType, $urlList);
}