$controller->updateWebsite($_POST); break; } } else { switch ($_GET['sec']) { case "Activate": $controller->__changeStatus($_GET['websiteId'], 1); $controller->listWebsites($_GET); break; case "Inactivate": $controller->__changeStatus($_GET['websiteId'], 0); $controller->listWebsites($_GET); break; case "delete": $controller->__deleteWebsite($_GET['websiteId']); $controller->listWebsites($_GET); break; case "edit": $controller->editWebsite($_GET['websiteId']); break; case "new": $controller->newWebsite($_GET); break; case "crawlmeta": $controller->crawlMetaData(urldecode($_GET['url'])); break; default: $controller->listWebsites($_GET); break; } }
/**
 * Fetch a page and collect its metadata together with statistics about its anchors.
 *
 * The page body is obtained through $this->getContent(), the metadata through
 * WebsiteController::crawlMetaData(), and every <a> element is then inspected.
 * Links containing "mailto:", "javascript:" or a ";" are ignored. Relative links
 * can only be resolved when they start with "/" or when $url equals $domainUrl;
 * any other relative link is left out of the counts.
 *
 * @param string $url        Address of the page to analyse.
 * @param string $domainUrl  Base address of the site; used to tell internal from
 *                           external links and as prefix for relative links.
 * @param bool   $returnUrls When true, details of each link are added under the
 *                           'site_links' / 'external_links' keys.
 *
 * @return array Empty array when no page content was fetched. Otherwise the data
 *               returned by crawlMetaData() extended with 'total_links' and
 *               'external' counters (and the link lists when requested).
 */
function getPageInfo($url, $domainUrl, $returnUrls = false) {
    $urlWithTrailingSlash = Spider::addTrailingSlash($url);
    $ret = $this->getContent($urlWithTrailingSlash);
    $pageInfo = array();
    $checkUrl = formatUrl($domainUrl);

    if (empty($ret['page'])) {
        return $pageInfo;
    }

    // "\r" is added here: the single-quoted entries only match a literal
    // backslash sequence, so real carriage returns were never removed before.
    $string = str_replace(array("\r", "\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
    $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

    // Make sure the counters exist before they are incremented below.
    if (!is_array($pageInfo)) {
        $pageInfo = array();
    }
    foreach (array('total_links', 'external') as $counter) {
        if (!isset($pageInfo[$counter])) {
            $pageInfo[$counter] = 0;
        }
    }

    $pattern = "/<a(.*?)>(.*?)<\\/a>/is";
    preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

    $anchorCount = count($matches[1]);
    for ($i = 0; $i < $anchorCount; $i++) {
        $href = $this->__getTagParam("href", $matches[1][$i]);

        // skip anchors that have neither a target nor any content
        if (empty($href) && empty($matches[2][$i])) {
            continue;
        }

        // skip mail links, script links and anything containing ";"
        if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
            continue;
        }

        $pageInfo['total_links'] += 1;
        $external = 0;

        if (stristr($href, 'http://') || stristr($href, 'https://')) {
            // absolute link: external when it does not start with the site url
            if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                $external = 1;
                $pageInfo['external'] += 1;
            }
        } else {
            if (preg_match('/^\\//', $href)) {
                // link starting with / is appended to the base url of the site
                $href = $domainUrl . $href;
            } elseif ($url == $domainUrl) {
                $href = $domainUrl . "/" . $href;
            } else {
                // relative link that can not be resolved from here
                $pageInfo['total_links'] -= 1;
                continue;
            }

            // Resolve every back directory operator, not only the first one.
            // Splitting with a limit of 2 keeps the remainder of the url intact;
            // each pass shortens $href, so the loop always terminates.
            while (strpos($href, '/../') !== false) {
                $hrefParts = explode('/../', $href, 2);
                $parentDir = preg_match('/.*\\//', $hrefParts[0], $matchpart) ? $matchpart[0] : '';
                $href = $parentDir . $hrefParts[1];
            }
        }

        if (!$returnUrls) {
            continue;
        }

        // collect the details of the link
        $linkInfo = array();
        $linkInfo['link_url'] = $href;
        if (stristr($matches[2][$i], '<img')) {
            // image link: use the alt text as anchor
            $linkInfo['link_anchor'] = $this->__getTagParam("alt", $matches[2][$i]);
        } else {
            $linkInfo['link_anchor'] = strip_tags($matches[2][$i]);
        }
        $linkInfo['nofollow'] = stristr($matches[1][$i], 'nofollow') ? 1 : 0;
        $linkInfo['link_title'] = $this->__getTagParam("title", $matches[1][$i]);

        if ($external) {
            $pageInfo['external_links'][] = $linkInfo;
        } else {
            $pageInfo['site_links'][] = $linkInfo;
        }
    }

    return $pageInfo;
}
/**
 * Fetch a page and collect its metadata together with statistics about its anchors.
 *
 * The page body is obtained through $this->getContent(), the metadata through
 * WebsiteController::crawlMetaData(), and every <a> element is then inspected.
 * Links containing "mailto:", "javascript:" or a ";" are ignored. Relative links
 * are resolved, in this order, against: the host of $domainUrl (links starting
 * with "/"), the href of a <base> tag, $domainUrl itself (when $url equals it),
 * or the relative url of the page when SP_RELATIVE_LINK_CRAWL is enabled. A
 * relative link matching none of these is left out of the counts.
 *
 * @param string $url        Address of the page to analyse.
 * @param string $domainUrl  Base address of the site; used to tell internal from
 *                           external links and as prefix for relative links.
 * @param bool   $returnUrls When true, details of each link are added under the
 *                           'site_links' / 'external_links' keys.
 *
 * @return array Empty array when no page content was fetched. Otherwise the data
 *               returned by crawlMetaData() extended with 'total_links' and
 *               'external' counters (and the link lists when requested).
 */
function getPageInfo($url, $domainUrl, $returnUrls = false) {
    $urlWithTrailingSlash = Spider::addTrailingSlash($url);
    $ret = $this->getContent($urlWithTrailingSlash);
    $pageInfo = array();
    $checkUrl = formatUrl($domainUrl);

    // if relative links of a page needs to be checked
    $relativeUrl = "";
    if (SP_RELATIVE_LINK_CRAWL) {
        $relativeUrl = $domainUrl . $this->getRelativeUrl($url);
    }

    // Find the main domain host link (always ends with a slash). The port is
    // kept so that sites on a non-default port resolve to the right host, and
    // an unparsable $domainUrl falls back to the url itself.
    $domainHostInfo = parse_url($domainUrl);
    if (is_array($domainHostInfo) && !empty($domainHostInfo['scheme']) && !empty($domainHostInfo['host'])) {
        $domainHostLink = $domainHostInfo['scheme'] . "://" . $domainHostInfo['host'];
        if (!empty($domainHostInfo['port'])) {
            $domainHostLink .= ":" . $domainHostInfo['port'];
        }
        $domainHostLink .= "/";
    } else {
        $domainHostLink = rtrim($domainUrl, '/') . "/";
    }

    if (empty($ret['page'])) {
        return $pageInfo;
    }

    // "\r" is added here: the single-quoted entries only match a literal
    // backslash sequence, so real carriage returns were never removed before.
    $string = str_replace(array("\r", "\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
    $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

    // Make sure the counters exist before they are incremented below.
    if (!is_array($pageInfo)) {
        $pageInfo = array();
    }
    foreach (array('total_links', 'external') as $counter) {
        if (!isset($pageInfo[$counter])) {
            $pageInfo[$counter] = 0;
        }
    }

    // check whether base url tag is there
    $baseTagUrl = "";
    if (preg_match("/<base (.*?)>/is", $string, $match)) {
        $baseTagUrl = $this->__getTagParam("href", $match[1]);
        // a base tag without href must not turn into a bare "/" prefix
        if (!empty($baseTagUrl)) {
            $baseTagUrl = $this->addTrailingSlash($baseTagUrl);
        }
    }

    $pattern = "/<a(.*?)>(.*?)<\\/a>/is";
    preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

    // loop through matches
    $anchorCount = count($matches[1]);
    for ($i = 0; $i < $anchorCount; $i++) {
        $href = $this->__getTagParam("href", $matches[1][$i]);

        // skip anchors that have neither a target nor any content
        if (empty($href) && empty($matches[2][$i])) {
            continue;
        }

        // skip mail links, script links and anything containing ";"
        if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
            continue;
        }

        $pageInfo['total_links'] += 1;
        $external = 0;

        if (stristr($href, 'http://') || stristr($href, 'https://')) {
            // absolute link: external when it does not start with the site url
            if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                $external = 1;
                $pageInfo['external'] += 1;
            }
        } else {
            if (preg_match('/^\\//', $href)) {
                // $domainHostLink already ends with "/", so drop the leading
                // slash of the link to avoid "http://host//path"
                $href = $domainHostLink . substr($href, 1);
            } elseif (!empty($baseTagUrl)) {
                $href = $baseTagUrl . $href;
            } elseif ($url == $domainUrl) {
                $href = $domainUrl . "/" . $href;
            } elseif (SP_RELATIVE_LINK_CRAWL) {
                $href = $relativeUrl . "/" . $href;
            } else {
                // relative link that can not be resolved from here
                $pageInfo['total_links'] -= 1;
                continue;
            }

            // Resolve every back directory operator, not only the first one.
            // Splitting with a limit of 2 keeps the remainder of the url intact;
            // each pass shortens $href, so the loop always terminates.
            while (strpos($href, '/../') !== false) {
                $hrefParts = explode('/../', $href, 2);
                $parentDir = preg_match('/.*\\//', $hrefParts[0], $matchpart) ? $matchpart[0] : '';
                $href = $parentDir . $hrefParts[1];
            }
        }

        if (!$returnUrls) {
            continue;
        }

        // collect the details of the link
        $linkInfo = array();
        $linkInfo['link_url'] = $href;
        if (stristr($matches[2][$i], '<img')) {
            // image link: use the alt text as anchor
            $linkInfo['link_anchor'] = $this->__getTagParam("alt", $matches[2][$i]);
        } else {
            $linkInfo['link_anchor'] = strip_tags($matches[2][$i]);
        }
        $linkInfo['nofollow'] = stristr($matches[1][$i], 'nofollow') ? 1 : 0;
        $linkInfo['link_title'] = $this->__getTagParam("title", $matches[1][$i]);

        if ($external) {
            $pageInfo['external_links'][] = $linkInfo;
        } else {
            $pageInfo['site_links'][] = $linkInfo;
        }
    }

    return $pageInfo;
}
$controller->__changeStatus($id, 0); } } $controller->listWebsites($_POST); break; case "deleteall": if (!empty($_POST['ids'])) { foreach ($_POST['ids'] as $id) { $controller->__deleteWebsite($id); } } $controller->listWebsites($_POST); break; case "crawlmeta": $keyInput = empty($_POST['keyinput']) ? "" : $_POST['keyinput']; $controller->crawlMetaData(urldecode($_POST['url']), $keyInput); break; } } else { switch ($_GET['sec']) { case "Activate": $controller->__changeStatus($_GET['websiteId'], 1); $controller->listWebsites($_GET); break; case "Inactivate": $controller->__changeStatus($_GET['websiteId'], 0); $controller->listWebsites($_GET); break; case "delete": $controller->__deleteWebsite($_GET['websiteId']); $controller->listWebsites($_GET);