Exemple #1
0
            $controller->updateWebsite($_POST);
            break;
    }
} else {
    // Dispatch a GET request on its "sec" parameter. A request without "sec"
    // is routed to the default branch, so the parameter is read through isset()
    // to avoid an undefined-index notice/warning on that request.
    // NOTE(review): Activate/Inactivate/delete modify data on a plain GET with
    // no token check visible in this block - confirm CSRF protection upstream.
    switch (isset($_GET['sec']) ? $_GET['sec'] : '') {
        case "Activate":
            // mark the website as active (status 1), then call the listing
            $controller->__changeStatus($_GET['websiteId'], 1);
            $controller->listWebsites($_GET);
            break;
        case "Inactivate":
            // mark the website as inactive (status 0), then call the listing
            $controller->__changeStatus($_GET['websiteId'], 0);
            $controller->listWebsites($_GET);
            break;
        case "delete":
            $controller->__deleteWebsite($_GET['websiteId']);
            $controller->listWebsites($_GET);
            break;
        case "edit":
            $controller->editWebsite($_GET['websiteId']);
            break;
        case "new":
            $controller->newWebsite($_GET);
            break;
        case "crawlmeta":
            // NOTE(review): PHP already URL-decodes $_GET values, so this decodes
            // a second time; left unchanged in case the client double-encodes - confirm.
            $controller->crawlMetaData(urldecode($_GET['url']));
            break;
        default:
            $controller->listWebsites($_GET);
            break;
    }
}
 /**
  * Fetch a page and gather its meta data together with link statistics.
  *
  * The page is downloaded with getContent(), its meta data is taken from
  * WebsiteController::crawlMetaData(), and every anchor tag is then classified
  * as internal or external by comparing it with the formatted $domainUrl.
  *
  * @param string $url        Address of the page to inspect.
  * @param string $domainUrl  Base address of the site; links whose formatted url
  *                           does not start with it are counted as external.
  * @param bool   $returnUrls When true, details of each link are collected in
  *                           'external_links' and 'site_links'.
  *
  * @return array Empty array when no page content was fetched. Otherwise the
  *               crawlMetaData() result with 'total_links' and 'external'
  *               incremented per link found (plus the link lists if requested).
  */
 function getPageInfo($url, $domainUrl, $returnUrls = false)
 {
     $urlWithTrailingSlash = Spider::addTrailingSlash($url);
     $ret = $this->getContent($urlWithTrailingSlash);
     $pageInfo = array();
     $checkUrl = formatUrl($domainUrl);

     // nothing fetched, nothing to analyse
     if (empty($ret['page'])) {
         return $pageInfo;
     }

     // NOTE(review): the single-quoted entries are literal backslash sequences,
     // not CR/LF control characters - only "\n" strips real line breaks. Kept as is.
     $string = str_replace(array("\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
     $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

     // The lookahead restricts matches to real anchor tags; a bare "<a(.*?)>"
     // would also pick up <abbr>, <area>, <article>, ...
     $pattern = '/<a(?=[\s>])(.*?)>(.*?)<\/a>/is';
     preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

     $linkCount = count($matches[1]);
     for ($i = 0; $i < $linkCount; $i++) {
         $attributes = $matches[1][$i];
         $anchorHtml = $matches[2][$i];
         $href = $this->__getTagParam("href", $attributes);

         // ignore anchors that have neither a target nor any content
         if (empty($href) && empty($anchorHtml)) {
             continue;
         }

         // ignore mail links, script links and anything containing ';'
         if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
             continue;
         }

         $external = 0;
         // Only a leading scheme makes a link absolute; a scheme inside a query
         // string (e.g. "/go?u=http://x") must not.
         if (preg_match('#^\s*https?://#i', $href)) {
             if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                 $external = 1;
             }
         } else {
             // if url starts with / then append with base url of site
             if (preg_match('/^\\//', $href)) {
                 $href = $domainUrl . $href;
             } elseif ($url == $domainUrl) {
                 $href = $domainUrl . "/" . $href;
             } else {
                 // relative link on an inner page: cannot be resolved here, not counted
                 continue;
             }

             // Resolve every "/../" segment, one at a time, without climbing above
             // the host part (previously only the first one was handled and text
             // after a second one was lost).
             if (strpos($href, '/../') !== false) {
                 $hostPart = preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*#i', $href, $hostMatch) ? $hostMatch[0] : '';
                 $pathPart = (string) substr($href, strlen($hostPart));
                 while (strpos($pathPart, '/../') !== false) {
                     $hrefParts = explode('/../', $pathPart, 2);
                     if (preg_match('/.*\\//', $hrefParts[0], $matchpart)) {
                         $parentDir = $matchpart[0];
                     } else {
                         // already at the top: stay at the root of the host
                         $parentDir = ($hostPart === '') ? '' : '/';
                     }
                     $pathPart = $parentDir . $hrefParts[1];
                 }
                 $href = $hostPart . $pathPart;
             }
         }

         // counters may not be provided by crawlMetaData(), so start them at 0
         $pageInfo['total_links'] = (isset($pageInfo['total_links']) ? $pageInfo['total_links'] : 0) + 1;
         if ($external) {
             $pageInfo['external'] = (isset($pageInfo['external']) ? $pageInfo['external'] : 0) + 1;
         }

         // if details of urls to be checked
         if ($returnUrls) {
             $linkInfo = array();
             $linkInfo['link_url'] = $href;
             // image links use the alt text of the image as anchor text
             if (stristr($anchorHtml, '<img')) {
                 $linkInfo['link_anchor'] = $this->__getTagParam("alt", $anchorHtml);
             } else {
                 $linkInfo['link_anchor'] = strip_tags($anchorHtml);
             }
             $linkInfo['nofollow'] = stristr($attributes, 'nofollow') ? 1 : 0;
             $linkInfo['link_title'] = $this->__getTagParam("title", $attributes);

             if ($external) {
                 $pageInfo['external_links'][] = $linkInfo;
             } else {
                 $pageInfo['site_links'][] = $linkInfo;
             }
         }
     }

     return $pageInfo;
 }
 /**
  * Fetch a page and gather its meta data together with link statistics.
  *
  * The page is downloaded with getContent(), its meta data is taken from
  * WebsiteController::crawlMetaData(), and every anchor tag is then classified
  * as internal or external by comparing it with the formatted $domainUrl.
  * Relative links are resolved, in this order, against the site host (links
  * starting with "/"), the page's <base href>, $domainUrl (when the page is the
  * domain url itself) or, if SP_RELATIVE_LINK_CRAWL is enabled, the page's
  * relative url.
  *
  * @param string $url        Address of the page to inspect.
  * @param string $domainUrl  Base address of the site; links whose formatted url
  *                           does not start with it are counted as external.
  * @param bool   $returnUrls When true, details of each link are collected in
  *                           'external_links' and 'site_links'.
  *
  * @return array Empty array when no page content was fetched. Otherwise the
  *               crawlMetaData() result with 'total_links' and 'external'
  *               incremented per link found (plus the link lists if requested).
  */
 function getPageInfo($url, $domainUrl, $returnUrls = false)
 {
     $urlWithTrailingSlash = Spider::addTrailingSlash($url);
     $ret = $this->getContent($urlWithTrailingSlash);
     $pageInfo = array();
     $checkUrl = formatUrl($domainUrl);

     // if relative links of a page needs to be checked
     $relativeUrl = '';
     if (SP_RELATIVE_LINK_CRAWL) {
         $relativeUrl = $domainUrl . $this->getRelativeUrl($url);
     }

     // Find the main domain host link ("scheme://host[:port]", no trailing slash
     // because it is only joined with links that start with "/").
     $domainHostInfo = parse_url($domainUrl);
     if (is_array($domainHostInfo) && !empty($domainHostInfo['scheme']) && !empty($domainHostInfo['host'])) {
         $domainHostLink = $domainHostInfo['scheme'] . "://" . $domainHostInfo['host'];
         if (!empty($domainHostInfo['port'])) {
             $domainHostLink .= ":" . $domainHostInfo['port'];
         }
     } else {
         // no scheme/host could be parsed: fall back to the domain url as given
         $domainHostLink = rtrim($domainUrl, '/');
     }

     // nothing fetched, nothing to analyse
     if (empty($ret['page'])) {
         return $pageInfo;
     }

     // NOTE(review): the single-quoted entries are literal backslash sequences,
     // not CR/LF control characters - only "\n" strips real line breaks. Kept as is.
     $string = str_replace(array("\n", '\\n\\r', '\\r\\n', '\\r'), "", $ret['page']);
     $pageInfo = WebsiteController::crawlMetaData($url, '', $string, true);

     // Check whether base url tag is there. A <base> tag without href must not
     // produce a prefix, so the trailing slash is only added to a real value.
     $baseTagUrl = "";
     if (preg_match("/<base (.*?)>/is", $string, $match)) {
         $baseHref = $this->__getTagParam("href", $match[1]);
         if (!empty($baseHref)) {
             $baseTagUrl = $this->addTrailingSlash($baseHref);
         }
     }

     // The lookahead restricts matches to real anchor tags; a bare "<a(.*?)>"
     // would also pick up <abbr>, <area>, <article>, ...
     $pattern = '/<a(?=[\s>])(.*?)>(.*?)<\/a>/is';
     preg_match_all($pattern, $string, $matches, PREG_PATTERN_ORDER);

     // loop through matches
     $linkCount = count($matches[1]);
     for ($i = 0; $i < $linkCount; $i++) {
         $attributes = $matches[1][$i];
         $anchorHtml = $matches[2][$i];
         $href = $this->__getTagParam("href", $attributes);

         // ignore anchors that have neither a target nor any content
         if (empty($href) && empty($anchorHtml)) {
             continue;
         }

         // ignore mail links, script links and anything containing ';'
         if (preg_match('/mailto:/', $href) || preg_match('/javascript:|;/', $href)) {
             continue;
         }

         // find external links
         $external = 0;
         // Only a leading scheme makes a link absolute; a scheme inside a query
         // string (e.g. "/go?u=http://x") must not.
         if (preg_match('#^\s*https?://#i', $href)) {
             if (!preg_match("/^" . preg_quote($checkUrl, '/') . "/", formatUrl($href))) {
                 $external = 1;
             }
         } else {
             if (preg_match('/^\\//', $href)) {
                 // root-relative link: attach to the host (no double slash)
                 $href = $domainHostLink . $href;
             } elseif (!empty($baseTagUrl)) {
                 $href = $baseTagUrl . $href;
             } elseif ($url == $domainUrl) {
                 $href = $domainUrl . "/" . $href;
             } elseif (SP_RELATIVE_LINK_CRAWL) {
                 $href = $relativeUrl . "/" . $href;
             } else {
                 // relative link that cannot be resolved here, not counted
                 continue;
             }

             // Resolve every "/../" segment, one at a time, without climbing above
             // the host part (previously only the first one was handled and text
             // after a second one was lost).
             if (strpos($href, '/../') !== false) {
                 $hostPart = preg_match('#^[a-z][a-z0-9+.\-]*://[^/]*#i', $href, $hostMatch) ? $hostMatch[0] : '';
                 $pathPart = (string) substr($href, strlen($hostPart));
                 while (strpos($pathPart, '/../') !== false) {
                     $hrefParts = explode('/../', $pathPart, 2);
                     if (preg_match('/.*\\//', $hrefParts[0], $matchpart)) {
                         $parentDir = $matchpart[0];
                     } else {
                         // already at the top: stay at the root of the host
                         $parentDir = ($hostPart === '') ? '' : '/';
                     }
                     $pathPart = $parentDir . $hrefParts[1];
                 }
                 $href = $hostPart . $pathPart;
             }
         }

         // counters may not be provided by crawlMetaData(), so start them at 0
         $pageInfo['total_links'] = (isset($pageInfo['total_links']) ? $pageInfo['total_links'] : 0) + 1;
         if ($external) {
             $pageInfo['external'] = (isset($pageInfo['external']) ? $pageInfo['external'] : 0) + 1;
         }

         // if details of urls to be checked
         if ($returnUrls) {
             $linkInfo = array();
             $linkInfo['link_url'] = $href;
             // image links use the alt text of the image as anchor text
             if (stristr($anchorHtml, '<img')) {
                 $linkInfo['link_anchor'] = $this->__getTagParam("alt", $anchorHtml);
             } else {
                 $linkInfo['link_anchor'] = strip_tags($anchorHtml);
             }
             $linkInfo['nofollow'] = stristr($attributes, 'nofollow') ? 1 : 0;
             $linkInfo['link_title'] = $this->__getTagParam("title", $attributes);

             if ($external) {
                 $pageInfo['external_links'][] = $linkInfo;
             } else {
                 $pageInfo['site_links'][] = $linkInfo;
             }
         }
     }

     return $pageInfo;
 }
Exemple #4
0
                    $controller->__changeStatus($id, 0);
                }
            }
            $controller->listWebsites($_POST);
            break;
        // Bulk delete: call __deleteWebsite() for every id posted in "ids",
        // then call listWebsites() with the POST data.
        case "deleteall":
            if (!empty($_POST['ids'])) {
                // NOTE(review): assumes "ids" is posted as an array (e.g. ids[]) - confirm against the form
                foreach ($_POST['ids'] as $id) {
                    $controller->__deleteWebsite($id);
                }
            }
            $controller->listWebsites($_POST);
            break;
        // Crawl meta data for the posted url; "keyinput" is optional and
        // falls back to an empty string when missing or empty.
        case "crawlmeta":
            $keyInput = empty($_POST['keyinput']) ? "" : $_POST['keyinput'];
            // NOTE(review): $_POST['url'] is read without an existence check and is
            // url-decoded once more here - confirm the client sends it encoded.
            $controller->crawlMetaData(urldecode($_POST['url']), $keyInput);
            break;
    }
} else {
    switch ($_GET['sec']) {
        case "Activate":
            $controller->__changeStatus($_GET['websiteId'], 1);
            $controller->listWebsites($_GET);
            break;
        case "Inactivate":
            $controller->__changeStatus($_GET['websiteId'], 0);
            $controller->listWebsites($_GET);
            break;
        case "delete":
            $controller->__deleteWebsite($_GET['websiteId']);
            $controller->listWebsites($_GET);