Пример #1
0
 /**
  * Downloads one stored page, extracts its SEO data and computes its grade.
  *
  * Loads the rsseo_pages row identified by $idPage, fetches the page from the
  * site root and collects the title, meta description/keywords and the number
  * of headings, images and links. The values are written onto the table object
  * together with the serialized check results and a percentage grade. The row
  * is NOT stored here; that is left to the caller.
  *
  * @param int $idPage Key passed to the rsseo_pages table's load().
  *
  * @return object The table object with PageTitle, PageDescription,
  *                PageKeywords, densityparams, params, _link and PageGrade set.
  */
 function checkPage($idPage)
 {
     $app =& JFactory::getApplication();
     $rsseoConfig = $app->getuserState('rsseoConfig');
     $db =& JFactory::getDBO();

     JTable::addIncludePath(JPATH_ADMINISTRATOR . DS . 'components' . DS . 'com_rsseo' . DS . 'tables');
     $page =& JTable::getInstance('rsseo_pages', 'Table');
     $page->load($idPage);

     // The page is requested twice: once as a node parser, once as a raw string.
     $link = JURI::root() . $page->PageURL;
     $parser = rsseoHelper::file_get_html($link, 0);
     $html = rsseoHelper::fopen($link);

     $title = '';
     $description = '';
     $keywords = '';
     $images = 0;
     $images_no_alt = 0;
     $images_no_hw = 0;
     $links = 0;

     // "i" and "s" make the pattern accept <TITLE> and titles spanning several
     // lines, matching the flags already used by the heading pattern below.
     if (preg_match('#<title>(.*?)<\\/title>#is', $html, $match)) {
         $title = trim($match[1]);
     }
     preg_match_all('#<h([0-9+])(.*?)<\\/h([0-9+])>#is', $html, $matches);
     $headings = count($matches[0]);

     while ($parser->parse()) {
         $node = strtolower($parser->iNodeName);

         if ($node == 'a') {
             if (isset($parser->iNodeAttributes['href'])) {
                 $links++;
             }
         } elseif ($node == 'meta') {
             // A meta tag may lack "name" or "content"; fall back to '' instead
             // of raising a notice and storing null.
             $metaName = isset($parser->iNodeAttributes['name']) ? $parser->iNodeAttributes['name'] : '';
             $metaContent = isset($parser->iNodeAttributes['content']) ? $parser->iNodeAttributes['content'] : '';

             if ($metaName == 'description') {
                 $description = $metaContent;
             } elseif ($metaName == 'keywords') {
                 $keywords = $metaContent;
             } elseif ($metaName == 'title' && empty($title)) {
                 // Only used when no <title> element was found.
                 $title = $metaContent;
             }
         } elseif ($node == 'img') {
             $images++;
             if (!isset($parser->iNodeAttributes['alt'])) {
                 $images_no_alt++;
             }
             // Counted only when BOTH dimensions are missing.
             if (!isset($parser->iNodeAttributes['width']) && !isset($parser->iNodeAttributes['height'])) {
                 $images_no_hw++;
             }
         }
     }

     // Keyword density is measured against the lower-cased, tag-stripped page.
     $valid_text = strtolower(strip_tags($html));
     $density_keywords = $page->PageKeywordsDensity;
     if (!empty($density_keywords)) {
         $density_keywords = explode(',', $density_keywords);
         array_walk($density_keywords, array('rsseoHelper', 'lowercasearray'));
         $density_keywords = array_unique($density_keywords);
         $densityparams = array();
         foreach ($density_keywords as $keyword) {
             if (empty($keyword)) {
                 continue;
             }
             $densityparams[$keyword] = rsseoHelper::keywordDensity($valid_text, $keyword);
         }
         $registry = JRegistry::getInstance('density');
         $registry->loadArray($densityparams);
         $page->densityparams = $registry->toString();
     } else {
         $page->densityparams = '';
     }

     $page->PageTitle = $title;
     $page->PageDescription = $description;
     $page->PageKeywords = $keywords;

     // Build the params
     $params = array();

     // A URL is treated as SEF-friendly when it contains no ".php?" fragment.
     $params['url_sef'] = strpos($page->PageURL, ".php?") === false ? 1 : 0;

     // Number of published pages sharing this title
     $db->setQuery("SELECT COUNT(*) cnt FROM #__rsseo_pages WHERE PageTitle = '" . $db->getEscaped($page->PageTitle) . "' AND published = 1 ");
     $params['duplicate_title'] = $db->loadResult();
     $params['title_length'] = strlen($page->PageTitle);

     // Number of published pages sharing this meta description
     $db->setQuery("SELECT COUNT(*) cnt FROM #__rsseo_pages WHERE PageDescription = '" . $db->getEscaped($page->PageDescription) . "' AND published = 1 ");
     $params['duplicate_desc'] = $db->loadResult();
     $params['description_length'] = strlen($page->PageDescription);

     // Number of comma-separated meta keywords
     $keyw = trim($keywords) != '' ? explode(',', $page->PageKeywords) : array();
     $params['keywords'] = count($keyw);
     $params['headings'] = $headings;
     $params['images'] = $images;
     $params['images_wo_alt'] = $images_no_alt;
     $params['images_wo_hw'] = $images_no_hw;
     $params['links'] = $links;

     $reg = JRegistry::getInstance('');
     $reg->loadArray($params);
     $page->params = $reg->toString();

     // The absolute URL that was analysed
     $page->_link = $link;

     // The page grade: every enabled check adds one to the total, and one to
     // the grade when it passes. Keeping both in a single table guarantees the
     // two counters stay in sync; previously the links check always raised the
     // grade but only raised the total when 'crawler.intext.links' was enabled.
     $checks = array(
         'crawler.sef' => $params['url_sef'] == 1,
         'crawler.title.duplicate' => $params['duplicate_title'] == 1,
         'crawler.title.length' => $params['title_length'] >= 10 && $params['title_length'] <= 70,
         'crawler.description.duplicate' => $params['duplicate_desc'] == 1,
         'crawler.description.length' => $params['description_length'] >= 70 && $params['description_length'] <= 150,
         'crawler.keywords' => $params['keywords'] <= 10,
         'crawler.headings' => $params['headings'] > 0,
         'crawler.images' => $params['images'] <= 10,
         'crawler.images.alt' => $params['images_wo_alt'] == 0,
         'crawler.images.hw' => $params['images_wo_hw'] == 0,
         'crawler.intext.links' => $params['links'] <= 100,
     );

     $grade = 0;
     $total = 0;
     foreach ($checks as $configKey => $passed) {
         if (empty($rsseoConfig[$configKey])) {
             continue;
         }
         $total++;
         if ($passed) {
             $grade++;
         }
     }

     // With every check disabled there is nothing to grade; avoid dividing by zero.
     $page->PageGrade = $total > 0 ? $grade * 100 / $total : 0;

     return $page;
 }
Пример #2
0
 /**
  * Crawls a single page, queues the links found on it and prints a summary.
  *
  * Request input: "start" (1 resets PageCrawled on every row) and "idPage"
  * (non-zero selects that page; otherwise the first un-crawled page with
  * PageLevel != 127 is taken, lowest level first).
  *
  * Output: when a page was processed, eight newline-separated fields
  * (URL, level, pages crawled, pages left on this level, current date, title,
  * total pages, rounded-up grade); otherwise the word "Finished" followed by
  * nine newlines. The function always ends the request with exit.
  *
  * While it runs, the 'crawler.enable.auto' config row is switched to '0' and
  * restored to '1' before the output is sent (only if it was enabled).
  *
  * @return void Never returns; terminates with exit.
  */
 function crawl()
 {
     $db =& JFactory::getDBO();
     $app =& JFactory::getApplication();
     $start = JRequest::getInt('start', 0);
     $idPage = JRequest::getInt('idPage', 0);
     $rsseoConfig = $app->getuserState('rsseoConfig');
     $autocrawler = $rsseoConfig['crawler.enable.auto'];

     if ($autocrawler) {
         $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '0' WHERE ConfigName = 'crawler.enable.auto' ");
         $db->query();
     }
     if ($start == 1) {
         $db->setQuery("UPDATE #__rsseo_pages SET PageCrawled=0");
         $db->query();
     }

     if ($idPage != 0) {
         // Load the selected page
         $db->setQuery("SELECT * FROM #__rsseo_pages WHERE IdPage='" . $idPage . "'");
     } else {
         // Load the first un-crawled page
         $db->setQuery("SELECT * FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel != 127 ORDER BY PageLevel asc, IdPage asc LIMIT 1");
     }
     $page = $db->loadObject();

     if (empty($page)) {
         $output = 'Finished' . "\n\n\n\n\n\n\n\n\n";
     } else {
         $newPage = rsseoHelper::checkPage($page->IdPage, $idPage);
         $newPage->PageCrawled = 1;
         $newPage->DatePageCrawled = time();

         // Pages at level 127 and above are neither stored nor used as a
         // source of new links.
         $belowLevelCap = $newPage->PageLevel < 127;
         if ($belowLevelCap) {
             $newPage->store();
         }

         // Links are followed when no depth limit is set (-1) or the page is
         // still above the configured depth.
         $maxLevel = $rsseoConfig['crawler.level'];
         $withinDepth = $maxLevel == -1 || $page->PageLevel < $maxLevel;

         if ($belowLevelCap && $withinDepth) {
             // Normalise the ignore patterns once instead of once per anchor.
             $ignored = array();
             foreach (explode("\n", str_replace("\r", '', $rsseoConfig['crawler.ignore'])) as $pattern) {
                 if (!empty($pattern)) {
                     $ignored[] = str_replace('&', '&amp;', $pattern);
                 }
             }

             $nextLevel = (int) $page->PageLevel + 1;
             $parser = rsseoHelper::file_get_html($newPage->_link);

             while ($parser->parse()) {
                 if (strtolower($parser->iNodeName) != 'a') {
                     continue;
                 }

                 $rawHref = isset($parser->iNodeAttributes['href']) ? $parser->iNodeAttributes['href'] : null;
                 $href = rsseoHelper::clean_url($rawHref);

                 foreach ($ignored as $ignore) {
                     if ($this->is_ignored($href, $ignore)) {
                         continue 2;
                     }
                 }
                 if (strpos($href, 'mailto:') !== false || strpos($href, 'javascript:') !== false) {
                     continue;
                 }
                 if ($href == 'administrator/' || $href == 'administrator') {
                     continue;
                 }
                 if ($href == null) {
                     continue;
                 }

                 // Store URLs relative to the site root.
                 $href = str_replace(JURI::root(), '', $href);

                 // $href comes straight from the crawled HTML, so it must be
                 // escaped before being placed inside a SQL string literal.
                 $safeHref = $db->getEscaped($href);

                 $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageURL='" . $safeHref . "'");
                 if ($db->loadResult() == 0) {
                     $db->setQuery("INSERT INTO #__rsseo_pages SET PageURL = '" . $safeHref . "', PageTitle ='', PageKeywords ='', PageDescription = '', PageInSitemap = 1 , PageSitemap=0, PageCrawled=0, PageLevel = '" . $nextLevel . "' ");
                     $db->query();
                 }
             }
         }

         // Count the number of pages crawled
         $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled != 0 AND PageLevel != 127");
         $pages_crawled = $db->loadResult();

         // Count the number of pages left on this level
         $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel='" . (int) $page->PageLevel . "'");
         $pages_left = $db->loadResult();

         // Count all known pages
         $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages");
         $total_pages = $db->loadResult();

         $output = implode("\n", array(
             $newPage->PageURL,
             $newPage->PageLevel,
             $pages_crawled,
             $pages_left,
             date($rsseoConfig['global.dateformat']),
             $newPage->PageTitle,
             $total_pages,
             ceil($newPage->PageGrade)
         ));
     }

     // Re-enable the automatic crawler before responding.
     if ($autocrawler) {
         $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '1' WHERE ConfigName = 'crawler.enable.auto' ");
         $db->query();
     }

     echo $output;
     exit;
 }
Пример #3
0
 /**
  * Re-checks the search position of one keyword and stores the result.
  *
  * Requests up to five result pages from the configured Google domain and
  * counts result anchors until one points at the site root or at one of the
  * configured subdomains. The keyword row is then updated: the previous
  * ActualKeywordPosition becomes LastKeywordPosition and the new position
  * (0 when no match was found) becomes ActualKeywordPosition.
  *
  * @param int|null $IdKeyword Keyword id. When empty, the id is read from the
  *                            "cid" request variable and the result array
  *                            (position, date_refreshed, color) is echoed in
  *                            serialized form before the request is ended.
  *
  * @return void
  */
 function refreshkeyword($IdKeyword = null)
 {
     $app =& JFactory::getApplication();
     $db =& JFactory::getDBO();
     $rsseoConfig = $app->getuserState('rsseoConfig');

     // The id is concatenated into SQL below, so force it to an integer
     // regardless of where it came from.
     $cid = intval($IdKeyword ? $IdKeyword : JRequest::getVar('cid'));

     // Cast so that a missing row yields 0 rather than null, which would
     // otherwise produce invalid SQL in the UPDATE below.
     $db->setQuery("SELECT ActualKeywordPosition FROM #__rsseo_keywords WHERE IdKeyword =" . $cid);
     $currentPosition = (int) $db->loadResult();

     $db->setQuery("SELECT Keyword FROM #__rsseo_keywords WHERE IdKeyword='" . $cid . "'");
     $keyword = $db->loadResult();

     // Hosts that count as "ours": the configured subdomains (one per line)
     // plus the site root.
     $domains = explode("\n", str_replace("\r", '', $rsseoConfig['subdomains']));
     $domains[] = JURI::root();

     // NOTE(review): "%26" is turned back into a raw "&", which will split the
     // q parameter if the keyword contains an ampersand - confirm this is intended.
     $q = str_replace(" ", "+", $keyword);
     $q = str_replace("%26", "&", $q);

     $found = false;
     $position = 1;
     for ($google_page = 0; $google_page < 5 && !$found; $google_page++) {
         $parser = rsseoHelper::file_get_html('http://www.' . $rsseoConfig['google.domain'] . '/search?q=' . $q . '&pws=0&start=' . 10 * $google_page);

         // Stop parsing as soon as a match is found; nothing after it matters.
         while (!$found && $parser->parse()) {
             // Only anchors with class "l" and no title/style attribute are
             // counted as results.
             $isResultLink = strtolower($parser->iNodeName) == 'a'
                 && isset($parser->iNodeAttributes['class'])
                 && $parser->iNodeAttributes['class'] == 'l'
                 && empty($parser->iNodeAttributes['title'])
                 && empty($parser->iNodeAttributes['style']);
             if (!$isResultLink) {
                 continue;
             }

             $href = isset($parser->iNodeAttributes['href']) ? $parser->iNodeAttributes['href'] : '';
             foreach ($domains as $domain) {
                 if (!empty($domain) && strpos($href, $domain) !== false) {
                     $found = true;
                     break;
                 }
             }

             if (!$found) {
                 $position++;
             }
         }
     }

     $array = array();
     $array['position'] = $found ? $position : 0;
     $array['date_refreshed'] = time();

     // Position 0 means "not found", which is worse than any real position.
     // Map it to the largest integer so that dropping out of the results is
     // shown as a decline and entering them as an improvement.
     $newRank = $array['position'] > 0 ? $array['position'] : PHP_INT_MAX;
     $oldRank = $currentPosition > 0 ? $currentPosition : PHP_INT_MAX;
     if ($newRank > $oldRank) {
         $array['color'] = "colorred";
     } elseif ($newRank < $oldRank) {
         $array['color'] = "colorgreen";
     } else {
         $array['color'] = "colornone";
     }

     // Update the keyword: the old actual position becomes the last position.
     $db->setQuery("UPDATE #__rsseo_keywords SET ActualKeywordPosition ='" . $array['position'] . "' , LastKeywordPosition = " . $currentPosition . " , DateRefreshed='" . $array['date_refreshed'] . "' WHERE IdKeyword = '" . $cid . "'");
     $db->query();

     $array['date_refreshed'] = date($rsseoConfig['global.dateformat'], $array['date_refreshed']);

     // Called without an explicit id: respond with the result and stop.
     if ($IdKeyword == null) {
         echo serialize($array);
         exit;
     }
 }