/**
 * Downloads a stored page, extracts its SEO metrics and computes its grade.
 *
 * Loads the `rsseo_pages` row identified by $idPage, fetches
 * JURI::root() . PageURL, and fills PageTitle, PageDescription, PageKeywords,
 * densityparams, params, _link and PageGrade on the table object.
 * The row is NOT stored here; the caller decides whether to persist it.
 *
 * @param int $idPage Primary key of the page to analyse.
 *
 * @return object The populated `rsseo_pages` table object.
 */
function checkPage($idPage)
{
    $app =& JFactory::getApplication();
    $rsseoConfig = $app->getuserState('rsseoConfig');
    $db =& JFactory::getDBO();

    JTable::addIncludePath(JPATH_ADMINISTRATOR . DS . 'components' . DS . 'com_rsseo' . DS . 'tables');
    $page =& JTable::getInstance('rsseo_pages', 'Table');
    $page->load($idPage);

    $link = JURI::root() . $page->PageURL;
    $parser = rsseoHelper::file_get_html($link, 0);
    $html = rsseoHelper::fopen($link);

    $description = '';
    $keywords = '';
    $images = 0;
    $images_no_alt = 0;
    $images_no_hw = 0;
    $links = 0;

    preg_match('#<title>(.*?)<\\/title>#', $html, $match);
    preg_match_all('#<h([0-9+])(.*?)<\\/h([0-9+])>#is', $html, $matches);

    // Stays null (not '') when there is no <title>, exactly as before, so the
    // value handed to the table object is unchanged for title-less pages.
    $title = isset($match[1]) ? $match[1] : null;
    $headings = count($matches[0]);

    while ($parser->parse()) {
        $node = strtolower($parser->iNodeName);
        $attrs = $parser->iNodeAttributes;

        // Tag name is lower-cased here like for every other tag, so <A href>
        // is counted as a link too.
        if ($node == 'a' && isset($attrs['href'])) {
            $links++;
        }

        if ($node == 'meta' && isset($attrs['name'])) {
            // A meta tag without a content attribute yields null, as before.
            $content = isset($attrs['content']) ? $attrs['content'] : null;

            if ($attrs['name'] == 'description') {
                $description = $content;
            }
            if ($attrs['name'] == 'keywords') {
                $keywords = $content;
            }
            // <meta name="title"> is only a fallback for a missing <title>.
            if (empty($title) && $attrs['name'] == 'title') {
                $title = $content;
            }
        }

        if ($node == 'img') {
            $images++;
            if (!isset($attrs['alt'])) {
                $images_no_alt++;
            }
            // Counted only when BOTH width and height are missing.
            if (!isset($attrs['width']) && !isset($attrs['height'])) {
                $images_no_hw++;
            }
        }
    }

    // Keyword density is computed on the lower-cased, tag-stripped page text.
    $valid_text = strtolower(strip_tags($html));
    $density_keywords = $page->PageKeywordsDensity;
    if (!empty($density_keywords)) {
        $density_keywords = explode(',', $density_keywords);
        array_walk($density_keywords, array('rsseoHelper', 'lowercasearray'));
        $density_keywords = array_unique($density_keywords);

        $densityparams = array();
        foreach ($density_keywords as $keyword) {
            if (empty($keyword)) {
                continue;
            }
            $densityparams[$keyword] = rsseoHelper::keywordDensity($valid_text, $keyword);
        }

        // A fresh registry per call: JRegistry::getInstance('density') hands
        // back a shared instance, so keywords of a previously checked page
        // would leak into this page's densityparams within the same request.
        $registry = new JRegistry();
        $registry->loadArray($densityparams);
        $page->densityparams = $registry->toString();
    } else {
        $page->densityparams = '';
    }

    $page->PageTitle = $title;
    $page->PageDescription = $description;
    $page->PageKeywords = $keywords;

    //build the params
    $params = array();

    //check if url is sef friendly
    $params['url_sef'] = strpos($page->PageURL, ".php?") === FALSE ? 1 : 0;

    //check if page title is unique
    $db->setQuery("SELECT COUNT(*) cnt FROM #__rsseo_pages WHERE PageTitle = '" . $db->getEscaped($page->PageTitle) . "' AND published = 1 ");
    $params['duplicate_title'] = $db->loadResult();

    //check title length
    $params['title_length'] = strlen($page->PageTitle);

    //check if page meta description is unique
    $db->setQuery("SELECT COUNT(*) cnt FROM #__rsseo_pages WHERE PageDescription = '" . $db->getEscaped($page->PageDescription) . "' AND published = 1 ");
    $params['duplicate_desc'] = $db->loadResult();

    //check description length
    $params['description_length'] = strlen($page->PageDescription);

    //check number of keywords
    $keyw = trim($keywords) != '' ? explode(',', $page->PageKeywords) : array();
    $params['keywords'] = count($keyw);

    $params['headings'] = $headings;
    $params['images'] = $images;
    $params['images_wo_alt'] = $images_no_alt;
    $params['images_wo_hw'] = $images_no_hw;
    $params['links'] = $links;

    $reg = new JRegistry();
    $reg->loadArray($params);
    $page->params = $reg->toString();

    //the raw html
    $page->_link = $link;

    //the page grade: each enabled check contributes one point to the total
    //and one point to the grade when it passes
    $checks = array(
        'crawler.sef' => $params['url_sef'] == 1,
        'crawler.title.duplicate' => $params['duplicate_title'] == 1,
        'crawler.title.length' => $params['title_length'] >= 10 && $params['title_length'] <= 70,
        'crawler.description.duplicate' => $params['duplicate_desc'] == 1,
        'crawler.description.length' => $params['description_length'] >= 70 && $params['description_length'] <= 150,
        'crawler.keywords' => $params['keywords'] <= 10,
        'crawler.headings' => $params['headings'] > 0,
        'crawler.images' => $params['images'] <= 10,
        'crawler.images.alt' => $params['images_wo_alt'] == 0,
        'crawler.images.hw' => $params['images_wo_hw'] == 0,
        // Previously this check scored a point even when the option was
        // disabled, which could push the grade above 100%.
        'crawler.intext.links' => $params['links'] <= 100,
    );

    $grade = 0;
    $total = 0;
    foreach ($checks as $option => $passed) {
        if (empty($rsseoConfig[$option])) {
            continue;
        }
        $total++;
        if ($passed) {
            $grade++;
        }
    }

    // With every check disabled there is nothing to grade; avoid dividing by zero.
    $page->PageGrade = $total > 0 ? $grade * 100 / $total : 0;

    return $page;
}
/**
 * Performs one crawler step and prints its result.
 *
 * Picks the page given by the `idPage` request variable, or else the first
 * un-crawled page. It runs rsseoHelper::checkPage() on it, stores the result,
 * and queues every not-yet-known link found on the page as a new
 * `rsseo_pages` row one level deeper.
 *
 * Output is a newline-separated list: URL, level, pages crawled, pages left
 * on this level, current date, title, total pages, grade. When nothing is
 * left to crawl it prints 'Finished' followed by nine newlines.
 * The method always terminates the request with exit.
 *
 * While it runs, the `crawler.enable.auto` config row is set to '0' if the
 * option was enabled, and set back to '1' before the output is written.
 *
 * @return void
 */
function crawl()
{
    $db =& JFactory::getDBO();
    $app =& JFactory::getApplication();
    $start = JRequest::getInt('start', 0);
    $idPage = JRequest::getInt('idPage', 0);
    $rsseoConfig = $app->getuserState('rsseoConfig');
    $autocrawler = $rsseoConfig['crawler.enable.auto'];

    if ($autocrawler) {
        $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '0' WHERE ConfigName = 'crawler.enable.auto' ");
        $db->query();
    }

    // start=1 restarts the crawl: every page becomes un-crawled again.
    if ($start == 1) {
        $db->setQuery("UPDATE #__rsseo_pages SET PageCrawled=0");
        $db->query();
    }

    if ($idPage != 0) {
        //load the selected page ($idPage is an integer from JRequest::getInt)
        $db->setQuery("SELECT * FROM #__rsseo_pages WHERE IdPage='" . $idPage . "'");
    } else {
        //load the first un-crawled page
        $db->setQuery("SELECT * FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel != 127 ORDER BY PageLevel asc, IdPage asc LIMIT 1");
    }
    $page = $db->loadObject();

    if (empty($page)) {
        if ($autocrawler) {
            $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '1' WHERE ConfigName = 'crawler.enable.auto' ");
            $db->query();
        }
        echo 'Finished' . "\n\n\n\n\n\n\n\n\n";
        exit;
    }

    $newPage = rsseoHelper::checkPage($page->IdPage, $idPage);
    $newPage->PageCrawled = 1;
    $newPage->DatePageCrawled = time();
    if ($newPage->PageLevel < 127) {
        $newPage->store();
    }
    $link = $newPage->_link;

    //get the ignored href`s
    $ignored = $rsseoConfig['crawler.ignore'];
    $ignored = str_replace("\r", '', $ignored);
    $ignored = explode("\n", $ignored);

    $parser = rsseoHelper::file_get_html($link);

    // A crawler level of -1 means "no depth limit"; otherwise links are only
    // followed while the current page is above the configured level.
    $followLinks = $rsseoConfig['crawler.level'] == -1
        || $page->PageLevel < $rsseoConfig['crawler.level'];

    if ($followLinks) {
        while ($parser->parse()) {
            if (strtolower($parser->iNodeName) != 'a') {
                continue;
            }

            $rawHref = isset($parser->iNodeAttributes['href']) ? $parser->iNodeAttributes['href'] : null;
            $href = rsseoHelper::clean_url($rawHref);

            foreach ($ignored as $ignore) {
                if (!empty($ignore)) {
                    // NOTE(review): this replacement is a no-op as written; it may
                    // have been meant to convert between '&' and '&amp;' - confirm.
                    $ignore = str_replace('&', '&', $ignore);
                    if ($this->is_ignored($href, $ignore)) {
                        continue 2;
                    }
                }
            }

            if (strpos($href, 'mailto:') !== FALSE) {
                continue;
            }
            if (strpos($href, 'javascript:') !== FALSE) {
                continue;
            }
            if ($newPage->PageLevel >= 127) {
                continue;
            }
            if ($href == 'administrator/' || $href == 'administrator') {
                continue;
            }
            if ($href == null) {
                continue;
            }

            $href = str_replace(JURI::root(), '', $href);

            // The href comes straight from fetched HTML, so it is escaped before
            // being placed in SQL; a quote in a link would otherwise break
            // (or inject into) both queries below.
            $safeHref = $db->getEscaped($href);

            $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageURL='" . $safeHref . "'");
            if ($db->loadResult() == 0) {
                $db->setQuery("INSERT INTO #__rsseo_pages SET PageURL = '" . $safeHref . "', PageTitle ='', PageKeywords ='', PageDescription = '', PageInSitemap = 1 , PageSitemap=0, PageCrawled=0, PageLevel = '" . ((int) $page->PageLevel + 1) . "' ");
                $db->query();
            }
        }
    }

    //count the number of pages crawled
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled != 0 AND PageLevel != 127");
    $pages_crawled = $db->loadResult();

    //count the number of pages left on this level..
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel='" . (int) $page->PageLevel . "'");
    $pages_left = $db->loadResult();

    //count total pages
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages");
    $total_pages = $db->loadResult();

    if ($autocrawler) {
        $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '1' WHERE ConfigName = 'crawler.enable.auto' ");
        $db->query();
    }

    // The order of these entries is the output format; keep it stable.
    $page_properties = array(
        $newPage->PageURL,
        $newPage->PageLevel,
        $pages_crawled,
        $pages_left,
        date($rsseoConfig['global.dateformat']),
        $newPage->PageTitle,
        $total_pages,
        ceil($newPage->PageGrade),
    );
    echo implode("\n", $page_properties);
    exit;
}
/**
 * Re-checks the search-result position of a keyword and stores it.
 *
 * Scans up to five result pages (start = 0, 10, ... 40) of the configured
 * Google domain for the first result link containing the site root or one of
 * the configured sub-domains. The new position (0 when no link matched), the
 * previous position and the refresh time are written to `#__rsseo_keywords`.
 *
 * When called without $IdKeyword the keyword id is read from the `cid`
 * request variable; the result array (position, date_refreshed, color) is
 * then echoed serialized and the request is terminated. When $IdKeyword is
 * given nothing is echoed and nothing is returned.
 *
 * @param int|null $IdKeyword Keyword id, or null to use the request's `cid`.
 *
 * @return void
 */
function refreshkeyword($IdKeyword = null)
{
    $app =& JFactory::getApplication();
    $db =& JFactory::getDBO();
    $rsseoConfig = $app->getuserState('rsseoConfig');

    // The id goes into SQL below, so it is forced to an integer on both paths.
    if (!$IdKeyword) {
        $cid = intval(JRequest::getVar('cid'));
    } else {
        $cid = (int) $IdKeyword;
    }

    // Cast so that an unknown keyword (null result) cannot produce the
    // invalid SQL "LastKeywordPosition =  ," in the update below.
    $db->setQuery("SELECT ActualKeywordPosition FROM #__rsseo_keywords WHERE IdKeyword =" . $cid);
    $currentPosition = (int) $db->loadResult();

    $db->setQuery("SELECT Keyword FROM #__rsseo_keywords WHERE IdKeyword='" . $cid . "'");
    $keyword = $db->loadResult();

    // Domains that count as "ours": one configured sub-domain per line, plus the site root.
    $rsseoConfig['subdomains'] = str_replace("\r", '', $rsseoConfig['subdomains']);
    $domains = explode("\n", $rsseoConfig['subdomains']);
    $domains[] = JURI::root();

    $q = str_replace(" ", "+", $keyword);
    $q = str_replace("%26", "&", $q);

    $valid = 0;
    $i = 1; // 1-based rank of the result link currently being inspected

    for ($google_page = 0; $google_page < 5 && !$valid; $google_page++) {
        $parser = rsseoHelper::file_get_html('http://www.' . $rsseoConfig['google.domain'] . '/search?q=' . $q . '&pws=0&start=' . 10 * $google_page);

        while ($parser->parse()) {
            $attrs = $parser->iNodeAttributes;

            // Only anchors with class "l" and no title/style attribute are
            // treated as result links.
            $isResultLink = strtolower($parser->iNodeName) == 'a'
                && isset($attrs['class']) && $attrs['class'] == 'l'
                && empty($attrs['title'])
                && empty($attrs['style']);
            if (!$isResultLink) {
                continue;
            }

            $href = isset($attrs['href']) ? $attrs['href'] : '';
            foreach ($domains as $domain) {
                if (!empty($domain) && strpos($href, $domain) !== false) {
                    $valid = 1;
                    break;
                }
            }

            // First match wins; $i now holds its rank, so stop scanning.
            if ($valid) {
                break;
            }
            $i++;
        }
    }

    $array = array();
    $array['position'] = $valid ? $i : 0;
    $array['date_refreshed'] = time();

    // Position 0 means "not found", which is worse than any real rank.
    // Comparing the raw numbers would colour a keyword that dropped out of
    // the results green and a newly ranked one red.
    $newRank = $array['position'] > 0 ? $array['position'] : PHP_INT_MAX;
    $oldRank = $currentPosition > 0 ? $currentPosition : PHP_INT_MAX;
    if ($newRank > $oldRank) {
        $array['color'] = "colorred";
    } elseif ($newRank < $oldRank) {
        $array['color'] = "colorgreen";
    } else {
        $array['color'] = "colornone";
    }

    //update last keyword
    $db->setQuery("UPDATE #__rsseo_keywords SET ActualKeywordPosition ='" . $array['position'] . "' , LastKeywordPosition = " . $currentPosition . " , DateRefreshed='" . $array['date_refreshed'] . "' WHERE IdKeyword = '" . $cid . "'");
    $db->query();

    $array['date_refreshed'] = date($rsseoConfig['global.dateformat'], $array['date_refreshed']);

    if ($IdKeyword == null) {
        echo serialize($array);
        exit;
    }
}