/**
 * Crawls a single page and prints its crawl report as plain text, then exits.
 *
 * Request parameters (read through JRequest):
 *  - start  (int): when 1, every page is flagged as un-crawled before crawling.
 *  - idPage (int): when non-zero, that page is crawled; otherwise the first
 *                  un-crawled page with PageLevel != 127 is picked
 *                  (lowest PageLevel, then lowest IdPage).
 *
 * While the request runs, the 'crawler.enable.auto' config row is set to '0'
 * (if it was enabled in the cached config) and set back to '1' before the
 * response is printed.
 *
 * Output when a page was crawled: eight values joined by "\n" - page URL,
 * page level, number of crawled pages, number of un-crawled pages left on
 * this level, current date, page title, total page count, rounded-up grade.
 * Output when nothing is left to crawl: 'Finished' followed by nine newlines.
 *
 * @return void This method never returns; it terminates the request with exit.
 */
function crawl() {
    $db =& JFactory::getDBO();
    $app =& JFactory::getApplication();

    $start = JRequest::getInt('start', 0);
    $idPage = JRequest::getInt('idPage', 0);

    $rsseoConfig = $app->getuserState('rsseoConfig');
    $autocrawler = $rsseoConfig['crawler.enable.auto'];

    // Switch the auto-crawler flag off for the duration of this request.
    if ($autocrawler) {
        $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '0' WHERE ConfigName = 'crawler.enable.auto' ");
        $db->query();
    }

    // A fresh crawl starts by flagging every page as un-crawled.
    if ($start == 1) {
        $db->setQuery("UPDATE #__rsseo_pages SET PageCrawled=0");
        $db->query();
    }

    if ($idPage != 0) {
        // Load the selected page ($idPage is already an integer).
        $db->setQuery("SELECT * FROM #__rsseo_pages WHERE IdPage='" . $idPage . "'");
    } else {
        // Load the first un-crawled page.
        $db->setQuery("SELECT * FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel != 127 ORDER BY PageLevel asc, IdPage asc LIMIT 1");
    }
    $page = $db->loadObject();

    if (empty($page)) {
        if ($autocrawler) {
            $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '1' WHERE ConfigName = 'crawler.enable.auto' ");
            $db->query();
        }
        echo 'Finished' . "\n\n\n\n\n\n\n\n\n";
        exit;
    }

    $newPage = rsseoHelper::checkPage($page->IdPage, $idPage);
    $newPage->PageCrawled = 1;
    $newPage->DatePageCrawled = time();
    if ($newPage->PageLevel < 127) {
        $newPage->store();
    }
    $link = $newPage->_link;

    // Build the list of ignored href patterns once, instead of on every anchor.
    $ignorePatterns = array();
    $ignoredLines = explode("\n", str_replace("\r", '', $rsseoConfig['crawler.ignore']));
    foreach ($ignoredLines as $ignore) {
        if (!empty($ignore)) {
            // NOTE(review): search and replacement are identical, so this call is a
            // no-op. An HTML entity (e.g. '&amp;') may have been lost here - confirm
            // the intended direction before changing it.
            $ignorePatterns[] = str_replace('&', '&', $ignore);
        }
    }

    $parser = rsseoHelper::file_get_html($link);

    $crawlLevel = $rsseoConfig['crawler.level'];
    $levelAllowsLinks = ($crawlLevel == -1 || $page->PageLevel < $crawlLevel);

    // Links are only collected when the fetch produced a parser object, the
    // configured depth allows going one level deeper, and the page's own level
    // is below 127 (pages at level 127 or above never contribute links).
    if (is_object($parser) && $levelAllowsLinks && $newPage->PageLevel < 127) {
        $childLevel = (int) $page->PageLevel + 1;

        while ($parser->parse()) {
            if (strtolower($parser->iNodeName) != 'a') {
                continue;
            }

            $rawHref = isset($parser->iNodeAttributes['href']) ? $parser->iNodeAttributes['href'] : null;
            $href = rsseoHelper::clean_url($rawHref);

            foreach ($ignorePatterns as $ignore) {
                if ($this->is_ignored($href, $ignore)) {
                    continue 2;
                }
            }

            if (strpos($href, 'mailto:') !== FALSE) {
                continue;
            }
            if (strpos($href, 'javascript:') !== FALSE) {
                continue;
            }
            if ($href == 'administrator/' || $href == 'administrator') {
                continue;
            }
            if ($href == null) {
                continue;
            }

            $href = str_replace(JURI::root(), '', $href);

            // The href comes from crawled markup, so it must be escaped before it
            // is placed in SQL. Quote() escapes the value and adds the quotes.
            $quotedHref = $db->Quote($href);

            $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageURL=" . $quotedHref);
            if ($db->loadResult() == 0) {
                $db->setQuery("INSERT INTO #__rsseo_pages SET PageURL = " . $quotedHref . ", PageTitle ='', PageKeywords ='', PageDescription = '', PageInSitemap = 1 , PageSitemap=0, PageCrawled=0, PageLevel = '" . $childLevel . "' ");
                $db->query();
            }
        }
    }

    // Count the number of pages crawled.
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled != 0 AND PageLevel != 127");
    $pages_crawled = $db->loadResult();

    // Count the number of pages left on this level.
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages WHERE PageCrawled = 0 AND PageLevel='" . (int) $page->PageLevel . "'");
    $pages_left = $db->loadResult();

    // Count all known pages.
    $db->setQuery("SELECT COUNT(*) FROM #__rsseo_pages");
    $total_pages = $db->loadResult();

    if ($autocrawler) {
        $db->setQuery("UPDATE #__rsseo_config SET ConfigValue = '1' WHERE ConfigName = 'crawler.enable.auto' ");
        $db->query();
    }

    $page_properties = array(
        $newPage->PageURL,
        $newPage->PageLevel,
        $pages_crawled,
        $pages_left,
        date($rsseoConfig['global.dateformat']),
        $newPage->PageTitle,
        $total_pages,
        ceil($newPage->PageGrade),
    );
    echo implode("\n", $page_properties);

    exit;
}