/**
 * Spiders the configured site and prints an HTML progress/result report.
 *
 * In 'test' mode a single page is crawled (depth 1, limit 1) and its details
 * are printed without touching the database. In any other mode the site is
 * crawled down to the 'scrapeDepth' option, the results table is truncated
 * and every page in the spider's formatted output is inserted into it.
 *
 * All output is printed directly and wrapped in <div id="esi_spider">.
 *
 * @param string $spider_type Set to either spider the site or just one page for testing
 *                            ('test' selects the single-page run).
 */
public function spider_site($spider_type)
{
    global $wpdb;

    print '<div id="esi_spider">';

    // Make sure the results table exists; the LIKE pattern is escaped so the
    // underscores in the table name are not treated as wildcards.
    $table = $wpdb->prefix . self::$db_table;
    $found_table = $wpdb->get_var($wpdb->prepare('SHOW TABLES LIKE %s', $wpdb->esc_like($table)));
    if ($found_table != $table) {
        print '<div class="error"><p>The results database table(' . $table . ') does not exist attempting to recreate it</p></div>';
        $this->activate();
    }

    $error = $this->spider_site_config_errors();
    if ($error != '') {
        print '<h2><span class="fa fa-ambulance"></span> Unable to spider the site as this is not configured correctly</h2><div class="error">' . $error . '</div>';
        // Close the wrapper opened above before bailing out.
        print '</div>';
        return;
    }

    $is_test = ($spider_type == 'test');
    if ($is_test) {
        print '<h2><span class="fa fa-question-circle"></span> Test spider for ' . $this->domain . $this->URL . '</h2>';
        $depth = 1;
        $limit = 1;
    } else {
        $depth = get_option('scrapeDepth');
        print '<h2><span class="fa fa-cloud-download"></span> Spider Site ' . $this->domain . $this->URL . ' with a max depth ' . $depth . '</h2> <p>This page will spider and scrape the site and add all the results to a database</p>';
        $limit = $this->limit;
    }

    // Show the loading indicator and push it to the browser before the crawl starts.
    print '<div id="loading"><p id="loadingp">Spidering<br/>pages</p></div>';
    print '<script>document.getElementById(\'loading\').style.display = \'block\';</script>';
    flush();

    // NOTE(review): earlier commented-out code here called exclude_crawl_pages(),
    // exclude_crawl_pattern(), ignore_HTML_start_depth() and ignore_HTML_end_depth()
    // on the spider; those hooks are presumably still available if crawl filtering is needed.
    $spider = new spider($this->domain, $this->URL, get_home_path() . '/wp-content/plugins/site-importer/log', false);
    $spider->max_depth($depth);
    $spider->set_limit($limit);
    $spider->scrape_params($this->spider_site_scrape_params());
    $spider->crawl($this->domain . $this->URL);

    // Number of nested entries in the raw output (recursive count minus the top level).
    $amount = count($spider->output, COUNT_RECURSIVE) - count($spider->output);

    print '<script>document.getElementById(\'loadingp\').innerHTML = "Analysing Pages";</script>';
    flush();

    // The return value is not needed; the results are read from $spider->formatted_output below.
    $spider->formatted_page_info($spider->output);

    if ($is_test) {
        $this->spider_site_print_test_report($spider);
    } else {
        print '<h3>Spidering results</h3>';
        print 'Found <b>' . $amount . '</b> URLS from(' . $this->domain . $this->URL . ')';
        if (count($spider->formatted_output) != $amount) {
            print ' but only <b>' . count($spider->formatted_output) . '</b> had the type of structure to be imported';
            print '(' . htmlentities(get_option('startHTML')) . ') ' . "\n<br />";
        }

        $correct_pages = $this->spider_site_store_results($spider);

        print '<h3>Spidering details</h3><p></p>';
        $this->spider_site_print_error_report($spider);
        if ($correct_pages != '') {
            print '<p><b>The following pages are correct</b></p>' . $correct_pages;
        }

        print '<br />Finished spidering site and you can <a href="?page=site_importer&tab=review">review and import the results here</a>';
        print '<script>var tab=document.getElementById(\'esitab_review\'); tab.style.pointerEvents = \'auto\'; tab.style.color= \'#555555\'</script>';
    }

    print '<script>document.getElementById(\'loading\').style.display = \'none\';</script>';
    print '</div>';
}

/**
 * Checks the settings the spider depends on.
 *
 * @return string Concatenated <p> error messages, or '' when the configuration is usable.
 */
private function spider_site_config_errors()
{
    $error = '';

    if ($this->domain == '') {
        $error .= '<p>The main domain name is not set so can not spider this site. Enter a Website URL within the \'Crawling and Scraping settings\' tab</p>';
    }

    if (get_option('mainHTMLBlock') == '') {
        $error .= '<p>The main HTML Block is not set so can not locate the part of the pages that you wish to scrape in. Select a main HTML block from the drop down list within the \'Crawling and Scraping settings\' tab</p>';
    }

    // Loose comparisons on purpose: an unset option (false) counts as "not set" too.
    $depth = get_option('scrapeDepth');
    if ($depth == '' || $depth == '0') {
        $error .= '<p>The max depth setting is not set correctly. Set this to a number such as 5 within the \'Crawling and Scraping settings\' tab</p>';
    }

    return $error;
}

/**
 * Builds the scrape parameter array handed to spider::scrape_params() from the plugin options.
 *
 * @return array Scrape settings keyed by parameter name.
 */
private function spider_site_scrape_params()
{
    // The list starts with an empty-string entry, as it always has;
    // NOTE(review): confirm whether the spider relies on that placeholder.
    $strip_element = array('');
    if (get_option('stripDiv') == '1') {
        $strip_element[] = 'div';
    }
    if (get_option('stripSpan') == '1') {
        $strip_element[] = 'span';
    }

    $strip_attributes = array();
    if (get_option('stripCSS') == '1') {
        $strip_attributes[] = 'style';
    }
    if (get_option('stripClass') == '1') {
        $strip_attributes[] = 'class';
    }

    return array(
        'mainHTMLBlock' => get_option('mainHTMLBlock'),
        'startHTML' => get_option('startHTML'),
        'endHTML' => get_option('endHTML'),
        'includeStart' => get_option('includeStart'),
        'includeEnd' => get_option('includeEnd'),
        'removeElements' => array('form'),
        'stripElements' => $strip_element,
        'stripAttributes' => $strip_attributes,
        'replaceDomain' => get_option('replaceDomain'),
        'importLocal' => get_option('importLocal'),
        'importRemote' => get_option('importRemote'),
    );
}

/**
 * Prints the per-page details table(s) and a scraped HTML sample for a test run.
 *
 * @param object $spider Spider instance whose formatted_output has been populated.
 */
private function spider_site_print_test_report($spider)
{
    $pages = $spider->formatted_output;

    if (count($pages) == 0) {
        // No page was scraped, so report against the URL that was requested.
        print '<div class="error"><p>Unable to display details for page ' . $this->domain . $this->URL . '- no results(' . count($pages) . ')</p></div>';
        return;
    }

    // Keys that are never shown as table rows.
    $hidden_keys = array('URL', 'Category', 'Scrape', 'Filter', 'Formatted');
    $last_page = null;

    foreach ($pages as $page) {
        $import_row = '';
        $info_row = '';

        foreach ($page as $key => $meta) {
            $name = (string) $key;
            if ($name === 'Images') {
                $import_row .= '<tr><th>' . $key . '</th><td>' . count($meta) . ' images</td></tr>';
            } elseif (!in_array($name, $hidden_keys, true)) {
                // $meta is printed as-is; NOTE(review): it presumably may carry markup
                // from the spider (see the red/green key below) — confirm before escaping.
                $row = '<tr><th>' . $key . '</th><td>' . $meta . '</td></tr>';
                if ($name === 'Title' || $name === 'description') {
                    $import_row .= $row;
                } else {
                    $info_row .= $row;
                }
            }
        }

        print '<h3>Page info for ' . htmlspecialchars($page['URL'], ENT_QUOTES) . '</h3>';
        print '<table class="white">';
        print '<tr><th colspan="2" class="grey">Page Info</th></tr>' . $info_row;
        print '<tr><th colspan="2" class="grey">Import Data</th></tr>' . $import_row;
        print '<tr><th colspan="2" class="grey">Key</th></tr>';
        print '<tr><th class="red" width="150">red</th><td>items will be filtered out when imported into the site</td></tr>';
        print '<tr><th class="green">Green</th><td>item have been amended from the original</td></tr>';
        print '</table>';

        $last_page = $page;
    }

    // The sample is taken from the last page listed (a test run is limited to one page).
    print '<h3>Scraped HTML sample</h3><div id="formatted_html">' . $last_page['Formatted'] . '</div>';
}

/**
 * Empties the results table and inserts one row per page in the spider's formatted output.
 *
 * @param object $spider Spider instance whose formatted_output has been populated.
 * @return string HTML list ("Found <a>…</a><br />") of the pages that were stored.
 */
private function spider_site_store_results($spider)
{
    global $wpdb;

    $table = $wpdb->prefix . self::$db_table;
    $wpdb->query('TRUNCATE ' . $table);

    // The statement is the same for every row, so build it once.
    $sql = 'INSERT INTO `' . $table . '` '
        . '(`url`, `category`, `title`, `description`, `keywords`, `pre_text`, `post_text`, `display_text`, `images`, `post_name`, `post_type`) '
        . ' VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )';

    $post_type = get_option('postType');
    $correct_pages = '';

    foreach ($spider->formatted_output as $page) {
        $keywords = isset($page['keywords']) ? $page['keywords'] : '';
        $description = isset($page['description']) ? $page['description'] : '';
        $title = isset($page['Title']) ? $page['Title'] : '';
        $images = isset($page['Images']) ? serialize($page['Images']) : '';
        $post_name = $this->get_post_name($page['URL']);

        $wpdb->query($wpdb->prepare($sql, array(
            $page['URL'],
            $page['Category'],
            $title,
            $description,
            $keywords,
            $page['Scrape'],
            $page['Filter'],
            $page['Formatted'],
            $images,
            $post_name,
            $post_type,
        )));

        $correct_pages .= 'Found ' . $this->spider_site_page_link($page['URL']) . '<br />';
    }

    return $correct_pages;
}

/**
 * Prints the pages the spider reported as invalid (errno 1) or lacking the main HTML block (errno 2).
 *
 * Messages of any other type or error number are not reported.
 *
 * @param object $spider Spider instance whose error list is read.
 */
private function spider_site_print_error_report($spider)
{
    $no_valid_html = '';
    $no_html_block = '';

    foreach ($spider->error as $message) {
        if ($message['type'] != 'error') {
            continue;
        }

        if ($message['errno'] == '1') {
            $no_valid_html .= 'Not Valid Page - ' . $this->spider_site_page_link($message['page']) . '<br />';
        } elseif ($message['errno'] == '2') {
            $no_html_block .= 'No HTML block ' . $this->spider_site_page_link($message['page']) . '<br />';
        }
    }

    if ($no_valid_html != '') {
        print '<p><b>The following pages are not valid</b></p>' . $no_valid_html;
    }
    if ($no_html_block != '') {
        print '<p><b>The following pages have no main HTML block</b></p>' . $no_html_block;
    }
}

/**
 * Builds a new-window link to a crawled page.
 *
 * The href is always the domain followed by the page path. The link text is the
 * path, or the domain itself when the path is empty, so the link never ends up
 * with the domain prefixed twice.
 *
 * @param string $path Page path as reported by the spider (may be '').
 * @return string HTML anchor element with escaped href and text.
 */
private function spider_site_page_link($path)
{
    $href = $this->domain . $path;
    $label = ($path == '') ? $this->domain : $path;

    // Crawled URLs are untrusted, so escape them for both attribute and text context.
    return '<a href="' . htmlspecialchars($href, ENT_QUOTES) . '" target="_blank">' . htmlspecialchars($label, ENT_QUOTES) . '</a>';
}