PHP spider::formatted_page_info Examples

Programming Language: PHP
Class/Type: spider
Method/Function: formatted_page_info
Examples at hotexamples.com: 1
PHP spider::formatted_page_info - 1 example found. This is the top-rated real-world PHP example of spider::formatted_page_info extracted from open source projects. You can rate examples to help us improve their quality.
Frequently Used Methods
Show Hide
url(7)
project(4)
checker(4)
fulltxt(4)
curl_proxy(3)
rid(3)
rule(3)
calculate_scrape_details(2)
publish(2)
referer(2)
spider(2)
saveSql(1)
proxy_array(1)
ruleTest(1)
run(1)
absolutePath(1)
scrape_params(1)
setCharset(1)
pid(1)
set_limit(1)
sid(1)
sites(1)
spider_url_ids(1)
title(1)
urlslast(1)
useragent(1)
postArgs(1)
links(1)
output(1)
dataTest(1)
addLayer(1)
addStartUrl(1)
allHtml(1)
charset(1)
cid(1)
content_error_code(1)
content_right_code(1)
cookie(1)
crawl(1)
drawSpider(1)
max_depth(1)
encoding(1)
fetch_info(1)
formatted_page_info(1)
getThreads(1)
get_error(1)
get_info(1)
get_insite_links(1)
get_list(1)
get_main_html_block(1)
Example #1
Show file
File: class-site-importer-admin.php Project: venturepact/blog
    /**
     * Spiders the configured site and prints an HTML report of the outcome.
     *
     * In 'test' mode the crawl is restricted to a depth and limit of 1 and the
     * scraped page data is only previewed on screen. In any other mode the
     * results table is truncated and one row is inserted per formatted page,
     * followed by a summary of valid pages and crawl errors.
     *
     * All output is printed directly (the method returns nothing). The wrapper
     * element <div id="esi_spider"> is always closed, including when the
     * configuration is incomplete and the crawl is skipped.
     *
     * NOTE(review): scraped values ($meta, 'Formatted', URLs) are printed without
     * escaping. The red/green key in the preview suggests they carry markup on
     * purpose - confirm before adding escaping.
     *
     * @param string $spider_type 'test' to spider a single page for testing, anything else to spider the whole site
     * @return void
     */
    public function spider_site($spider_type)
    {
        print '<div id="esi_spider">';
        global $wpdb;
        $table = $wpdb->prefix . self::$db_table;
        if ($wpdb->get_var('SHOW TABLES LIKE \'' . $table . '\' ') != $table) {
            print '<div class="error"><p>The results database table(' . $table . ') does not exist attempting to recreate it</p></div>';
            $this->activate();
        }

        // Collect every configuration problem so they can all be reported at once.
        $error = '';
        if ($this->domain == '') {
            $error = '<p>The main domain name is not set so can not spider this site. Enter a Website URL within the \'Crawling and Scraping settings\' tab</p>';
        }
        if (get_option('mainHTMLBlock') == '') {
            $error .= '<p>The main HTML Block is not set so can not locate the part of the pages that you wish to scrape in. Select a main HTML block from the drop down list within the \'Crawling and Scraping settings\' tab</p>';
        }
        $depth = get_option('scrapeDepth');
        if ($depth == '' || $depth == '0') {
            $error .= '<p>The max depth setting is not set correctly. Set this to a number such as 5 within the \'Crawling and Scraping settings\' tab</p>';
        }
        if ($error != '') {
            print '<h2><span class="fa fa-ambulance"></span> Unable to spider the site as this is not configured correctly</h2><div class="error">' . $error . '</div>';
            // Close the wrapper opened above so the rest of the admin page is not nested inside it.
            print '</div>';
            return;
        }

        $is_test = $spider_type == 'test';
        if ($is_test) {
            print '<h2><span class="fa fa-question-circle"></span> Test spider for ' . $this->domain . $this->URL . '</h2>';
            $depth = 1;
            $limit = 1;
        } else {
            // $depth already holds the 'scrapeDepth' option read during validation.
            print '<h2><span class="fa fa-cloud-download"></span> Spider Site ' . $this->domain . $this->URL . ' with a max depth ' . $depth . '</h2>
				<p>This page will spider and scrape the site and add all the results to a database</p>';
            $limit = $this->limit;
        }
        print '<div id="loading"><p id="loadingp">Spidering<br/>pages</p></div>';
        print '<script>document.getElementById(\'loading\').style.display = \'block\';</script>';
        flush();

        // Build the scrape parameters from the plugin options.
        $remove_element = array('form');
        $strip_element = array('');
        if (get_option('stripDiv') == '1') {
            $strip_element[] = 'div';
        }
        if (get_option('stripSpan') == '1') {
            $strip_element[] = 'span';
        }
        $strip_attributes = array();
        if (get_option('stripCSS') == '1') {
            $strip_attributes[] = 'style';
        }
        if (get_option('stripClass') == '1') {
            $strip_attributes[] = 'class';
        }
        $params = array(
            'mainHTMLBlock' => get_option('mainHTMLBlock'),
            'startHTML' => get_option('startHTML'),
            'endHTML' => get_option('endHTML'),
            'includeStart' => get_option('includeStart'),
            'includeEnd' => get_option('includeEnd'),
            'removeElements' => $remove_element,
            'stripElements' => $strip_element,
            'stripAttributes' => $strip_attributes,
            'replaceDomain' => get_option('replaceDomain'),
            'importLocal' => get_option('importLocal'),
            'importRemote' => get_option('importRemote'),
        );

        $spider = new spider($this->domain, $this->URL, get_home_path() . '/wp-content/plugins/site-importer/log', FALSE);
        $spider->max_depth($depth);
        $spider->set_limit($limit);
        $spider->scrape_params($params);
        $spider->crawl($this->domain . $this->URL);
        // Number of nested entries in the crawl output (total elements minus the top-level ones).
        $amount = count($spider->output, COUNT_RECURSIVE) - count($spider->output);

        print '<script>document.getElementById(\'loadingp\').innerHTML = "Analysing Pages";</script>';
        flush();
        // The return value is not used; the results are read from $spider->formatted_output below.
        $spider->formatted_page_info($spider->output);
        $pages = is_array($spider->formatted_output) ? $spider->formatted_output : array();

        if ($is_test) {
            if (count($pages) > 0) {
                $sample_html = '';
                foreach ($pages as $page) {
                    $import_row = '';
                    $info_row = '';
                    foreach ($page as $key => $meta) {
                        if ($key == 'Images') {
                            $image_count = is_array($meta) ? count($meta) : 0;
                            $import_row .= '<tr><th>' . $key . '</th><td>' . $image_count . ' images</td></tr>';
                        } elseif ($key != 'URL' && $key != 'Category' && $key != 'Scrape' && $key != 'Filter' && $key != 'Formatted') {
                            if ($key == 'Title' || $key == 'description') {
                                $import_row .= '<tr><th>' . $key . '</th><td>' . $meta . '</td></tr>';
                            } else {
                                $info_row .= '<tr><th>' . $key . '</th><td>' . $meta . '</td></tr>';
                            }
                        }
                    }
                    print '<h3>Page info for ' . $page['URL'] . '</h3>';
                    print '<table class="white">';
                    print '<tr><th colspan="2" class="grey">Page Info</th></tr>' . $info_row;
                    print '<tr><th colspan="2" class="grey">Import Data</th></tr>' . $import_row;
                    print '<tr><th colspan="2" class="grey">Key</th></tr>';
                    print '<tr><th class="red" width="150">red</th><td>items will be filtered out when imported into the site</td></tr>';
                    print '<tr><th class="green">Green</th><td>item have been amended from the original</td></tr>';
                    print '</table>';
                    // Remember the most recent page's HTML; the sample below shows the last page processed.
                    $sample_html = isset($page['Formatted']) ? $page['Formatted'] : '';
                }
                print '<h3>Scraped HTML sample</h3><div id="formatted_html">' . $sample_html . '</div>';
            } else {
                // No page was scraped, so report against the URL that was requested.
                print '<div class="error"><p>Unable to display details for page ' . $this->domain . $this->URL . '- no results(' . count($pages) . ')</p></div>';
            }
        } else {
            print '<h3>Spidering results</h3>';
            print 'Found <b>' . $amount . '</b> URLS from(' . $this->domain . $this->URL . ')';
            if (count($pages) != $amount) {
                print ' but only <b>' . count($pages) . '</b> had the type of structure to be imported';
                print '(' . htmlentities(get_option('startHTML')) . ') ' . "\n<br />";
            }

            // Replace any previous crawl results with this run's results.
            $wpdb->query('TRUNCATE ' . $table);
            $post_type = get_option('postType');
            $SQL = 'INSERT INTO `' . $table . '` ';
            $SQL .= '(`url`, `category`, `title`, `description`, `keywords`, `pre_text`, `post_text`, `display_text`, `images`, `post_name`, `post_type`) ';
            $SQL .= ' VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )';

            $correct_pages = '';
            foreach ($pages as $page_data) {
                $keywords = isset($page_data['keywords']) ? $page_data['keywords'] : '';
                $description = isset($page_data['description']) ? $page_data['description'] : '';
                $title = isset($page_data['Title']) ? $page_data['Title'] : '';
                $images = isset($page_data['Images']) ? serialize($page_data['Images']) : '';
                $page_url = $page_data['URL'];
                $post_name = $this->get_post_name($page_url);
                $wpdb->query($wpdb->prepare($SQL, array($page_url, $page_data['Category'], $title, $description, $keywords, $page_data['Scrape'], $page_data['Filter'], $page_data['Formatted'], $images, $post_name, $post_type)));
                // An empty URL is the site root: link to the domain once and show the domain as the link text.
                $link_text = $page_url == '' ? $this->domain : $page_url;
                $correct_pages .= 'Found <a href="' . $this->domain . $page_url . '" target="_blank">' . $link_text . '</a><br />';
            }

            print '<h3>Spidering details</h3><p></p>';
            $no_valid_html = '';
            $no_html_block = '';
            $messages = is_array($spider->error) ? $spider->error : array();
            foreach ($messages as $message) {
                // Only errors are reported; warnings and other message types are ignored.
                if ($message['type'] != 'error') {
                    continue;
                }
                $page_path = $message['page'];
                $link_text = $page_path == '' ? $this->domain : $page_path;
                $link = '<a href="' . $this->domain . $page_path . '" target="_blank">' . $link_text . '</a><br />';
                if ($message['errno'] == '1') {
                    $no_valid_html .= 'Not Valid Page - ' . $link;
                } elseif ($message['errno'] == '2') {
                    $no_html_block .= 'No HTML block ' . $link;
                }
                // Other error numbers are intentionally not listed.
            }
            if ($no_valid_html != '') {
                print '<p><b>The following pages are not valid</b></p>' . $no_valid_html;
            }
            if ($no_html_block != '') {
                print '<p><b>The following pages have no main HTML block</b></p>' . $no_html_block;
            }
            if ($correct_pages != '') {
                print '<p><b>The following pages are correct</b></p>' . $correct_pages;
            }
            print '<br />Finished spidering site and you can <a href="?page=site_importer&tab=review">review and import the results here</a>';
            print '<script>var tab=document.getElementById(\'esitab_review\'); tab.style.pointerEvents = \'auto\'; tab.style.color= \'#555555\'</script>';
        }
        print '<script>document.getElementById(\'loading\').style.display = \'none\';</script>';
        print '</div>';
    }