Esempio n. 1
0
/**
 * Scrape a page and re-emit every <ul> it contains as a titled section.
 *
 * The markup returned by scrapePage() is cleaned up (duplicate </ul> closers
 * collapsed, 'files//' links rewritten to an absolute URL, document-level
 * wrapper tags removed), parsed with DOMDocument, and each <ul> is rendered
 * as <div id="..."><h3>heading</h3><ul><li>...</li></ul></div>. Headings are
 * taken, in document order, from the elements whose class is exactly
 * 'secHed'; the n-th heading is paired with the n-th list.
 *
 * NOTE(review): relies on scrapePage() returning the page markup as a string
 * and on an innerXML() helper defined elsewhere - confirm both.
 *
 * @param string $url Address of the page to scrape.
 *
 * @return string HTML fragment wrapped in <div id="lessonplan-content">.
 */
function outputScrape($url)
{
    // Collapse runs of consecutive </ul> closers into a single one.
    $markup = preg_replace('#(<\\/ul>\\s*)+#i', '</ul>', scrapePage($url));
    // Turn the relative 'files//' prefix into an absolute resource URL.
    $markup = str_replace('files//', 'http://resources21.org/cl/files/', $markup);
    // Drop the <head> section, the html/body wrapper tags and cell boundaries.
    $markup = preg_replace(array('/<head>(.*)<\\/head>/iUs', '/<html>/', '/<\\/html>/', '/<body>/', '/<\\/body>/', '/<\\/td><td>/'), "", $markup);

    $returnvalue = '<div id="lessonplan-content">';

    // Nothing to parse (failed scrape or regex error): DOMDocument::loadHTML()
    // rejects empty input, so return the empty wrapper straight away.
    if (!is_string($markup) || trim($markup) === '') {
        return $returnvalue . "</div>";
    }

    // Scraped HTML is rarely well-formed. Collect libxml's complaints
    // internally *before* parsing instead of letting them surface as PHP
    // warnings, then restore whatever setting the caller had.
    $previousSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($markup);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXPath($doc);
    $elements = $xpath->query('//ul');
    $elementsh = $xpath->query("//*[@class='secHed']");

    // DOMXPath::query() reports failure with false, not null.
    $header = array();
    if ($elementsh !== false) {
        foreach ($elementsh as $elementh) {
            // nodeValue is decoded text, so it must be re-escaped for HTML.
            $header[] = "<h3>" . htmlspecialchars($elementh->nodeValue, ENT_QUOTES, 'UTF-8') . "</h3>";
        }
    }

    if ($elements !== false) {
        $i = 0;
        foreach ($elements as $element) {
            // There may be fewer headings than lists; emit no title then.
            $title = isset($header[$i]) ? $header[$i] : '';
            $id = htmlspecialchars($element->getAttribute('id'), ENT_QUOTES, 'UTF-8');
            $returnvalue .= '<div id="' . $id . '">' . $title . "<ul>";
            // Every direct child of the list becomes one <li> entry.
            foreach ($element->childNodes as $node) {
                $returnvalue .= '<li>' . innerXML($node) . "</li>\n";
            }
            $returnvalue .= "</ul></div>";
            $i++;
        }
    }

    // Always close the wrapper opened above so the fragment stays balanced.
    $returnvalue .= "</div>";

    return $returnvalue;
}
            scraperWiki::save_sqlite(array('name'), $payload);
        }
    }
}
// Scrape the first page of the "most common surnames in Great Britain" list.
// The calls for pages 2-5 of the same list are left disabled below.
scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-2.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-3.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-4.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-5.html");
require 'scraperwiki/simple_html_dom.php';
/**
 * Fetch a page and store one record per table row that has a name and a count.
 *
 * For every <td class="nom"> cell the text of its first link is taken as the
 * name, and the text of the <td class="compte"> cell in the same parent
 * element as the count. Rows with an empty (falsy) count are not saved, and
 * rows missing either the link or the count cell are skipped.
 *
 * NOTE(review): 'name' is passed as the first argument of
 * scraperWiki::save_sqlite(), presumably the unique key so re-runs update
 * rather than duplicate rows - confirm against the ScraperWiki API.
 *
 * @param string $url Address of the page to scrape.
 *
 * @return void
 */
function scrapePage($url)
{
    $html = scraperWiki::scrape($url);
    // Nothing usable came back from the fetch; there is nothing to parse.
    if (!is_string($html) || $html === '') {
        return;
    }

    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find('td.nom') as $cell) {
        $link = $cell->find('a', 0);
        $parent = $cell->parent();
        $countCell = $parent ? $parent->find('td.compte', 0) : null;

        // find(..., 0) yields null when nothing matches; reading ->plaintext
        // from null would abort the whole run, so skip incomplete rows.
        if (!$link || !$countCell) {
            continue;
        }

        $name = $link->plaintext;
        $count = $countCell->plaintext;
        if ($count) {
            $payload = array('name' => $name, 'count' => $count);
            scraperWiki::save_sqlite(array('name'), $payload);
        }
    }

    // simple_html_dom nodes reference each other; release them explicitly so
    // scraping several pages in one run does not accumulate memory.
    $dom->clear();
}
// Scrape the first page of the "most common surnames in Great Britain" list.
// The calls for pages 2-5 of the same list are left disabled below.
scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-2.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-3.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-4.html");
//scrapePage("http://surname.sofeminine.co.uk/w/surnames/most-common-surnames-in-great-britain-5.html");