/**
 * Initialise the parent, load the page at the given URL and reset the
 * nomi / siti lists to empty arrays.
 *
 * @param mixed $url Value forwarded unchanged to the parent's loadPage().
 */
public function __construct($url)
{
    // Parent set-up must run before the page is loaded through it.
    parent::__construct();
    parent::loadPage($url);

    // NOTE(review): "nomi" / "siti" presumably collect names and sites
    // found later on — confirm against the methods that fill them.
    $this->nomi = [];
    $this->siti = [];
}
/**
 * Re-load the current page from a sanitised local copy.
 *
 * Downloads $this->url, scrubs byte sequences that are not valid UTF-8,
 * strips ASCII control characters (tab, LF and CR are kept), writes the
 * result to a temporary file in the current working directory and then asks
 * the parent to load that file through this server's own HTTP address.
 *
 * The temporary file name is stored in $this->to_clean; this method does not
 * delete the file itself.
 *
 * @return void
 * @throws \RuntimeException When the sanitised copy cannot be written.
 */
public function bestemmie_are_coming()
{
    // Stricter alternative kept for reference (drops everything outside the
    // XML 1.0 character range):
    // $pattern = '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u';
    $pattern = '/[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F]/u';

    $content = file_get_contents($this->url);
    if ($content === false) {
        // Keep the historical best-effort behaviour: a failed download is
        // treated as an empty page instead of passing `false` along.
        $content = '';
    }

    // Re-encoding UTF-8 to UTF-8 gets rid of invalid byte sequences.
    $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8');

    // Remove non-printable control characters.
    $new = preg_replace($pattern, '', $content);
    if ($new === null) {
        // preg_replace() signals an internal error with null; fall back to
        // the unfiltered text rather than writing an empty file.
        $new = $content;
    }

    // time() alone collides when two requests arrive within the same second,
    // so one request could read the other's page. uniqid() with extra
    // entropy keeps the timestamp prefix and ".tmp" suffix but makes the
    // name practically unique.
    $this->to_clean = uniqid(time() . '_', true) . ".tmp";

    if (file_put_contents($this->to_clean, $new) === false) {
        throw new \RuntimeException(
            'Unable to write sanitised page copy to ' . $this->to_clean
        );
    }

    $base = $_SERVER['SERVER_NAME'];
    // NOTE(review): getInitialURL() is assumed to reduce the script path to
    // its directory part (ending in "/") — confirm in the parent class.
    $last = parent::getInitialURL($_SERVER['PHP_SELF']);

    // NOTE(review): the scheme is hard-coded to http and the temp file is
    // assumed to be reachable next to the running script — confirm.
    parent::loadPage("http://" . $base . $last . $this->to_clean);
}
<?php //scritto da Davide Quadrelli header("Content-Type:html;charset=UTF-8"); require_once 'include/Scraper.php'; $toret = array(); if (isset($_GET['url'])) { $scraper = new Scraper(); $scraper->loadPage($_GET["url"], true); if (strpos($_GET['url'], "dlib")) { //pagina di d-lib $scraper->XPathFilter("html/body[1]/form/table[3]/tr[1]/td[1]/table[5]/tr[1]/td[1]/table[1]/tr[1]/td[2]/node()"); $toret[0] = "form1_table3_tr1_td1_table5_tr1_td1_table1_tr1_td2_"; } else { if (strpos($_GET['url'], "unibo")) { //articolo di almajournal $toret[0] = "div1_div3_div2_"; /*$scraper->XPathFilter("//div[@id=\"articleTitle\"] | //div[@id=\"authorString\"] | //div[@id=\"articleAbstract\"] | //div[@id=\"articleSubject\"] | //div[@id=\"articleFullText\"] | //div[@id=\"authorString\"] | //div[@id=\"articleCitations\"] | //a[@id=\"pub-id::doi\"]");*/ $scraper->XPathFilter("//div[@id=\"main\"]/node()"); if ($scraper->toString() == "") { //è un sito unibo ma non un articolo $toret[0] = "body1_"; $scraper->XPathFilter("//body"); } } else { $toret[0] = "body1_"; $scraper->XPathFilter("//body"); } } $toret[1] = $scraper->toString();