/**
 * Reads consecutive rows of the second table in the loaded page and returns
 * them as a JSON array of {"nome", "sito"} objects.
 *
 * Starting at row 2, the text of th[1] is stored in $this->siti and the text
 * of th[2] in $this->nomi, both under the same zero-based index. Scanning
 * stops at the first row whose th[1] text is empty (that row is not stored),
 * or right after storing a row whose th[2] text is empty.
 *
 * Values are JSON-encoded individually, so quotes, backslashes and control
 * characters in the scraped text can no longer produce invalid JSON.
 *
 * @return string JSON array text, "[]" when no rows were collected.
 */
function scrape()
 {
     $riga = 2;
     $i = 0;
     while (true) {
         // First header cell of the row: the site.
         parent::XPathFilter("//table[2]/tr[{$riga}]/th[1]");
         $sito = parent::toString('txt');
         if ((string) $sito === "") {
             // No more rows to read.
             break;
         }
         $this->siti[$i] = $sito;
         // Second header cell of the row: the name.
         parent::XPathFilter("//table[2]/tr[{$riga}]/th[2]");
         $nome = parent::toString('txt');
         $this->nomi[$i] = $nome;
         $riga++;
         $i++;
         if ((string) $nome === "") {
             // A row without a name is stored, then ends the scan.
             break;
         }
     }
     // Encodes one trimmed value as a JSON string literal. Slashes and
     // non-ASCII characters are left unescaped so ordinary values keep the
     // same bytes the previous hand-built output had.
     $encode = function ($value) {
         $json = json_encode(trim($value), JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
         // json_encode() returns false on e.g. malformed UTF-8; fall back to
         // an empty string so the overall document stays valid JSON.
         return $json === false ? '""' : $json;
     };
     $items = array();
     foreach ($this->nomi as $k => $v) {
         $items[] = '{"nome" : ' . $encode($v) . ',"sito" : ' . $encode($this->siti[$k]) . '}';
     }
     return "[" . implode(",", $items) . "]";
 }
 /**
  * Extracts the metadata of a unibo article page from the loaded document.
  *
  * Fills $this->title, $this->year, $this->authors, $this->doi,
  * $this->abstract and $this->citations. For each extracted field it also
  * records a location suffix in $this->locations and character offsets
  * (measured with mb_strlen in UTF-8) in $this->start_array and
  * $this->end_array. Author offsets are stored under the author's integer
  * index; citation data is stored under the "citos" key.
  *
  * NOTE(review): the location strings look like paths to the DOM node that
  * holds each field, relative to $this->location - confirm with the consumer.
  *
  * @return void
  */
 function unibo()
 {
     $this->location = "div1_div3_div2";
     // Title
     parent::XPathFilter("//div[@id=\"articleTitle\"]/h3");
     $this->title = trim(parent::toString("txt"));
     $this->locations["title"] = "_div3_div2_h31";
     $this->start_array["title"] = 0;
     $this->end_array["title"] = mb_strlen($this->title, "UTF-8");
     // Publication year: the last space-separated token of the breadcrumb
     // link is expected to look like "(YYYY)".
     parent::XPathFilter("//div[@id=\"breadcrumb\"]//a[@target=\"_parent\"][2]");
     $breadcrumb = parent::toString("txt");
     // Trim before tokenising so trailing whitespace does not leave an empty
     // last token (which would yield an empty year).
     $tokens = explode(" ", trim($breadcrumb));
     $beforeParen = explode("(", $breadcrumb);
     // Skip the opening parenthesis and keep the four digits.
     $this->year = substr($tokens[count($tokens) - 1], 1, 4);
     $this->locations["year"] = "_div2_a2";
     // Offset of the first character after the first "(" in the raw text.
     $start = mb_strlen($beforeParen[0], "UTF-8") + 1;
     $this->start_array["year"] = $start;
     $this->end_array["year"] = $start + mb_strlen($this->year, "UTF-8");
     // Authors
     parent::XPathFilter("//div[@id=\"authorString\"]/em");
     $this->authors = explode(", ", parent::toString("txt"));
     if ($this->authors[0] === "") {
         // explode() on an empty string yields a one-element array holding an
         // empty string at key 0, so that case has to be detected explicitly.
         $this->authors = null;
     } else {
         $this->locations["authors"] = "_div3_div3_em1";
         $end = 0;
         foreach ($this->authors as $key => $val) {
             $val = trim($val);
             $this->authors[$key] = $val;
             // Each author after the first starts past the ", " separator.
             $start = ($key === 0) ? 0 : $end + 2;
             $end = $start + mb_strlen($val, "UTF-8");
             $this->start_array[$key] = $start;
             $this->end_array[$key] = $end;
         }
         // Unused for unibo, but kept for uniformity with dlib.
         $this->locations["firstauthors"] = "";
         $this->locations["continuation_of_authors"] = "";
     }
     // DOI
     parent::XPathFilter("//a[@id=\"pub-id::doi\"]");
     $this->doi = trim(parent::toString("txt"));
     $this->locations["doi"] = "_div3_a1";
     $this->start_array["doi"] = 0;
     $this->end_array["doi"] = mb_strlen($this->doi, "UTF-8");
     // Abstract
     parent::XPathFilter("//div[@id=\"articleAbstract\"]//div");
     $this->abstract = trim(parent::toString("txt"));
     // Take the node with its HTML tags still present to check whether the
     // abstract text sits directly in the div or in a <p> inside the div.
     $html = parent::toString();
     $this->locations["abstract"] = "_div3_div4_div1";
     // Only the three bytes starting at offset 5 are inspected.
     // NOTE(review): this presumably assumes the markup begins with a bare
     // "<div>" - confirm.
     if (strpos(substr($html, 5, 3), "<p") !== false) {
         $this->locations["abstract"] .= "_p1";
     }
     $this->start_array["abstract"] = 0;
     $this->end_array["abstract"] = mb_strlen($this->abstract, "UTF-8");
     // On unibo the introductions are at most inside the PDF documents, so
     // the scraper is not required to extract them.
     // Citations: one entry per <p> under the first div of articleCitations.
     parent::XPathFilter("//body//div[@id=\"articleCitations\"]/div[1]//p");
     $set = $this->nodes->each(function ($node) {
         return $node->text();
     });
     foreach ($set as $nann => $v) {
         // Target: the location suffix uses the 1-based paragraph position.
         $pos = $nann + 1;
         $this->locations["citos"][$nann] = "_div3_div7_div1_p{$pos}";
         $this->start_array["citos"][$nann] = 0;
         $this->end_array["citos"][$nann] = mb_strlen(trim($v), "UTF-8");
         // Authors of the citation
         $authors = getAuthorsUnibo($v);
         // Year of the citation
         $year = getYear($v, $this->year);
         // Title of the citation
         $title = getTitleUnibo($v, $year, $authors);
         // DOI and URL of the citation: index 0 is the URL, index 1 the DOI;
         // either may be missing.
         $coppia = getDoiAndURL($v);
         $url = isset($coppia[0]) ? $coppia[0] : null;
         $doi = isset($coppia[1]) ? $coppia[1] : null;
         $this->citations[$nann] = new citationcontainer($v, $title, $year, $authors, $doi, $url);
     }
 }
Пример #3
0
<?php

//written by Davide Quadrelli
header("Content-Type:html;charset=UTF-8");
require_once 'include/Scraper.php';
$toret = array();
if (isset($_GET['url'])) {
    $scraper = new Scraper();
    $scraper->loadPage($_GET["url"], true);
    if (strpos($_GET['url'], "dlib")) {
        //pagina di d-lib
        $scraper->XPathFilter("html/body[1]/form/table[3]/tr[1]/td[1]/table[5]/tr[1]/td[1]/table[1]/tr[1]/td[2]/node()");
        $toret[0] = "form1_table3_tr1_td1_table5_tr1_td1_table1_tr1_td2_";
    } else {
        if (strpos($_GET['url'], "unibo")) {
            //articolo di almajournal
            $toret[0] = "div1_div3_div2_";
            /*$scraper->XPathFilter("//div[@id=\"articleTitle\"] | //div[@id=\"authorString\"] | //div[@id=\"articleAbstract\"] | //div[@id=\"articleSubject\"]
            	 | //div[@id=\"articleFullText\"] | //div[@id=\"authorString\"] | //div[@id=\"articleCitations\"] | //a[@id=\"pub-id::doi\"]");*/
            $scraper->XPathFilter("//div[@id=\"main\"]/node()");
            if ($scraper->toString() == "") {
                //è un sito unibo ma non un articolo
                $toret[0] = "body1_";
                $scraper->XPathFilter("//body");
            }
        } else {
            $toret[0] = "body1_";
            $scraper->XPathFilter("//body");
        }
    }
    $toret[1] = $scraper->toString();