/**
 * Reads (site, name) pairs from the second table of the loaded page and
 * returns them as a JSON array of objects with the keys "nome" and "sito".
 *
 * Rows are read starting from the table's second <tr>: the first <th> of a
 * row is stored in $this->siti, the second <th> in $this->nomi. Reading stops
 * at the first row whose first cell is empty, or right after storing a row
 * whose second cell is empty.
 *
 * The result is produced with json_encode() so that quotes, backslashes and
 * control characters inside the scraped text are escaped; the previous
 * hand-concatenated string emitted invalid JSON for such values. Apart from
 * insignificant whitespace the emitted document is the same.
 *
 * @return string JSON array, "[]" when no row was found.
 */
function scrape()
{
    $riga = 2;
    $i = 0;
    do {
        // Walk the rows: first header cell of the current row.
        parent::XPathFilter("//table[2]/tr[{$riga}]/th[1]");
        $tmp = parent::toString('txt');
        if ($tmp == "") {
            // The original used `continue`, which in a do-while re-tests the
            // (now false) condition and leaves the loop; `break` says so.
            break;
        }
        $this->siti[$i] = $tmp;

        // Second header cell of the same row.
        parent::XPathFilter("//table[2]/tr[{$riga}]/th[2]");
        $tmp = parent::toString('txt');
        $this->nomi[$i] = $tmp;

        $riga++;
        $i++;
    } while ($tmp != "");

    // Nothing may have been collected at all (empty or missing table).
    $nomi = (isset($this->nomi) && is_array($this->nomi)) ? $this->nomi : array();

    $entries = array();
    foreach ($nomi as $k => $nome) {
        $sito = isset($this->siti[$k]) ? $this->siti[$k] : "";
        $entries[] = array(
            "nome" => trim((string) $nome),
            "sito" => trim((string) $sito),
        );
    }

    // Keep non-ASCII text and slashes readable, as the raw concatenation did,
    // and do not fail the whole document on a single badly encoded cell.
    return json_encode(
        $entries,
        JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_PARTIAL_OUTPUT_ON_ERROR
    );
}
/**
 * Extracts the metadata of a "unibo" article from the currently loaded page.
 *
 * For every fragment (title, year, authors, doi, abstract, citations) it
 * stores the extracted text on the object and records where it was found:
 * a location suffix in $this->locations plus the character offsets of the
 * text in $this->start_array / $this->end_array (lengths are measured with
 * mb_strlen in UTF-8). Author offsets are stored under the author's numeric
 * index; citation data is stored under the "citos" key.
 *
 * Each cited paragraph is also parsed through getAuthorsUnibo(), getYear(),
 * getTitleUnibo() and getDoiAndURL() and wrapped in a citationcontainer
 * stored in $this->citations.
 */
function unibo()
{
    $this->location = "div1_div3_div2";

    // Title
    parent::XPathFilter("//div[@id=\"articleTitle\"]/h3");
    $this->title = trim(parent::toString("txt"));
    $this->locations["title"] = "_div3_div2_h31";
    $this->start_array["title"] = 0;
    $this->end_array["title"] = mb_strlen($this->title, "UTF-8");

    // Publication year: taken from the last space-separated word of the
    // breadcrumb link, skipping its first character and keeping the next four.
    parent::XPathFilter("//div[@id=\"breadcrumb\"]//a[@target=\"_parent\"][2]");
    $breadcrumb = parent::toString("txt");
    $words = explode(" ", $breadcrumb);
    $beforeParenthesis = explode("(", $breadcrumb);
    $this->year = substr($words[count($words) - 1], 1, 4);
    $this->locations["year"] = "_div2_a2";
    // The year starts one character after the text preceding the first "(".
    $yearStart = mb_strlen($beforeParenthesis[0], "UTF-8") + 1;
    $this->start_array["year"] = $yearStart;
    $this->end_array["year"] = $yearStart + mb_strlen($this->year, "UTF-8");

    // Authors
    parent::XPathFilter("//div[@id=\"authorString\"]/em");
    $rawAuthors = explode(", ", parent::toString("txt"));
    if ($rawAuthors[0] === "") {
        // explode() on an empty string yields a one-element array holding an
        // empty string at key 0, hence this explicit check.
        $this->authors = null;
    } else {
        $this->authors = array_map('trim', $rawAuthors);
        $this->locations["authors"] = "_div3_div3_em1";

        $cursor = 0;
        foreach ($this->authors as $index => $name) {
            // Every author after the first begins two characters (", ")
            // past the end of the previous one.
            $from = ($index == 0) ? 0 : $cursor + 2;
            $cursor = $from + mb_strlen($name, "UTF-8");
            $this->start_array[$index] = $from;
            $this->end_array[$index] = $cursor;
        }

        // Unused for unibo, but kept for uniformity with dlib.
        $this->locations["firstauthors"] = "";
        $this->locations["continuation_of_authors"] = "";
    }

    // DOI
    parent::XPathFilter("//a[@id=\"pub-id::doi\"]");
    $this->doi = trim(parent::toString("txt"));
    $this->locations["doi"] = "_div3_a1";
    $this->start_array["doi"] = 0;
    $this->end_array["doi"] = mb_strlen($this->doi, "UTF-8");

    // Abstract
    parent::XPathFilter("//div[@id=\"articleAbstract\"]//div");
    $this->abstract = trim(parent::toString("txt"));
    // Look at the node with its HTML tags still in place to tell whether the
    // abstract text sits directly in the div or inside a <p> within it.
    $markupHead = substr(parent::toString(), 5, 3);
    $wrappedInParagraph = strpos($markupHead, "<p") !== FALSE;
    $this->locations["abstract"] = $wrappedInParagraph
        ? "_div3_div4_div1" . "_p1"
        : "_div3_div4_div1";
    $this->start_array["abstract"] = 0;
    $this->end_array["abstract"] = mb_strlen($this->abstract, "UTF-8");

    // On unibo introductions live, at most, inside the PDF documents, so the
    // scraper is not required to extract them.

    // Citations
    parent::XPathFilter("//body//div[@id=\"articleCitations\"]/div[1]//p");
    $paragraphs = $this->nodes->each(function ($node) {
        return $node->text();
    });

    foreach ($paragraphs as $n => $text) {
        // Target of the annotation: the n-th paragraph (1-based in the path).
        $position = $n + 1;
        $this->locations["citos"][$n] = "_div3_div7_div1_p{$position}";
        $this->start_array["citos"][$n] = 0;
        $this->end_array["citos"][$n] = $this->start_array["citos"][$n] + mb_strlen(trim($text), "UTF-8");

        // Authors, year and title of the cited work.
        $citedAuthors = getAuthorsUnibo($text);
        $citedYear = getYear($text, $this->year);
        $citedTitle = getTitleUnibo($text, $citedYear, $citedAuthors);

        // DOI and URL of the citation: index 0 is the URL, index 1 the DOI.
        $pair = getDoiAndURL($text);
        $citedUrl = isset($pair[0]) ? $pair[0] : null;
        $citedDoi = isset($pair[1]) ? $pair[1] : null;

        $this->citations[$n] = new citationcontainer(
            $text,
            $citedTitle,
            $citedYear,
            $citedAuthors,
            $citedDoi,
            $citedUrl
        );
    }
}
// Written by Davide Quadrelli
//
// Endpoint: loads the page given in the "url" query parameter and echoes a
// JSON array [locationPrefix, markup], where locationPrefix identifies the
// container the markup was taken from and markup is that container's content.
// Nothing is printed when "url" is missing or is not a string.

// NOTE(review): "html" is not a valid media type and the body is JSON;
// left untouched because existing clients may rely on the current header.
header("Content-Type:html;charset=UTF-8");
require_once 'include/Scraper.php';

$toret = array();

// is_string() rejects array-valued parameters (?url[]=...), which would
// otherwise reach strpos() and loadPage() as an array.
if (isset($_GET['url']) && is_string($_GET['url'])) {
    $url = $_GET['url'];

    // NOTE(review): the URL is caller-supplied and fetched server-side;
    // consider restricting it to an allow-list of hosts (SSRF).
    $scraper = new Scraper();
    $scraper->loadPage($url, true);

    // strpos() returns 0 for a match at the very start of the string, so it
    // must be compared against false rather than used as a boolean.
    if (strpos($url, "dlib") !== false) {
        // D-Lib page
        $scraper->XPathFilter("html/body[1]/form/table[3]/tr[1]/td[1]/table[5]/tr[1]/td[1]/table[1]/tr[1]/td[2]/node()");
        $toret[0] = "form1_table3_tr1_td1_table5_tr1_td1_table1_tr1_td2_";
    } elseif (strpos($url, "unibo") !== false) {
        // AlmaJournal article
        $toret[0] = "div1_div3_div2_";
        $scraper->XPathFilter("//div[@id=\"main\"]/node()");
        if ($scraper->toString() == "") {
            // A unibo site, but not an article: fall back to the whole body.
            $toret[0] = "body1_";
            $scraper->XPathFilter("//body");
        }
    } else {
        // Any other site: return the whole body.
        $toret[0] = "body1_";
        $scraper->XPathFilter("//body");
    }

    $toret[1] = $scraper->toString();
    echo json_encode($toret);
}