/**
 * Build this object from the web page at the given URL.
 *
 * The URL gets an "http://" prefix when it does not already start with
 * http:// or https://. The response of Http::Request() is re-encoded to
 * UTF-8 and handed to Readability. The "title" and "author" metadata entries
 * are filled in only when they are not already set. The content returned by
 * Readability is wrapped in a minimal UTF-8 HTML document, parsed with
 * DOMDocument, passed through handleImages() and finally stored in
 * $this->text (the result of handleImages() is stored in $this->images).
 *
 * @param string $url Address of the page; the scheme may be omitted
 *
 * @throws \RuntimeException When the wrapped article markup cannot be parsed
 */
public function __construct($url) {
    if (!preg_match('!^https?://!i', $url)) {
        $url = 'http://' . $url;
    }

    $data = Http::Request($url);
    // The source encoding is detected from this candidate list before converting.
    $html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

    $r = new Readability($html, $url);
    $r->init();

    if (!isset($this->metadata["title"])) {
        $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
    }
    if (!isset($this->metadata["author"])) {
        // parse_url() returns false for seriously malformed URLs and leaves
        // out "host" when there is none, so the key must be checked first.
        $parts = parse_url($url);
        $this->metadata["author"] = isset($parts["host"]) ? $parts["host"] : null;
    }

    // The charset declaration makes DOMDocument read the markup as UTF-8.
    $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
    $article = $r->getContent()->innerHTML;
    if (substr($article, 0, 5) === "<body") {
        $article = $head . $article . "</html>";
    } else {
        $article = $head . "<body>" . $article . "</body></html>";
    }

    $doc = new DOMDocument();
    // Collect libxml parse errors internally instead of silencing the call
    // with "@", then restore whatever setting was active before.
    $previous = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($article);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        throw new \RuntimeException('Unable to parse the article extracted from ' . $url);
    }
    $doc->normalizeDocument();

    $this->images = $this->handleImages($doc, $url);
    $this->text = $doc->saveHTML();
}
/**
 * Replace the source data held by this object.
 *
 * The given text is run through CharacterEntities::convert() before it is
 * stored in $this->source, and $this->prc is reset to false.
 *
 * @param string $data Data to put in the file
 */
public function setData($data) {
    $this->source = CharacterEntities::convert($data);
    $this->prc = false;
}
/**
 * Set the data to use.
 *
 * The markup is run through CharacterEntities::convert() and parsed with
 * DOMDocument. Every <img> element has its "src" attribute emptied. When
 * ImageHandler::DownloadImage() returns something other than false for the
 * original src, the result is stored as a FileRecord in $this->images under
 * the current $this->imgCounter value and the element receives that value as
 * its "recindex" attribute. A src that was already stored during this call
 * reuses its index instead of being downloaded again. The document is then
 * serialised with saveXML(), <pagebreak> elements are rewritten to
 * <mbp:pagebreak/>, the result is stored in $this->source and $this->prc is
 * reset to false.
 *
 * $this->images is replaced on every call, while $this->imgCounter keeps
 * counting from its previous value.
 *
 * @param string $data Data to put in the file
 *
 * @throws \RuntimeException When the markup cannot be parsed
 */
public function setData($data) {
    $data = CharacterEntities::convert($data);

    $dom = new DOMDocument();
    // The input is expected to contain non-HTML tags such as <pagebreak>,
    // which libxml reports as errors; collect them internally so they do not
    // surface as PHP warnings, then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    if (!$loaded) {
        throw new \RuntimeException('Unable to parse the supplied data as HTML');
    }
    $dom->normalizeDocument();

    $images = array();
    // Maps an original src value to the record index it was stored under.
    $savedImages = array();

    foreach ($dom->getElementsByTagName('img') as $img) {
        $src = $img->getAttribute("src");
        // The src is always cleared; the image is referenced through
        // "recindex" from here on.
        $img->setAttribute("src", "");

        if (isset($savedImages[$src])) {
            $img->setAttribute("recindex", $savedImages[$src]);
            continue;
        }

        $image = ImageHandler::DownloadImage($src);
        if ($image === false) {
            // Nothing was stored: the element keeps an empty src and gets no recindex.
            continue;
        }

        $images[$this->imgCounter] = new FileRecord(new Record($image));
        $img->setAttribute("recindex", $this->imgCounter);
        $savedImages[$src] = $this->imgCounter;
        $this->imgCounter++;
    }
    $this->images = $images;

    // Both serialisations of an empty <pagebreak> element are rewritten.
    $data = str_replace(
        array("<pagebreak/>", "<pagebreak></pagebreak>"),
        "<mbp:pagebreak/>",
        $dom->saveXML()
    );

    $this->source = $data;
    $this->prc = false;
}