示例#1
0
 /**
  * Build the document from a web page.
  *
  * Fetches $url through Http::Request(), converts the payload to UTF-8, runs
  * Readability over it, fills in the "title" and "author" metadata when they
  * are not already set, wraps the extracted article in a minimal UTF-8 HTML
  * document and stores the result of handleImages() and the serialised markup.
  *
  * @param string $url Page address; "http://" is prepended when the value
  *                    does not start with an http/https scheme
  */
 public function __construct($url)
 {
     if (!preg_match('!^https?://!i', $url)) {
         $url = 'http://' . $url;
     }

     $data = Http::Request($url);
     // Normalise to UTF-8; the source encoding is picked from the candidate list.
     $html = mb_convert_encoding($data, "UTF-8", "UTF-8,ISO-8859-1,ASCII");

     $r = new Readability($html, $url);
     $r->init();

     if (!isset($this->metadata["title"])) {
         $this->metadata["title"] = CharacterEntities::convert(strip_tags($r->getTitle()->innerHTML));
     }
     if (!isset($this->metadata["author"])) {
         // parse_url() yields false for seriously malformed URLs and null when
         // the host component is absent, so fall back to the URL itself
         // instead of indexing a non-array.
         $host = parse_url($url, PHP_URL_HOST);
         $this->metadata["author"] = (is_string($host) && $host !== '') ? $host : $url;
     }

     // Wrap the article so the parser sees an explicit UTF-8 charset declaration.
     $head = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=UTF-8'/></head>";
     $article = $r->getContent()->innerHTML;
     if (substr($article, 0, 5) === "<body") {
         $article = $head . $article . "</html>";
     } else {
         $article = $head . "<body>" . $article . "</body></html>";
     }

     $doc = new DOMDocument();
     // Collect libxml parse errors internally rather than silencing the call
     // with "@"; the previous setting is restored right after parsing.
     $previous = libxml_use_internal_errors(true);
     $loaded = $doc->loadHTML($article);
     libxml_clear_errors();
     libxml_use_internal_errors($previous);
     if (!$loaded) {
         die($article);
     }
     $doc->normalizeDocument();

     $this->images = $this->handleImages($doc, $url);
     $this->text = $doc->saveHTML();
 }
示例#2
0
 /**
  * Return the given string unchanged.
  *
  * The input is assumed to be UTF-8 already and the output is UTF-8 too, so
  * nothing is converted. The earlier CP1252 translation (str_replace() with
  * the tables from generateTables()) sat after the unconditional return and
  * could never execute, so it has been removed.
  *
  * @param string $str Text to convert
  * @return string The same text, untouched
  */
 public static function convert($str)
 {
     return $str;
 }
示例#3
0
 /**
  * Store the content that will be written to the file.
  *
  * @param string $data Data to put in the file
  */
 public function setData($data)
 {
     // The text goes through CharacterEntities::convert() before it is kept.
     $this->source = CharacterEntities::convert($data);
     // NOTE(review): presumably invalidates a previously built PRC result - confirm.
     $this->prc = false;
 }
 /**
  * Set the data to use.
  *
  * Parses $data as HTML, hands the "src" of every <img> element to
  * ImageHandler::DownloadImage(), blanks the "src" attribute and, for images
  * that were obtained, adds a "recindex" attribute holding the record index.
  * The re-serialised markup (with page-break tags rewritten to
  * <mbp:pagebreak/>) becomes the source and the image records are stored in
  * $this->images.
  *
  * @param string $data Data to put in the file
  */
 public function setData($data)
 {
     $data = CharacterEntities::convert($data);

     $dom = new DOMDocument();
     // The markup is expected to carry non-HTML tags such as <pagebreak>,
     // which libxml reports as parse errors; collect them internally so they
     // are not emitted as PHP warnings, then restore the previous setting.
     $previous = libxml_use_internal_errors(true);
     $loaded = $dom->loadHTML($data);
     libxml_clear_errors();
     libxml_use_internal_errors($previous);
     if (!$loaded) {
         die($data);
     }
     $dom->normalizeDocument();

     $images = array();
     // Maps an image src to the record index it was stored under, so a
     // repeated image is requested only once.
     $savedImages = array();
     foreach ($dom->getElementsByTagName('img') as $img) {
         $src = $img->getAttribute("src");
         $img->setAttribute("src", "");

         if (isset($savedImages[$src])) {
             $img->setAttribute("recindex", $savedImages[$src]);
             continue;
         }

         $image = ImageHandler::DownloadImage($src);
         if ($image === false) {
             // No image obtained: the element keeps its empty src and gets no recindex.
             continue;
         }

         $images[$this->imgCounter] = new FileRecord(new Record($image));
         $img->setAttribute("recindex", $this->imgCounter);
         $savedImages[$src] = $this->imgCounter;
         $this->imgCounter++;
     }
     $this->images = $images;

     $data = $dom->saveXML();
     // Both serialised forms of the page-break element are rewritten.
     $data = str_replace(array("<pagebreak/>", "<pagebreak></pagebreak>"), "<mbp:pagebreak/>", $data);

     $this->source = $data;
     $this->prc = false;
 }