Beispiel #1
0
 /**
  * Extract the main article content from an HTML document with php-readability.
  *
  * When the Tidy extension is available, the extracted markup is additionally
  * cleaned up; if Tidy cannot parse it, the untidied markup is returned instead.
  *
  * @param string $html   HTML of the page to process.
  * @param mixed  $config Not read by this function.
  * @param string $link   Passed to Readability as its second constructor
  *                       argument (presumably the page URL; confirm against the caller).
  *
  * @return string Inner HTML of the content element found by Readability, or
  *                the unmodified $html when Readability reports no content.
  */
 function performReadability($html, $config, $link)
 {
     Feediron_Logger::get()->log(Feediron_Logger::LOG_VERBOSE, "Using Readability");
     require_once 'php-readability/Readability.php';
     require_once 'php-readability/JSLikeHTMLElement.php';

     $readability = new Readability\Readability($html, $link);
     $readability->debug = false;
     $readability->convertLinksToFootnotes = true;

     // Guard clause: without extracted content, hand the input back untouched.
     if (!$readability->init()) {
         Feediron_Logger::get()->log(Feediron_Logger::LOG_VERBOSE, "Readability failed to find content");
         return $html;
     }

     $content = $readability->getContent()->innerHTML;

     // If we've got Tidy, let's clean it up for output.
     if (function_exists('tidy_parse_string')) {
         $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
         // tidy_parse_string() returns false on failure; calling a method on
         // false would be a fatal error, so only use the result when it is an object.
         if ($tidy !== false) {
             $tidy->cleanRepair();
             $content = $tidy->value;
         }
     }

     return $content;
 }
Beispiel #2
0
} else {
    $url = "www." . $url;
}
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
// Hacemos el CURL Request
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);
// Pasamos el html a Readability
$readability = new Readability\Readability($html, $url);
$result = $readability->init();
if ($result) {
    $titulo = $readability->getTitle()->textContent . ". ";
    //Eliminamos caracteres especiales y dejamos solo utf8
    $titulo = preg_replace('/[^0-9\\p{L}`_,;@#%&~\\.\\:\\"\'\\+\\-\\*\\s\\?\\[\\^\\]\\$\\(\\)\\{\\}\\=\\!\\<\\>\\|\\xBF\\xA1\\xAE\\xA9\\\\]/u', '', $titulo);
    $titulo = trim($titulo);
    // Eliminamos espacios del inicio y final del string
    $titulo = preg_replace('/\\s+/', ' ', $titulo);
    // Reducimos multiples espacios en blanco a solo 1
    $articulo = $readability->getContent()->textContent;
    //Eliminamos caracteres especiales y dejamos solo utf8
    $articulo = preg_replace('/[^0-9\\p{L}`_,;@#%&~\\.\\:\\"\'\\+\\-\\*\\s\\?\\[\\^\\]\\$\\(\\)\\{\\}\\=\\!\\<\\>\\|\\xBF\\xA1\\xAE\\xA9\\\\]/u', '', $articulo);
    $articulo = trim($articulo);
    // Eliminamos espacios del inicio y final del string
    $articulo = preg_replace('/\\s+/', ' ', $articulo);