/**
 * Extract the main content of an HTML page using the bundled php-readability library.
 *
 * When the Tidy extension is available, the extracted markup is additionally
 * cleaned up with it; if Tidy cannot parse the markup, the un-tidied content
 * is returned instead of aborting.
 *
 * @param mixed $html   Page markup handed to the Readability constructor (presumably an HTML string — confirm against callers).
 * @param mixed $config Not read by this function; kept so existing callers keep working.
 * @param mixed $link   Passed as the second argument to the Readability constructor (presumably the page URL — confirm against callers).
 *
 * @return mixed The inner HTML of the content Readability extracted, or $html unchanged when Readability reports failure.
 */
function performReadability($html, $config, $link) {
    Feediron_Logger::get()->log(Feediron_Logger::LOG_VERBOSE, "Using Readability");

    require_once 'php-readability/Readability.php';
    require_once 'php-readability/JSLikeHTMLElement.php';

    $readability = new Readability\Readability($html, $link);
    $readability->debug = false;
    $readability->convertLinksToFootnotes = true;

    if (!$readability->init()) {
        Feediron_Logger::get()->log(Feediron_Logger::LOG_VERBOSE, "Readability failed to find content");
        // Fall back to the untouched input so the caller still has something to show.
        return $html;
    }

    $content = $readability->getContent()->innerHTML;

    // If Tidy is available, clean the extracted markup up for output.
    if (function_exists('tidy_parse_string')) {
        $tidy = tidy_parse_string($content, array('indent' => true, 'show-body-only' => true), 'UTF8');
        // tidy_parse_string() returns false on failure; calling a method on it
        // would be a fatal error, so keep the un-tidied content in that case.
        if ($tidy !== false) {
            $tidy->cleanRepair();
            $content = $tidy->value;
        }
    }

    return $content;
}
} else { $url = "www." . $url; } $userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)'; // Hacemos el CURL Request $ch = curl_init(); curl_setopt($ch, CURLOPT_USERAGENT, $userAgent); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_FAILONERROR, true); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_AUTOREFERER, true); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_TIMEOUT, 10); $html = curl_exec($ch); // Pasamos el html a Readability $readability = new Readability\Readability($html, $url); $result = $readability->init(); if ($result) { $titulo = $readability->getTitle()->textContent . ". "; //Eliminamos caracteres especiales y dejamos solo utf8 $titulo = preg_replace('/[^0-9\\p{L}`_,;@#%&~\\.\\:\\"\'\\+\\-\\*\\s\\?\\[\\^\\]\\$\\(\\)\\{\\}\\=\\!\\<\\>\\|\\xBF\\xA1\\xAE\\xA9\\\\]/u', '', $titulo); $titulo = trim($titulo); // Eliminamos espacios del inicio y final del string $titulo = preg_replace('/\\s+/', ' ', $titulo); // Reducimos multiples espacios en blanco a solo 1 $articulo = $readability->getContent()->textContent; //Eliminamos caracteres especiales y dejamos solo utf8 $articulo = preg_replace('/[^0-9\\p{L}`_,;@#%&~\\.\\:\\"\'\\+\\-\\*\\s\\?\\[\\^\\]\\$\\(\\)\\{\\}\\=\\!\\<\\>\\|\\xBF\\xA1\\xAE\\xA9\\\\]/u', '', $articulo); $articulo = trim($articulo); // Eliminamos espacios del inicio y final del string $articulo = preg_replace('/\\s+/', ' ', $articulo);