Workflow:
2. Prep the document by removing script tags, CSS, etc.
3. Build Readability's DOM tree.
4. Grab the article content from the current DOM tree.
4. Replace the current DOM tree with the new one.
5. Read peacefully.
/**
 * Extract the title and main content of a page using php-readability.
 *
 * Downloads $url, runs Readability over the returned HTML and collects the
 * plain text (textContent) of the title and content nodes it exposes.
 *
 * @param string $url Address of the page to fetch; also handed to Readability
 *                    as the document URL.
 *
 * @return array Map with the keys 'title' and 'content'. Both values are empty
 *               strings when the page could not be downloaded; either value is
 *               an empty string when Readability exposes no node for it.
 */
function getArticle($url)
{
    // file_get_contents() returns false (and raises a warning) when the fetch
    // fails; there is nothing to parse then, so keep the result shape and bail.
    $html = file_get_contents($url);
    if ($html === false) {
        return array('title' => '', 'content' => '');
    }

    $Readability = new Readability($html, $url);
    // The return value of init() is deliberately not used to gate the result,
    // matching the previous behaviour; the nodes are checked individually below.
    // NOTE(review): confirm whether a false result from init() should be
    // reported to the caller instead.
    $Readability->init();

    $titleNode = $Readability->getTitle();
    $contentNode = $Readability->getContent();

    return array(
        'title' => is_object($titleNode) ? $titleNode->textContent : '',
        'content' => is_object($contentNode) ? $contentNode->textContent : '',
    );
}
/**
 * Resolve a title for a link, trying Google's cached copy before the live page.
 *
 * Each candidate source is downloaded and parsed with Readability (always using
 * the original $url as the document URL); the first non-empty title wins.
 *
 * @param string $url Address of the page whose title is wanted.
 *
 * @return string The title text, or 'This link has no title' when no source
 *                could be fetched or none yielded a non-empty title.
 */
function getTitle($url)
{
    // Swap only a leading http:// or https:// for the cache prefix. A plain
    // str_replace() would also rewrite "http://" inside a query string and
    // would leave https URLs unchanged, fetching the live page twice.
    // NOTE(review): Google's public web cache has reportedly been retired —
    // confirm this endpoint still serves content.
    $cachedURL = preg_replace(
        '#^https?://#i',
        'http://webcache.googleusercontent.com/search?q=cache:',
        $url,
        1
    );

    // Cached copy first, live page as fallback; skip the cache entry when the
    // URL had no scheme to rewrite (or the rewrite failed) to avoid a duplicate fetch.
    $sources = array();
    if (is_string($cachedURL) && $cachedURL !== $url) {
        $sources[] = $cachedURL;
    }
    $sources[] = $url;

    foreach ($sources as $source) {
        $html = file_get_contents($source);
        if ($html === false) {
            // Download failed (a warning was already raised); try the next source.
            continue;
        }

        $readability = new Readability($html, $url, 'libxml', false);
        $readability->init();

        $titleNode = $readability->getTitle();
        $title = is_object($titleNode) ? (string) $titleNode->textContent : '';
        if ($title !== '') {
            return $title;
        }
    }

    return 'This link has no title';
}