/**
 * Builds a feed Item from one RSS entry, using the linked article page as the item body.
 *
 * The article HTML is obtained with get_cached() and parsed with str_get_html().
 * When parsing fails (or any exception is raised while extracting the content),
 * the page is removed from the cache and the exception message becomes the
 * item content, so the caller always receives an Item.
 *
 * @param mixed $newsItem RSS entry exposing `link` and `title`
 *                        (presumably a SimpleXMLElement - confirm against caller).
 * @return Item the populated item; `name` is only set when a `.vcard` block is found.
 */
protected function parseRSSItem($newsItem)
{
    $item = new Item();
    $item->uri = trim($newsItem->link);
    $item->title = trim($newsItem->title);
    $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);

    try {
        // Load the article page for that uri through the cache.
        $articlePage = str_get_html($this->get_cached($item->uri));
        if (!is_object($articlePage)) {
            throw new Exception("cache content for " . $item->uri . " is NOT a Simple DOM parser object !");
        }

        $content = $articlePage->find('.post-content', 0);
        HTMLSanitizer::defaultImageSrcTo($content, $this->getURI());

        $vcard = $articlePage->find('.vcard', 0);
        if (is_object($vcard)) {
            $authorLink = $vcard->find('a', 0);
            $item->name = $authorLink->innertext;
            // TODO use author link href to fill the feed info
        }

        // Content is assigned last so that var_export() dumps of the item stay readable.
        $item->content = $content->innertext;
    } catch (Exception $e) {
        $this->message("obtaining " . $item->uri . " resulted in exception " . $e->getMessage() . ". Deleting cached page ...");
        // The cached file may be invalid, so discard it. The cache is keyed by the
        // item's `uri` (the same value passed to get_cached() above); the previous
        // code passed the never-assigned `url` property, so nothing was evicted.
        $this->remove_from_cache($item->uri);
        $item->content = $e->getMessage();
    }

    return $item;
}
/**
 * Converts one RSS entry into an Item whose content is the sanitized article body.
 *
 * The timestamp is read from the `date` child in the namespace registered under
 * the `dc` prefix. The article page is fetched with file_get_html() and the inner
 * HTML of the first `article.article1` element is sanitized, then passed through
 * HTMLSanitizer::defaultImageSrcTo() with the acrimed.org base URL.
 *
 * @param mixed $newsItem RSS entry exposing `link`, `title`, getNameSpaces() and children()
 *                        (presumably a SimpleXMLElement - confirm against caller).
 * @return Item
 */
protected function parseRSSItem($newsItem)
{
    $sanitizer = new HTMLSanitizer();

    $nsMap = $newsItem->getNameSpaces(true);
    $dcElements = $newsItem->children($nsMap['dc']);

    $result = new Item();
    $result->uri = trim($newsItem->link);
    $result->title = trim($newsItem->title);
    $result->timestamp = strtotime($dcElements->date);

    // Fetch the full article and keep only the main article node's markup.
    $page = $this->file_get_html($newsItem->link);
    $articleNode = $page->find('article.article1', 0);
    $cleanHtml = $sanitizer->sanitize($articleNode->innertext);

    $result->content = HTMLSanitizer::defaultImageSrcTo($cleanHtml, "http://www.acrimed.org/");

    return $result;
}
/**
 * Builds an Item from a news link and appends it to $this->items.
 *
 * The link's href is prefixed with the WORLD_OF_TANKS constant to form the item
 * uri; the page is obtained with get_cached() and parsed with str_get_html().
 * Title, content and timestamp are all read from inside the first `.l-content`
 * element of that page.
 *
 * @param mixed $infoLink element exposing an `href` property
 *                        (presumably a Simple HTML DOM anchor node - confirm against caller).
 * @return void nothing is returned; the item is stored in $this->items.
 */
public function parseLine($infoLink)
{
    $entry = new Item();
    $entry->uri = WORLD_OF_TANKS . $infoLink->href;

    // Load the article page for that uri through the cache.
    $dom = str_get_html($this->get_cached($entry->uri));
    $main = $dom->find('.l-content', 0);
    HTMLSanitizer::defaultImageSrcTo($main, WORLD_OF_TANKS);

    $heading = $main->find('h1', 0);
    $entry->title = $heading->innertext;

    $body = $main->find('.b-content', 0);
    $entry->content = $body->innertext;

    // The publication time is carried by a data attribute on the statistics node.
    $timeNode = $main->find('.b-statistic_time', 0);
    $entry->timestamp = $timeNode->getAttribute("data-timestamp");

    $this->items[] = $entry;
}