コード例 #1
0
 /**
  * Collects the first five articles listed on the Courrier International
  * home page and appends one item per article to $this->items.
  *
  * For every article the linked page is fetched to extract the sanitized
  * body ("div.article-text") and the publication time ("time" element).
  * An article whose page cannot be loaded, or whose page has no body
  * element, is skipped instead of aborting the whole run.
  *
  * @param array $param Bridge parameters; not read by this method.
  */
 public function collectData(array $param)
 {
     $html = $this->file_get_html('http://www.courrierinternational.com/');
     if (!$html) {
         $this->returnError('Error.', 500);
         // Defensive: there is nothing to iterate over should returnError() ever return.
         return;
     }

     // Only the first five <article> elements are examined.
     foreach (array_slice($html->find('article'), 0, 5) as $article) {
         // The link is read from the element wrapping the <article>.
         $wrapper = $article->parent;
         if (!is_object($wrapper)) {
             continue;
         }

         $uri = (string) $wrapper->getAttribute('href');
         // A link is relative unless it *starts* with "http"; testing for the
         // substring anywhere would misclassify paths such as "/http-foo".
         // NOTE(review): the index page is on .com while this prefix uses .fr - confirm.
         if (strpos($uri, 'http') !== 0) {
             $uri = "http://courrierinternational.fr/" . $uri;
         }

         $item = new \Item();
         $item->uri = $uri;

         $page = $this->file_get_html($item->uri);
         if (!is_object($page)) {
             // The article page could not be fetched or parsed.
             continue;
         }

         $bodies = $page->find('div.article-text');
         if (empty($bodies)) {
             // No article body on the page, hence nothing to publish.
             continue;
         }
         $cleaner = new HTMLSanitizer();
         $item->content = $cleaner->sanitize($bodies[0]);

         $titles = $article->find('.title');
         $item->title = empty($titles) ? '' : strip_tags($titles[0]);

         $times = $page->find('time');
         if (!empty($times)) {
             // NOTE(review): date_parse() receives the string form of the <time>
             // element, as before - confirm that this is a parsable date.
             $dateTime = date_parse($times[0]);
             $item->timestamp = mktime(
                 $dateTime['hour'],
                 $dateTime['minute'],
                 $dateTime['second'],
                 $dateTime['month'],
                 $dateTime['day'],
                 $dateTime['year']
             );
         }

         $this->items[] = $item;
     }
 }
コード例 #2
0
ファイル: AcrimedBridge.php プロジェクト: ORelio/rss-bridge
 /**
  * Turns one RSS entry into a feed item whose content is taken from the
  * linked article page.
  *
  * The link and title come from the entry itself, the timestamp from the
  * "date" child in the entry's "dc" namespace. The content is the sanitized
  * inner markup of the first "article.article1" element of the linked page,
  * passed through HTMLSanitizer::defaultImageSrcTo() with the Acrimed base URL.
  *
  * @param mixed $newsItem RSS entry exposing link, title, getNameSpaces() and children().
  * @return Item The populated item.
  */
 protected function parseRSSItem($newsItem)
 {
     $sanitizer = new HTMLSanitizer();

     // Children of the entry that live in the "dc" namespace.
     $namespaceMap = $newsItem->getNameSpaces(true);
     $dcChildren = $newsItem->children($namespaceMap['dc']);

     $entry = new Item();
     $entry->uri = trim($newsItem->link);
     $entry->title = trim($newsItem->title);
     $entry->timestamp = strtotime($dcChildren->date);

     // Fetch the full article and keep only the main article element.
     $page = $this->file_get_html($newsItem->link);
     $body = $page->find('article.article1', 0);
     $cleaned = $sanitizer->sanitize($body->innertext);
     $entry->content = HTMLSanitizer::defaultImageSrcTo($cleaned, "http://www.acrimed.org/");

     return $entry;
 }
コード例 #3
0
ファイル: Gawker.php プロジェクト: Nomane/rss-bridge
 /**
  * Builds a feed item from one RSS entry and fills its content from the
  * cached copy of the linked article page.
  *
  * The content is the inner markup of the first ".post-content" element.
  * When a ".vcard" element is present, the inner markup of its first link
  * is stored as the item's name. If the cached page cannot be parsed, the
  * failure is reported through $this->message(), the cache entry for the
  * item's URI is removed and the item's content is set to the error text.
  *
  * @param mixed $newsItem RSS entry exposing link and title.
  * @return Item The populated item (always returned, even on failure).
  */
 protected function parseRSSItem($newsItem)
 {
     $item = new Item();
     $item->uri = trim($newsItem->link);
     $item->title = trim($newsItem->title);
     $item->timestamp = $this->RSS_2_0_time_to_timestamp($newsItem);
     try {
         // now load that uri from cache
         $articlePage = str_get_html($this->get_cached($item->uri));
         if (!is_object($articlePage)) {
             throw new Exception("cache content for " . $item->uri . " is NOT a Simple DOM parser object !");
         }
         $content = $articlePage->find('.post-content', 0);
         HTMLSanitizer::defaultImageSrcTo($content, $this->getURI());
         $vcard = $articlePage->find('.vcard', 0);
         if (is_object($vcard)) {
             $authorLink = $vcard->find('a', 0);
             $item->name = $authorLink->innertext;
             // TODO use author link href to fill the feed info
         }
         // I set item content as last element, for easier var_export reading
         $item->content = $content->innertext;
     } catch (Exception $e) {
         $this->message("obtaining " . $item->uri . " resulted in exception " . $e->getMessage() . ". Deleting cached page ...");
         // maybe file is incorrect. it should be discarded from cache
         // The key must be the "uri" property assigned above; "url" is never
         // set in this method, so using it left the bad entry in the cache.
         $this->remove_from_cache($item->uri);
         $item->content = $e->getMessage();
     }
     return $item;
 }
コード例 #4
0
ファイル: WorldOfTanks.php プロジェクト: ORelio/rss-bridge
 /**
  * Builds one item from a news link and appends it to $this->items.
  *
  * The item's URI is WORLD_OF_TANKS followed by the link's href. The page
  * for that URI is read through get_cached() and parsed; title, content and
  * timestamp are taken from its first ".l-content" element ("h1",
  * ".b-content" and the "data-timestamp" attribute of ".b-statistic_time").
  *
  * @param mixed $infoLink Link element exposing an href property.
  */
 public function parseLine($infoLink)
 {
     $entry = new Item();
     $entry->uri = WORLD_OF_TANKS . $infoLink->href;

     // Load the article through the cache and isolate its main container.
     $cachedMarkup = $this->get_cached($entry->uri);
     $dom = str_get_html($cachedMarkup);
     $main = $dom->find('.l-content', 0);
     HTMLSanitizer::defaultImageSrcTo($main, WORLD_OF_TANKS);

     $heading = $main->find('h1', 0);
     $entry->title = $heading->innertext;

     $body = $main->find('.b-content', 0);
     $entry->content = $body->innertext;

     $timeNode = $main->find('.b-statistic_time', 0);
     $entry->timestamp = $timeNode->getAttribute("data-timestamp");

     $this->items[] = $entry;
 }