/**
 * Run the processor over the fixture items with the supplied allowed-tags
 * setting and check the first item's intro and content against the
 * expected value.
 *
 * The same raw value is loaded into both the intro and the content, and the
 * same allowed-tags setting is applied to both, so a single expectation
 * covers the two fields.
 *
 * @dataProvider dataProvider
 *
 * @param mixed $expected    value the intro and the content must equal after processing
 * @param mixed $actual      raw value assigned to the first item's intro and content
 * @param mixed $allowedTags allowed-tags setting handed to the processor for both fields
 */
public function testProcess($expected, $actual, $allowedTags)
{
    // Arrange: seed the first fixture item with the raw input.
    $item = $this->items[0];
    $item->setIntro($actual);
    $item->setContent($actual);

    // Arrange: configure the processor identically for both fields.
    $processor = $this->processor;
    $processor->setAllowedTagsForContent($allowedTags);
    $processor->setAllowedTagsForIntro($allowedTags);

    // Act: the processor returns the item list, which replaces the fixture.
    $this->items = $processor->process($this->items);

    // Assert: both fields of the first returned item match the expectation.
    $processed = $this->items[0];
    $this->assertEquals($expected, $processed->getIntro());
    $this->assertEquals($expected, $processed->getContent());
}
/**
 * Scrape the RSS feed
 *
 * Downloads the document at $this->feed over HTTP, parses it as XML and
 * builds one associative array per <item> in the channel. The resulting
 * list is stored in $this->articles. Each row has the keys:
 *
 *  - "title"  : item title as a string
 *  - "date"   : DateTime built from pubDate, shifted to Australia/Melbourne
 *  - "source" : item link as a string
 *  - "blurb"  : first sentence of the first line of the content, tags stripped
 *  - "body"   : the content with the blurb text removed, trimmed
 *  - "topic"  : result of News::guessTopic() for the item category
 *               (or for the feed URL when the item has no category)
 *
 * @since Version 3.9
 * @throws Exception if $this->feed is not a string
 * @return \Railpage\News\Scraper
 */
public function fetch()
{
    if (!is_string($this->feed)) {
        throw new Exception("Cannot fetch news articles from RSS feed because no RSS feed was provided");
    }

    $articles = array();

    /**
     * Zend HTTP config
     */
    $config = array(
        'adapter' => 'Zend\\Http\\Client\\Adapter\\Curl',
        'curloptions' => array(CURLOPT_FOLLOWLOCATION => true)
    );

    $client = new Client($this->feed, $config);

    /**
     * Fetch the RSS feed
     */
    $response = $client->send();
    $feedBody = $response->getBody();

    /**
     * Load the SimpleXML object and the namespaces declared in the document
     */
    $xml = new SimpleXMLElement($feedBody);
    $ns = $xml->getNamespaces(true);

    // The target timezone is the same for every item, so build it once.
    $timezone = new DateTimeZone("Australia/Melbourne");

    /**
     * Loop through each RSS item and build an associative array of the data we need
     */
    foreach ($xml->channel->item as $item) {
        if (!empty($ns['content'])) {
            // Prefer the full text from <content:encoded> when the feed declares that namespace.
            $content = strval($item->children($ns['content'])->encoded);
        } else {
            // Fall back to <description>, keeping only images and links.
            // strip_tags() expects the allow-list as "<tag><tag>"; a bare
            // "img,a" would not match any tag and so would strip them all.
            $content = strip_tags(strval($item->description), "<img><a>");
        }

        // Round-trip through JSON to turn the category node(s) into a plain PHP value.
        $topic = json_decode(json_encode($item->category), true);

        if (empty($topic)) {
            $topic = $this->feed;
        }

        // The blurb is the first sentence of the first line, without markup.
        $lines = explode("\n", $content);
        $firstline = preg_replace('/([^?!.]*.).*/', '\\1', strip_tags($lines[0]));

        // The body is whatever remains once the blurb text is removed.
        $body = trim(str_replace($firstline, "", $content));

        $published = new DateTime(strval($item->pubDate));

        /**
         * Add this article to the list of news articles found in this scrape
         */
        $articles[] = array(
            "title" => strval($item->title),
            "date" => $published->setTimeZone($timezone),
            "source" => strval($item->link),
            "blurb" => $firstline,
            "body" => $body,
            "topic" => News::guessTopic($topic)
        );
    }

    $this->articles = $articles;

    return $this;
}