/**
 * @brief Fetch a uBio RSS feed, and convert it to an object for ease of processing
 *
 * The feed is fetched with GetRSS() (called with its third argument set to true;
 * a 304 result is reported as "feed unchanged"). On success the raw RSS is
 * archived as tmp/<Y-m-d>/<md5 of URL>.xml next to this file, transformed to
 * JSON with xsl/ubiorss.xsl, and the decoded JSON is stored in $data.
 * A one-line status message is always echoed.
 *
 * @param url Feed URL
 * @param data Receives the decoded JSON (null if the feed could not be
 *             transformed/decoded); only written when the fetch result is 0
 *
 * @return Result from RSS fetch (0 is OK, 304 is feed unchanged, anything else is an error)
 */
function ubio_fetch_rss($url, &$data)
{
    $rss = '';
    $msg = '200';

    $result = GetRSS($url, $rss, true);

    if ($result == 0) {
        // Archive the raw feed in a per-day directory
        $dir = dirname(__FILE__) . '/tmp/' . date("Y-m-d");
        if (!is_dir($dir)) {
            // umask is cleared so the 0777 mode is applied as given. The
            // directory is created recursively: a plain mkdir() fails when
            // the tmp/ parent does not exist yet, and the fopen() below
            // would then abort the script.
            $oldumask = umask(0);
            mkdir($dir, 0777, true);
            umask($oldumask);
        }

        $rss_file_name = $dir . '/' . md5($url) . '.xml';
        $rss_file = fopen($rss_file_name, "w+") or die("could't open file --\"{$rss_file_name}\"");
        fwrite($rss_file, $rss);
        fclose($rss_file);

        // Convert to JSON
        $xsl = new DOMDocument();
        $xsl->load(dirname(__FILE__) . '/xsl/ubiorss.xsl');

        $xp = new XSLTProcessor();
        $xp->importStylesheet($xsl);

        // An empty body would make loadXML() throw on PHP 8, and a failed
        // parse/transform yields no JSON; in all of those cases the caller
        // simply gets null, as it would from decoding empty output.
        $data = null;
        $xml_doc = new DOMDocument();
        if ($rss !== '' && $xml_doc->loadXML($rss)) {
            $json = $xp->transformToXML($xml_doc);
            if (is_string($json)) {
                $data = json_decode($json);
            }
        }
    } else {
        switch ($result) {
            case 304:
                $msg = 'Feed has not changed since last fetch (' . $result . ')';
                break;

            default:
                $msg = 'Badness happened (' . $result . ') ' . $url;
                break;
        }
    }

    echo $msg, "\n";

    return $result;
}
die("failed [" . __LINE__ . "]: " . $sql); } } } } return $result; } if (0) { // test // $url = "http://localhost/~rpage/ants/rss/Formicidae.rss"; $url = 'http://www.connotea.org/rss/tag/phylogeny'; // $url = 'http://names.ubio.org/rss/rss_feed.php?username=rdmpage&rss1=1'; $url = 'http://taxacom.markmail.org/atom/'; $rss = ''; $msg = ''; $result = GetRSS($url, &$rss, true); if ($result == 0) { //echo $rss; if ($result == 0) { $msg = 'OK'; } } else { switch ($result) { case 304: $msg = 'Feed has not changed since last fetch (' . $result . ')'; break; default: $msg = 'Badness happened (' . $result . ')'; break; } }
/**
 * @brief Harvest article links from a fixed list of journal feeds
 *
 * For every feed URL: fetch it with GetRSS() and echo the fetch result. When
 * the result is 0 the raw feed is echoed, line breaks and known trailing or
 * leading junk are stripped, the feed is parsed as XML, and the item links are
 * collected (RSS 2.0 and RSS 1.0 layouts). Each link is then either passed to
 * parse_nuytsia() (Nuytsia links) or looked up through the bioguid OpenURL
 * resolver with get(), and the decoded response is printed.
 *
 * All progress and results are written with echo / print_r.
 */
function main()
{
    // NOTE(review): journal/118902517 is listed twice below — presumably
    // accidental; left as-is so the set of fetches is unchanged.
    $feeds = array(
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=harv',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=novi',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=afzo',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=mobt',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=cara',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=esaa',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=brvo',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=brit',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=amnb',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=acta',
        'http://www.bioone.org/action/showFeed?type=etoc&feed=rss&jc=jzoo',
        'http://api.ingentaconnect.com/content/rbsb/bjb/latest?format=rss',
        'http://api.ingentaconnect.com/content/iapt/tax/latest?format=rss',
        'http://api.ingentaconnect.com/content/schweiz/novh/latest?format=rss',
        'http://api.ingentaconnect.com/content/nhn/pimj/latest?format=rss',
        'http://api.ingentaconnect.com/content/rssa/trssa/latest?format=rss',
        'http://api.ingentaconnect.com/content/esa/jme/latest?format=rss',
        'http://api.ingentaconnect.com/content/esa/aesa/latest?format=rss',
        'http://www.scielo.br/rss.php?pid=0085-562620090001&lang=en',
        'http://www.scielo.br/rss.php?pid=0100-8404&lang=en',
        'http://www.scielo.br/rss.php?pid=0074-0276&lang=en',
        'http://www.scielo.br/rss.php?pid=0031-1049&lang=en',
        'http://www.scielo.br/rss.php?pid=1519-566X&lang=en',
        'http://www.scielo.br/rss.php?pid=0101-817520080004&lang=en',
        'http://www.scielo.br/rss.php?pid=1679-6225&lang=en',
        'http://www.scielo.br/rss.php?pid=0102-330620080004&lang=en',
        'http://www3.interscience.wiley.com/rss/journal/118902517',
        'http://www3.interscience.wiley.com/rss/journal/118902517',
        'http://www3.interscience.wiley.com/rss/journal/118506135',
        'http://www3.interscience.wiley.com/rss/journal/117964631',
        'http://www.publish.csiro.au/RSS_Feed/CSIRO_Publishing_Recent_SB.xml',
        'http://www.publish.csiro.au/RSS_Feed/CSIRO_Publishing_Recent_IS.xml',
        'http://rss.sciencedirect.com/publication/science/6963',
        'http://www.springerlink.com/content/0165-5752?sortorder=asc&export=rss',
        'http://www.akademiai.com/content/jw080595p305/?sortorder=asc&export=rss',
        'http://www.hindawi.com/journals/psyche/rss.xml',
        'http://science.dec.wa.gov.au/nuytsia/nuytsia.rss.xml',
        'http://pensoftonline.net/zookeys/index.php/journal/gateway/plugin/WebFeedGatewayPlugin/rss'
    );

    // Junk known to surround some feeds, as pattern => replacement.
    $cleanups = array(
        // Clean up Zookeys ATOM feed
        '/<\\/feed><br(.*)$/' => '</feed>',
        // Clean up Zookeys RSS1 feed
        '/<\\/rdf:RDF><br(.*)$/' => '</rdf:RDF>',
        // Clean up Zotero ATOM feed
        '/^Content-Type: application\\/atom\\+xml/' => '',
    );

    // Prefixes used by the XPath queries below
    $namespaces = array(
        "rdf"      => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
        "content"  => "http://purl.org/rss/1.0/modules/content/",
        "rss"      => "http://purl.org/rss/1.0/",
        "slash"    => "http://purl.org/rss/1.0/modules/slash/",
        "dcterms"  => "http://purl.org/dc/terms/",
        "dc"       => "http://purl.org/dc/elements/1.1/",
        "atom"     => "http://www.w3.org/2005/Atom",
    );

    // Where item links live, per feed flavour
    $link_queries = array(
        "//rss/channel/item/link",      // Is it RSS 2.0?
        "//rdf:RDF/rss:item/rss:link",  // Is it RSS 1.0?
    );

    foreach ($feeds as $url) {
        // Start from an empty buffer so a fetch that writes nothing can never
        // leave the previous feed's body behind.
        $rss = '';
        $result = GetRSS($url, $rss, true);
        echo $result . "\n";
        //exit();

        if ($result != 0) {
            continue;
        }

        echo $rss;

        // Process
        $rss = str_replace(array("\n", "\r"), '', $rss);

        foreach ($cleanups as $pattern => $replacement) {
            $cleaned = preg_replace($pattern, $replacement, $rss);
            // preg_replace() gives null on a PCRE error; keep the text untouched then
            if ($cleaned !== null) {
                $rss = $cleaned;
            }
        }

        // Extract links (feed-type specific)
        $links = array();

        $dom = new DOMDocument();
        // An empty body makes loadXML() throw on PHP 8, and an unparseable one
        // has no links to offer; either way move on to the next feed instead
        // of aborting the whole harvest.
        if ($rss === '' || !$dom->loadXML($rss)) {
            echo "Could not parse feed {$url}\n";
            continue;
        }
        $xpath = new DOMXPath($dom);

        // Add namespaces to XPath to ensure our queries work
        foreach ($namespaces as $prefix => $uri) {
            $xpath->registerNamespace($prefix, $uri);
        }

        foreach ($link_queries as $xpath_query) {
            $nodeCollection = $xpath->query($xpath_query);
            foreach ($nodeCollection as $node) {
                // textContent is safe on an empty <link/> (firstChild would be
                // null there); blank links are dropped rather than resolved.
                $link = trim($node->textContent);
                if ($link !== '') {
                    $links[] = $link;
                }
            }
        }

        // Add to bioguid via OpenURL
        print_r($links);

        foreach ($links as $link) {
            echo "Link={$link}\n";

            // Journal-specific handling

            // Nuytsia is complicated as link is a database query that may return
            // multiple papers (e.g., same author may have > 1 paper in a volume)
            if (preg_match('/http:\\/\\/science.dec.wa.gov.au\\/nuytsia\\//', $link)) {
                parse_nuytsia($link);
                continue;
            }

            // Default, link is URL of a single article...
            $openurl = "http://bioguid.info/openurl?id=" . urlencode($link) . "&display=json";
            $json = get($openurl);
            $obj = json_decode($json);
            print_r($obj);
        }

        /*
        // Nope, is it RSS 1?
        if ($feed_title == '')
        {
            // RSS 1.0
            $xpath_query = "//rss:channel/rss:title";
            $nodeCollection = $xpath->query ($xpath_query);
            foreach($nodeCollection as $node)
            {
                $feed_title = $node->firstChild->nodeValue;
                $feed_type = 'rss';
                $feed_version = 'RSS1';

                $nc = $xpath->query ('//rss:channel/rss:description');
                foreach($nc as $n)
                {
                    $feed_description = $n->firstChild->nodeValue;
                }
                $nc = $xpath->query ('//rss:channel/rss:link');
                foreach($nc as $n)
                {
                    $feed_link = $n->firstChild->nodeValue;
                }
            }
        }

        // Nope, is it ATOM?
        if ($feed_title == '')
        {
            // Atom
            $xpath_query = "//atom:feed/atom:title";
            $nodeCollection = $xpath->query ($xpath_query);
            foreach($nodeCollection as $node)
            {
                $feed_title = $node->firstChild->nodeValue;
                $feed_type = 'atom';
            }
            $xpath_query = "//atom:feed/atom:subtitle";
            $nodeCollection = $xpath->query ($xpath_query);
            foreach($nodeCollection as $node)
            {
                $feed_description = $node->firstChild->nodeValue;
            }

            // Link with rel="self" attribute
            // <link rel="self" href="http://api.flickr.com/services/feeds/groups_pool.gne?id=806927@N20&lang=en-us&format=atom" />
            $xpath_query = "//atom:feed/atom:link[@rel='self']/@href";
            $nodeCollection = $xpath->query ($xpath_query);
            foreach($nodeCollection as $node)
            {
                $feed_link = $node->firstChild->nodeValue;
            }

            // Link with no 'rel' attribute
            if ($feed_link == '')
            {
                $xpath_query = "//atom:feed/atom:link/@href";
                $nodeCollection = $xpath->query ($xpath_query);
                foreach($nodeCollection as $node)
                {
                    $feed_link = $node->firstChild->nodeValue;
                }
            }
        }
        */
    }
}