Пример #1
0
 function Harvest()
 {
     $xml = get($this->url);
     //echo $xml;
     // Convert Mycobank RSS to JSON
     $xp = new XsltProcessor();
     $xsl = new DomDocument();
     $xsl->load('xsl/rss2.xsl');
     $xp->importStylesheet($xsl);
     // replace carriage returns and end of lines, which break JSON
     $xml = str_replace("\n", " ", $xml);
     $xml = str_replace("\r", " ", $xml);
     $xml_doc = new DOMDocument();
     $xml_doc->loadXML($xml);
     $json = $xp->transformToXML($xml_doc);
     //echo $json;
     $obj = json_decode($json);
     //print_r($obj);
     // Extract details
     foreach ($obj->items as $i) {
         $item = new stdclass();
         $item->links = array();
         $item->link = $i->link;
         $item->description = $i->description;
         $item->id = $i->link;
         // Mycobank ID
         $id = $i->link;
         $id = str_replace('http://www.mycobank.org/MycoTaxo.aspx?Link=T&Rec=', '', $id);
         // Dates
         if (preg_match('/<br>Date of deposit:\\s*(?<date>(.*))<br>/', $i->description, $matches)) {
             //print_r($matches);
             $item->created = date("Y-m-d H:i:s", strtotime($matches['date']));
             $item->updated = $item->created;
         }
         $have_literature = false;
         // Taxon name
         if (preg_match('/<a href=(.*)><i>(?<name>(.*))<\\/i><\\/a>/', $i->description, $matches)) {
             $item->title = $matches['name'];
             //print_r($matches);
             // Index Fungorum search...
             $url = 'http://www.indexfungorum.org/IXFWebService/Fungus.asmx/NameSearch?SearchText=' . str_replace(' ', '%20', $matches['name']) . '&AnywhereInText=false&MaxNumber=1';
             //echo $url . "\n";
             $xml = get($url);
             //echo $xml;
             $dom = new DOMDocument();
             $dom->loadXML($xml);
             $xpath = new DOMXPath($dom);
             $record_id = 0;
             $nodeCollection = $xpath->query("//NewDataSet/IndexFungorum/RECORD_x0020_NUMBER");
             foreach ($nodeCollection as $node) {
                 // We have this name in Index Fungorum...
                 $record_id = $node->firstChild->nodeValue;
                 $lsid = 'urn:lsid:indexfungorum.org:names:' . $record_id;
                 // Store LSID
                 array_push($item->links, array('lsid' => $lsid));
                 $item->description .= '<br/><a href="http://bioguid.info/' . $lsid . '">' . $lsid . '</a>';
                 // Get bibliographic details
                 $rdf = ResolveGuid($lsid);
                 //echo $rdf;
                 $url = 'http://bioguid.info/openurl?genre=article';
                 $d = new DOMDocument();
                 $d->loadXML($rdf);
                 $xpath = new DOMXPath($d);
                 $n = $xpath->query("//tpub:title");
                 foreach ($n as $n2) {
                     $url .= '&title=' . $n2->firstChild->nodeValue;
                 }
                 $n = $xpath->query("//tpub:volume");
                 foreach ($n as $n2) {
                     $url .= '&volume=' . $n2->firstChild->nodeValue;
                 }
                 $n = $xpath->query("//tpub:pages");
                 foreach ($n as $n2) {
                     $url .= '&pages=' . $n2->firstChild->nodeValue;
                 }
                 $n = $xpath->query("//tpub:year");
                 foreach ($n as $n2) {
                     $url .= '&date=' . $n2->firstChild->nodeValue;
                 }
                 $url .= '&display=json';
                 //echo $url;
                 $ref = json_decode(get($url));
                 if ($ref->status == 'ok') {
                     $have_literature = true;
                     if (isset($ref->doi)) {
                         array_push($item->links, array('doi' => $ref->doi));
                         $item->description .= '<br/><a href="http://dx.doi.org/' . $ref->doi . '">doi:' . $ref->doi . '</a>';
                     }
                     if (isset($ref->pmid)) {
                         array_push($item->links, array('pmid' => $ref->pmid));
                     }
                     if (isset($ref->hdl)) {
                         array_push($item->links, array('hdl' => $ref->hdl));
                         $item->description .= '<br/><a href="http://hdl.handle.net/' . $ref->hdl . '">doi:' . $ref->hdl . '</a>';
                     }
                     if (isset($ref->url)) {
                         array_push($item->links, array('url' => $ref->url));
                         $item->description .= '<br/><a href="' . $ref->url . '">' . $ref->url . '</a>';
                     }
                 } else {
                     // No joy...
                 }
             }
             /*				if (!$have_literature)
             				{
             					// Get from Mycobank, doesn't seem to work when I try and havest with CURL :(
             					echo "Get lit\n";
             					
             					// Get literature from Mycobank
             					
             					// 1. Get Mycobank web page
             					$url = $item->link;
             					$html = get($url);
             					
             					echo "html=" . $html . "\n";
             					echo "link=" . $item->link . "\n";
             					
             					if (preg_match("/<A href='MycoBiblio.aspx\?Link=T&Rec=(.*)'\s+/", $html, $matches))
             					{
             						print_r($matches);
             					}					
             					
             					
             				}
             */
             $this->StoreFeedItem($item);
         }
     }
 }
Пример #2
0
    function Harvest()
    {
        global $debug;
        //echo "|" . $this->url . "|";
        //$html = get($this->url);
        //echo $html;
        $url = 'http://www.ipni.org/ipni/advPlantNameSearch.do?find_family=' . $this->title . '&find_genus=&find_species=&find_infrafamily=&find_infragenus=&find_infraspecies=&find_authorAbbrev=&find_includePublicationAuthors=on&find_includePublicationAuthors=off&find_includeBasionymAuthors=on&find_includeBasionymAuthors=off&find_publicationTitle=&show_extras=on&find_geoUnit=&find_addedSince=' . ($d = date("Y-m-d", strtotime("now - 2 months")) . '&find_modifiedSince=&find_isAPNIRecord=on&find_isAPNIRecord=false&find_isGCIRecord=on&find_isGCIRecord=false&find_isIKRecord=on&find_isIKRecord=false&find_rankToReturn=all&output_format=delimited-minimal&find_sortByFamily=on&find_sortByFamily=off&query_type=by_query&back_page=plantsearch');
        //echo $url;
        $text = 'Id%Version%Family%Full name without family and authors%Authors
77096980-1%1.2%Begoniaceae%Begonia hekensis%D.C.Thomas
77097937-1%1.1%Begoniaceae%Begonia mysteriosa%L.Kollmann & A.P.Fontana
77096979-1%1.1%Begoniaceae%Begonia ozotothrix%D.C.Thomas';
        $text = get($url);
        $text = trim($text);
        /*$text='Id%Version%Family%Full name without family and authors%Authors
        60451177-2%1.1%Euphorbiaceae%Croton subgen. Geiseleria%(Klotzsch) A.Gray
        77097911-1%1.1%Euphorbiaceae%Croton pallidulus var. glabrus%L.R.Lima
        77097476-1%1.1%Euphorbiaceae%Euphorbia confinalis subsp. rhodesiaca%L.C.Leach
        77097491-1%1.1%Euphorbiaceae%Euphorbia maryrichardsiae%G.Will.
        77098208-1%1.2%Euphorbiaceae%Euphorbia ohiva%Swanepoel
        60451526-2%1.1.2.1%Euphorbiaceae%Luntia%Neck. ex Raf.';	*/
        if ($debug) {
            echo $url . "\n";
            echo $text . "\n";
        }
        // Get array of individual lines
        $lines = explode("\n", $text);
        // Extract headings from first line
        $parts = explode("%", $lines[0]);
        $size = count($parts);
        $heading = array();
        for ($i = 0; $i < $size; $i++) {
            $heading[$parts[$i]] = $i;
        }
        // Read each remaining line
        $size = count($lines);
        for ($i = 1; $i < $size; $i++) {
            $parts = explode("%", $lines[$i]);
            //print_r($parts);
            $item = new stdclass();
            //Add elements to the feed item
            $lsid = 'urn:lsid:ipni.org:names:' . $parts[$heading["Id"]];
            $item->title = $parts[$heading["Full name without family and authors"]];
            $item->id = $lsid;
            $item->link = 'http://www.ipni.org/ipni/idPlantNameSearch.do?id=' . $parts[$heading["Id"]];
            $item->description = '<i>' . $parts[$heading["Full name without family and authors"]] . '</i> ' . $parts[$heading["Authors"]];
            $item->description = str_replace('subsp.', '</i>subsp.<i>', $item->description);
            $item->description = str_replace('var.', '</i>var.<i>', $item->description);
            // Identifiers
            $item->links = array();
            // tag
            //  $parts[$heading["Family"]]
            // retrieve metadata...
            $rdf = ResolveGuid($lsid);
            // Fix IPNI bug
            $rdf = preg_replace('/ & /', ' &amp; ', $rdf);
            if ($debug) {
                echo $rdf;
            }
            //echo $rdf;
            // extract extra details...
            $dom = new DOMDocument();
            $dom->loadXML($rdf);
            $xpath = new DOMXPath($dom);
            $nodeCollection = $xpath->query("//tcom:publishedIn");
            foreach ($nodeCollection as $node) {
                $publishedIn = $node->firstChild->nodeValue;
                $item->description .= '<br/>' . $publishedIn;
                // Can we get any GUIDs for this...?
                $matches = array();
                if (parse_ipni_ref($publishedIn, $matches)) {
                    //print_r($matches);
                    // we parsed it OK, now find guid...
                    $url = 'http://bioguid.info/openurl/?genre=article';
                    $url .= '&title=' . urlencode($matches['journal']);
                    $url .= '&volume=' . $matches['volume'];
                    $url .= '&pages=' . $matches['page'];
                    $url .= '&display=json';
                    //echo $url;
                    $j = get($url);
                    $ref = json_decode($j);
                    //print_r($ref);
                    if ($ref->status == 'ok') {
                        if (isset($ref->doi)) {
                            array_push($item->links, array('doi' => $ref->doi));
                            $item->description .= '<br/><a href="http://dx.doi.org/' . $ref->doi . '">doi:' . $ref->doi . '</a>';
                        }
                        if (isset($ref->pmid)) {
                            array_push($item->links, array('pmid' => $ref->pmid));
                        }
                        if (isset($ref->hdl)) {
                            array_push($item->links, array('hdl' => $ref->hdl));
                            $item->description .= '<br/><a href="http://hdl.handle.net/' . $ref->hdl . '">doi:' . $ref->hdl . '</a>';
                        }
                        if (isset($ref->url)) {
                            array_push($item->links, array('url' => $ref->url));
                            $item->description .= '<br/><a href="' . $ref->url . '">' . $ref->url . '</a>';
                        }
                    } else {
                        // No guid found, but we did parse it OK...
                    }
                } else {
                    // Don't understand this reference at all...
                }
            }
            $nodeCollection = $xpath->query("//dcterms:created");
            foreach ($nodeCollection as $node) {
                $item->created = $node->firstChild->nodeValue;
            }
            $nodeCollection = $xpath->query("//dcterms:modified");
            foreach ($nodeCollection as $node) {
                $item->updated = $node->firstChild->nodeValue;
            }
            //print_r($item);
            $this->StoreFeedItem($item);
        }
    }