Beispiel #1
0
 /**
  * Builds a per-taxon COI-5P sequence summary from the big XML file and saves it as JSON.
  *
  * The XML file is streamed <record> by <record>. For every record that has sequences:
  *   - the taxon id is taken from the first rank (species, genus, subfamily, family, order,
  *     class, phylum, kingdom) that yields a non-empty taxon_id;
  *   - the longest (trimmed) nucleotide string among its "COI-5P" sequences is selected.
  * Per taxon id the result keeps:
  *   "s" => the longest COI-5P sequence seen so far for that taxon id
  *   "c" => the running total of <sequence> elements (any marker code) of the records
  *          that contributed a COI-5P sequence
  * The result is handed to save_to_json_file() together with $this->SAVED_SEQUENCES_FILE,
  * then the XML file is deleted.
  *
  * If the XML file cannot be opened, a message is printed and the method returns without
  * touching the previously saved file.
  *
  * @return void
  */
 function save_dna_sequence_from_big_xml()
 {
     echo "\n\n saving dna sequence from big xml file...\n";
     // from 212.php this file will always be re-created
     require_library('connectors/BoldsImagesAPIv2');
     $func = new BoldsImagesAPIv2();
     $path = $func->download_and_extract_remote_file();
     echo "\n\n {$path}";
     $reader = new \XMLReader();
     // Reading from a reader that was never opened fails hard on PHP 8, so stop here instead.
     if (!$path || !$reader->open($path)) {
         echo "\n\n cannot open XML file: [{$path}]\n";
         return;
     }
     // Ranks are tried from the most specific to the most general one.
     $ranks = array("species", "genus", "subfamily", "family", "order", "class", "phylum", "kingdom");
     $taxa_sequences = array();
     // '@' kept on purpose: a malformed document simply ends the loop with what was read so far.
     while (@$reader->read()) {
         if ($reader->nodeType != \XMLReader::ELEMENT || $reader->name != "record") {
             continue;
         }
         $xml = simplexml_load_string($reader->readOuterXML());
         // Skip records that cannot be parsed or that carry no sequence at all.
         if ($xml === false || !@$xml->sequences->sequence) {
             continue;
         }
         // NOTE(review): when no rank provides a taxon_id, the record is keyed under "".
         $taxon_id = "";
         foreach ($ranks as $rank) {
             $taxon_id = trim((string) @$xml->taxonomy->{$rank}->taxon->taxon_id);
             if ($taxon_id) {
                 break;
             }
         }
         $sequence_count = 0;
         $best_sequence = "";
         foreach ($xml->sequences->sequence as $sequence) {
             $sequence_count++;
             if ($sequence->markercode != "COI-5P") {
                 continue;
             }
             // Trim before measuring so the stored value and the compared length always agree.
             $nucleotides = trim((string) $sequence->nucleotides);
             if (strlen($best_sequence) < strlen($nucleotides)) {
                 $best_sequence = $nucleotides;
             }
         }
         if (!$best_sequence) {
             continue;
         }
         if (isset($taxa_sequences[$taxon_id])) {
             if (strlen($taxa_sequences[$taxon_id]["s"]) < strlen($best_sequence)) {
                 $taxa_sequences[$taxon_id]["s"] = $best_sequence;
             }
             $taxa_sequences[$taxon_id]["c"] += $sequence_count;
         } else {
             $taxa_sequences[$taxon_id] = array("s" => $best_sequence, "c" => $sequence_count);
         }
     }
     // Release the file handle before deleting the file.
     $reader->close();
     self::save_to_json_file($taxa_sequences, $this->SAVED_SEQUENCES_FILE);
     unlink($path);
 }
Beispiel #2
0
 /**
  * Reads the resource XML and writes the text data objects (and their agents) to the archive.
  *
  * For every <taxon> that has at least one dataObject:
  *   - a Taxon object is built and stored in $this->taxa under its taxonID (first one wins);
  *   - for every dataObject of type "http://purl.org/dc/dcmitype/Text" an Agent is written
  *     (once per identifier, tracked in $this->resource_agent_ids) and a MediaResource is
  *     written (once per identifier, tracked in $this->media_ids), unless the object's
  *     source URL is in the list of temporarily ignored pages or its description is empty.
  * If the XML cannot be fetched or parsed, a "Down" message is printed instead.
  * The temporary XML file is removed at the end in either case.
  *
  * @return void
  */
 private function get_texts()
 {
     require_library('connectors/BoldsImagesAPIv2');
     $path = BoldsImagesAPIv2::download_and_extract_remote_file($this->original_resource);
     /* very long text objects, temporarily ignored */
     $problematic_objects = array("http://www.bioimages.org.uk/html/Betula.htm", "http://www.bioimages.org.uk/html/Broadleaved_trees.htm", "http://www.bioimages.org.uk/html/Fagus.htm", "http://www.bioimages.org.uk/html/Pinopsida.htm", "http://www.bioimages.org.uk/html/Poaceae.htm", "http://www.bioimages.org.uk/html/Quercus.htm", "http://www.bioimages.org.uk/html/Salix.htm", "http://www.bioimages.org.uk/html/Trees.htm");
     $contents = Functions::lookup_with_cache($path, array('timeout' => 172800, 'download_attempts' => 2, 'delay_in_minutes' => 3));
     // A document that cannot be parsed is treated like a failed download.
     $xml = $contents ? simplexml_load_string($contents) : false;
     if ($xml !== false) {
         $total = count($xml->taxon);
         $i = 0;
         foreach ($xml->taxon as $t) {
             $i++;
             echo "\n {$i} of {$total}";
             $do_count = sizeof($t->dataObject);
             if ($do_count <= 0) {
                 continue;
             }
             $t_dwc = $t->children("http://rs.tdwg.org/dwc/dwcore/");
             $t_dc = $t->children("http://purl.org/dc/elements/1.1/");
             $taxonID = (string) trim($t_dc->identifier);
             $source = self::clean_str("http://www.bioimages.org.uk/html/" . str_replace(" ", "_", Functions::canonical_form($t_dwc->ScientificName)) . ".htm");
             //---------------------------------
             $taxon = new \eol_schema\Taxon();
             $taxon->taxonID = $taxonID;
             $taxon->scientificName = $t_dwc->ScientificName;
             $taxon->kingdom = $t_dwc->Kingdom;
             $taxon->phylum = $t_dwc->Phylum;
             $taxon->class = $t_dwc->Class;
             $taxon->order = $t_dwc->Order;
             $taxon->family = $t_dwc->Family;
             $taxon->furtherInformationURL = $source;
             echo "\n {$taxon->taxonID} - {$taxon->scientificName} [{$source}]";
             if (isset($this->taxa[$taxonID])) {
                 echo " -- already exists";
             } else {
                 $this->taxa[$taxonID] = $taxon;
             }
             //---------------------------------
             foreach ($t->dataObject as $do) {
                 if ($do->dataType != "http://purl.org/dc/dcmitype/Text") {
                     continue;
                 }
                 $t_dc2 = $do->children("http://purl.org/dc/elements/1.1/");
                 $t_dcterms = $do->children("http://purl.org/dc/terms/");
                 //---------------------------
                 $agent_ids = array();
                 $r = new \eol_schema\Agent();
                 $r->term_name = $do->agent;
                 // The role attribute must be interpolated inside the braces; otherwise the
                 // literal text "['role']" is hashed and agents that differ only by role collide.
                 $r->identifier = md5("{$do->agent}|{$do->agent['role']}");
                 $r->agentRole = $do->agent['role'];
                 $r->term_homepage = "http://www.bioimages.org.uk/index.htm";
                 $agent_ids[] = $r->identifier;
                 if (!in_array($r->identifier, $this->resource_agent_ids)) {
                     $this->resource_agent_ids[] = $r->identifier;
                     $this->archive_builder->write_object_to_file($r);
                 }
                 //---------------------------
                 $text_identifier = self::clean_str($t_dc2->identifier);
                 if (in_array($text_identifier, $this->media_ids)) {
                     continue;
                 }
                 $this->media_ids[] = $text_identifier;
                 $mr = new \eol_schema\MediaResource();
                 if ($agent_ids) {
                     $mr->agentID = implode("; ", $agent_ids);
                 }
                 $mr->taxonID = $taxonID;
                 $mr->identifier = $text_identifier;
                 $mr->type = (string) "http://purl.org/dc/dcmitype/Text";
                 //$do->dataType;
                 $mr->language = "en";
                 $mr->format = "text/html";
                 //$do->mimeType;
                 $mr->furtherInformationURL = (string) trim($source);
                 // Checked only after the agent and media id were recorded, as before.
                 if (in_array($mr->furtherInformationURL, $problematic_objects)) {
                     continue;
                 }
                 $mr->CVterm = "http://rs.tdwg.org/ontology/voc/SPMInfoItems#Associations";
                 $mr->Owner = "BioImages";
                 $mr->title = "Associations";
                 $mr->UsageTerms = "http://creativecommons.org/licenses/by-nc-sa/3.0/";
                 // $mr->audience       = 'Everyone';
                 // $mr->accessURI      = $source;
                 // NOTE(review): utf8_encode() is deprecated as of PHP 8.2.
                 $description = trim(self::clean_str(utf8_encode((string) $t_dc2->description)));
                 if (!$description) {
                     continue;
                 }
                 $mr->description = $description;
                 $this->archive_builder->write_object_to_file($mr);
             }
         }
     } else {
         echo "\n Down: " . $this->original_resource;
     }
     unlink($path);
     echo "\n temporary XML file removed: [{$path}]\n";
 }
Beispiel #3
0
namespace php_active_record;

/*
 * Connector for BOLDS --- higher-level taxa.
 * Estimated execution time: 15, 33, 30, 41 hours on a slow connection,
 *                           4.8 hours if API requests are already cached.
 */
include_once __DIR__ . "/../../config/environment.php";
$timestart = time_elapsed();
require_library('connectors/BoldsAPI');

// Create the BOLD working folder on first run.
$folder = DOC_ROOT . "update_resources/connectors/files/BOLD";
if (file_exists($folder) === false) {
    mkdir($folder, 0777);
}

// Step 1: this generates DOC_ROOT . "/update_resources/connectors/files/BOLD/hl_master_list.txt"
require_library('connectors/BoldsImagesAPIv2');
$func = new BoldsImagesAPIv2();
$func->generate_higher_level_taxa_list();

// Report how long the list generation took, in minutes and in hours.
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n",
    "\n elapsed time = ", $elapsed_time_sec / 60, " minutes",
    "\n elapsed time = ", $elapsed_time_sec / 60 / 60, " hours",
    "\n Done generate_higher_level_taxa_list() \n\n";

// True when the script is started without a first command-line argument.
// NOTE(review): this flag is not passed to start_process() below, which receives a literal false.
$call_multiple_instance = !isset($argv[1]);

// Step 2: run the BOLDS connector for resource 81.
$resource_id = 81;
$bolds = new BoldsAPI();
$bolds->initialize_text_files();
$bolds->start_process($resource_id, false);
Beispiel #4
0
<?php

namespace php_active_record;

/*
 * BOLDS 2nd image resource: gets the data from the BOLDS big XML file, excluding the images
 * that are already part of the original image resource.
 * Estimated execution time: 7.5 hours
 *
 * As of 5-Aug-2014:
 *   http://rs.tdwg.org/dwc/terms/taxon:     Total: 99728
 *   http://eol.org/schema/agent/agent:      Total: 119
 *   http://purl.org/dc/dcmitype/StillImage: 90982
 */
include_once __DIR__ . "/../../config/environment.php";
require_library('connectors/BoldsImagesAPIv2');
$timestart = time_elapsed();

// Build the archive for resource 546, then finalize it.
$resource_id = 546;
$func = new BoldsImagesAPIv2($resource_id);
$func->get_all_taxa();
Functions::finalize_dwca_resource($resource_id);

// Report the total run time in minutes and in hours.
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n\n elapsed time = ", $elapsed_time_sec / 60, " minutes",
    "\n elapsed time = ", $elapsed_time_sec / 60 / 60, " hours",
    "\n Done processing.\n";