/**
 * Streams the extracted BOLDS XML file record by record and saves, per taxon,
 * the longest "COI-5P" nucleotide sequence found.
 *
 * For every <record> element that has sequences:
 *  - the taxon id is taken from the most specific rank that provides one
 *    (species, genus, subfamily, family, order, class, phylum, kingdom);
 *  - the longest trimmed COI-5P sequence of the record is selected;
 *  - the per-taxon entry keeps the longest sequence seen so far under "s" and
 *    accumulates under "c" the number of <sequence> elements (of any marker)
 *    of each record that contributed a COI-5P sequence.
 *
 * The result is written with self::save_to_json_file() to
 * $this->SAVED_SEQUENCES_FILE and the temporary XML file is removed.
 *
 * If the XML file cannot be opened nothing is saved, so a previously saved
 * sequences file is not overwritten with an empty result.
 *
 * @return void
 */
function save_dna_sequence_from_big_xml()
{
    echo "\n\n saving dna sequence from big xml file...\n";
    // from 212.php this file will always be re-created
    require_library('connectors/BoldsImagesAPIv2');
    $func = new BoldsImagesAPIv2();
    $path = $func->download_and_extract_remote_file();
    echo "\n\n {$path}";

    $reader = new \XMLReader();
    if (!$path || !$reader->open($path)) {
        echo "\n Cannot open XML file: [{$path}]\n";
        if ($path && file_exists($path)) {
            unlink($path);
        }
        return;
    }

    // Ranks ordered from most to least specific; the first one with a taxon_id wins.
    $ranks = array("species", "genus", "subfamily", "family", "order", "class", "phylum", "kingdom");
    $taxa_sequences = array();

    while (@$reader->read()) {
        if ($reader->nodeType != \XMLReader::ELEMENT || $reader->name != "record") {
            continue;
        }
        $xml = simplexml_load_string($reader->readOuterXML());
        if ($xml === false || !isset($xml->sequences->sequence)) {
            continue; // unparsable record, or record without sequences
        }

        $taxon_id = "";
        foreach ($ranks as $rank) {
            if (!isset($xml->taxonomy->{$rank}->taxon->taxon_id)) {
                continue;
            }
            $candidate = trim((string) $xml->taxonomy->{$rank}->taxon->taxon_id);
            if ($candidate) {
                $taxon_id = $candidate;
                break;
            }
        }
        if (!$taxon_id) {
            continue; // no usable taxon id: do not file the sequence under an empty key
        }

        $sequence_count = 0;
        $best_sequence = "";
        foreach ($xml->sequences->sequence as $sequence) {
            $sequence_count++;
            if ((string) $sequence->markercode != "COI-5P") {
                continue;
            }
            // compare trimmed lengths so surrounding whitespace cannot decide the winner
            $nucleotides = trim((string) $sequence->nucleotides);
            if (strlen($best_sequence) < strlen($nucleotides)) {
                $best_sequence = $nucleotides;
            }
        }
        if (!$best_sequence) {
            continue;
        }

        if (isset($taxa_sequences[$taxon_id])) {
            if (strlen($taxa_sequences[$taxon_id]["s"]) < strlen($best_sequence)) {
                $taxa_sequences[$taxon_id]["s"] = $best_sequence;
            }
            $taxa_sequences[$taxon_id]["c"] += $sequence_count;
        } else {
            $taxa_sequences[$taxon_id] = array("s" => $best_sequence, "c" => $sequence_count);
        }
    }

    // release the file handle before deleting the file
    $reader->close();
    self::save_to_json_file($taxa_sequences, $this->SAVED_SEQUENCES_FILE);
    unlink($path);
}
/**
 * Reads the resource XML and writes one "Associations" text media object per
 * text dataObject, together with its agent, through $this->archive_builder.
 *
 * Steps, as performed below:
 *  - the XML is fetched via BoldsImagesAPIv2::download_and_extract_remote_file()
 *    and Functions::lookup_with_cache();
 *  - every <taxon> having at least one dataObject is turned into an
 *    \eol_schema\Taxon and remembered in $this->taxa (first occurrence wins);
 *  - for every dataObject of type Text an \eol_schema\Agent is written once
 *    (tracked in $this->resource_agent_ids) and an \eol_schema\MediaResource is
 *    written unless its identifier was already seen ($this->media_ids), its
 *    source page is in the list of temporarily ignored pages, or its
 *    description is empty;
 *  - the temporary file at $path is removed at the end.
 *
 * @return void
 */
private function get_texts()
{
    require_library('connectors/BoldsImagesAPIv2');
    $path = BoldsImagesAPIv2::download_and_extract_remote_file($this->original_resource);

    /* very long text objects, temporarily ignored */
    $problematic_objects = array(
        "http://www.bioimages.org.uk/html/Betula.htm",
        "http://www.bioimages.org.uk/html/Broadleaved_trees.htm",
        "http://www.bioimages.org.uk/html/Fagus.htm",
        "http://www.bioimages.org.uk/html/Pinopsida.htm",
        "http://www.bioimages.org.uk/html/Poaceae.htm",
        "http://www.bioimages.org.uk/html/Quercus.htm",
        "http://www.bioimages.org.uk/html/Salix.htm",
        "http://www.bioimages.org.uk/html/Trees.htm"
    );

    $xml = false;
    $contents = Functions::lookup_with_cache($path, array('timeout' => 172800, 'download_attempts' => 2, 'delay_in_minutes' => 3));
    if ($contents) {
        $xml = simplexml_load_string($contents);
        if ($xml === false) {
            // content was retrieved but is not well-formed XML; nothing to process
            echo "\n Invalid XML: " . $this->original_resource;
        }
    } else {
        echo "\n Down: " . $this->original_resource;
    }

    if ($xml !== false) {
        $total = count($xml->taxon);
        $i = 0;
        foreach ($xml->taxon as $t) {
            $i++;
            echo "\n {$i} of {$total}";
            if (count($t->dataObject) == 0) {
                continue;
            }

            $t_dwc = $t->children("http://rs.tdwg.org/dwc/dwcore/");
            $t_dc = $t->children("http://purl.org/dc/elements/1.1/");
            $taxonID = (string) trim($t_dc->identifier);
            $source = self::clean_str("http://www.bioimages.org.uk/html/" . str_replace(" ", "_", Functions::canonical_form($t_dwc->ScientificName)) . ".htm");

            //---------------------------------
            $taxon = new \eol_schema\Taxon();
            $taxon->taxonID = $taxonID;
            $taxon->scientificName = $t_dwc->ScientificName;
            $taxon->kingdom = $t_dwc->Kingdom;
            $taxon->phylum = $t_dwc->Phylum;
            $taxon->class = $t_dwc->Class;
            $taxon->order = $t_dwc->Order;
            $taxon->family = $t_dwc->Family;
            $taxon->furtherInformationURL = $source;
            echo "\n {$taxon->taxonID} - {$taxon->scientificName} [{$source}]";
            if (isset($this->taxa[$taxonID])) {
                echo " -- already exists";
            } else {
                $this->taxa[$taxonID] = $taxon;
            }

            //---------------------------------
            foreach ($t->dataObject as $do) {
                if ($do->dataType != "http://purl.org/dc/dcmitype/Text") {
                    continue;
                }
                $t_dc2 = $do->children("http://purl.org/dc/elements/1.1/");
                $t_dcterms = $do->children("http://purl.org/dc/terms/");

                //---------------------------
                $agent_ids = array();
                $r = new \eol_schema\Agent();
                $r->term_name = $do->agent;
                // Hash the agent name together with its role attribute. The previous
                // "{$do->agent}['role']" only appended the literal text ['role'],
                // so agents differing only by role shared one identifier.
                $r->identifier = md5("{$do->agent}|{$do->agent['role']}");
                $r->agentRole = $do->agent['role'];
                $r->term_homepage = "http://www.bioimages.org.uk/index.htm";
                $agent_ids[] = $r->identifier;
                if (!in_array($r->identifier, $this->resource_agent_ids)) {
                    $this->resource_agent_ids[] = $r->identifier;
                    $this->archive_builder->write_object_to_file($r);
                }

                //---------------------------
                $text_identifier = self::clean_str($t_dc2->identifier);
                if (in_array($text_identifier, $this->media_ids)) {
                    continue;
                }
                $this->media_ids[] = $text_identifier;

                $mr = new \eol_schema\MediaResource();
                if ($agent_ids) {
                    $mr->agentID = implode("; ", $agent_ids);
                }
                $mr->taxonID = $taxonID;
                $mr->identifier = $text_identifier;
                $mr->type = (string) "http://purl.org/dc/dcmitype/Text"; //$do->dataType;
                $mr->language = "en";
                $mr->format = "text/html"; //$do->mimeType;
                $mr->furtherInformationURL = (string) trim($source);
                if (in_array($mr->furtherInformationURL, $problematic_objects)) {
                    continue;
                }
                $mr->CVterm = "http://rs.tdwg.org/ontology/voc/SPMInfoItems#Associations";
                $mr->Owner = "BioImages";
                $mr->title = "Associations";
                $mr->UsageTerms = "http://creativecommons.org/licenses/by-nc-sa/3.0/";
                // $mr->audience = 'Everyone';
                // $mr->accessURI = $source;
                $description = (string) $t_dc2->description;
                $description = trim(self::clean_str(utf8_encode($description)));
                if (!$description) {
                    continue; // objects without text are not written
                }
                $mr->description = $description;
                $this->archive_builder->write_object_to_file($mr);
            }
        }
    }

    unlink($path);
    echo "\n temporary XML file removed: [{$path}]\n";
}
namespace php_active_record;
/*
Connector for BOLDS --- higher-level taxa.
Estimated execution time: 15, 33, 30, 41 hours on a slow connection;
4.8 hours if API requests are already cached.
*/
include_once __DIR__ . "/../../config/environment.php";
$timestart = time_elapsed();
require_library('connectors/BoldsAPI');

// Make sure the BOLD working folder exists before the master list is generated.
$folder = DOC_ROOT . "update_resources/connectors/files/BOLD";
if (file_exists($folder) === false) {
    mkdir($folder, 0777);
}

// Generates the higher-level master list: <BOLD folder>/hl_master_list.txt
require_library('connectors/BoldsImagesAPIv2');
$func = new BoldsImagesAPIv2();
$func->generate_higher_level_taxa_list();

// Report how long the list generation took.
$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_minutes = $elapsed_time_sec / 60;
$elapsed_hours = $elapsed_minutes / 60;
echo "\n";
echo "\n elapsed time = ", $elapsed_minutes, " minutes";
echo "\n elapsed time = ", $elapsed_hours, " hours";
echo "\n Done generate_higher_level_taxa_list() \n\n";

// Any command-line argument switches the multiple-instance flag off.
$call_multiple_instance = !isset($argv[1]);

$resource_id = 81;
$bolds = new BoldsAPI();
$bolds->initialize_text_files();
$bolds->start_process($resource_id, false);
<?php
namespace php_active_record;
/*
BOLDS 2nd image resource: takes its data from the BOLDS big XML file,
excluding the images already provided by the original image resource.

Estimated execution time: 7.5 hours

As of 5-Aug-2014:
    http://rs.tdwg.org/dwc/terms/taxon:     Total: 99728
    http://eol.org/schema/agent/agent:      Total: 119
    http://purl.org/dc/dcmitype/StillImage: 90982
*/
include_once __DIR__ . "/../../config/environment.php";
require_library('connectors/BoldsImagesAPIv2');

$timestart = time_elapsed();
$resource_id = 546;

// Harvest all taxa, then finalize the DwC-A resource for this resource id.
$func = new BoldsImagesAPIv2($resource_id);
$func->get_all_taxa();
Functions::finalize_dwca_resource($resource_id);

// Report the total run time in minutes and in hours.
$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_minutes = $elapsed_time_sec / 60;
$elapsed_hours = $elapsed_minutes / 60;
echo "\n\n elapsed time = ", $elapsed_minutes, " minutes";
echo "\n elapsed time = ", $elapsed_hours, " hours";
echo "\n Done processing.\n";