/**
 * Scans the big XML file obtained through BoldsImagesAPIv2 and saves, per
 * taxon, the longest COI-5P nucleotide sequence plus a running sequence count.
 *
 * Each <record> element is parsed on its own, so the whole document is never
 * loaded at once. The taxon id is taken from the most specific rank that has
 * one (species first, kingdom last). The collected data has the shape
 *   array(taxon_id => array("s" => longest COI-5P sequence,
 *                           "c" => number of <sequence> elements, of any
 *                                  marker, in the records that contributed))
 * and is passed to save_to_json_file() with $this->SAVED_SEQUENCES_FILE.
 * The downloaded XML file is deleted at the end.
 *
 * Records without a usable taxon id are skipped instead of being stored
 * under an empty-string key.
 */
function save_dna_sequence_from_big_xml()
{
    echo "\n\n saving dna sequence from big xml file...\n";
    // from 212.php this file will always be re-created
    require_library('connectors/BoldsImagesAPIv2');
    $func = new BoldsImagesAPIv2();
    $path = $func->download_and_extract_remote_file();
    echo "\n\n {$path}";

    // Ranks probed for a taxon id, most specific first.
    $ranks = array("species", "genus", "subfamily", "family", "order", "class", "phylum", "kingdom");
    $taxa_sequences = array();

    $reader = new \XMLReader();
    if ($path && $reader->open($path)) {
        // '@' keeps a malformed tail of the file from flooding the output;
        // read() then returns false and the loop simply stops.
        while (@$reader->read()) {
            if ($reader->nodeType != \XMLReader::ELEMENT || $reader->name != "record") {
                continue;
            }

            $xml = simplexml_load_string($reader->readOuterXML());
            if ($xml === false || !@$xml->sequences->sequence) {
                continue; // unparseable record, or a record without sequences
            }

            // Pick the longest COI-5P sequence of this record. Lengths are
            // compared on the trimmed text, which is also what gets stored.
            $i = 0;
            $best_sequence = "";
            foreach ($xml->sequences->sequence as $sequence) {
                $i++;
                if ((string) $sequence->markercode !== "COI-5P") {
                    continue;
                }
                $nucleotides = trim((string) $sequence->nucleotides);
                if (strlen($best_sequence) < strlen($nucleotides)) {
                    $best_sequence = $nucleotides;
                }
            }
            if (!$best_sequence) {
                continue;
            }

            // First rank (most specific) that carries a taxon id wins.
            // '@' is needed because a missing rank makes the chained
            // property access run through a non-existent node.
            $taxon_id = "";
            foreach ($ranks as $rank) {
                $taxon_id = trim((string) @$xml->taxonomy->{$rank}->taxon->taxon_id);
                if ($taxon_id) {
                    break;
                }
            }
            if ($taxon_id === "") {
                continue; // nothing to key the sequence on
            }

            if (isset($taxa_sequences[$taxon_id])) {
                if (strlen($taxa_sequences[$taxon_id]["s"]) < strlen($best_sequence)) {
                    $taxa_sequences[$taxon_id]["s"] = $best_sequence;
                }
                $taxa_sequences[$taxon_id]["c"] += $i;
            } else {
                $taxa_sequences[$taxon_id] = array("s" => $best_sequence, "c" => $i);
            }
        }
        $reader->close();
    } else {
        echo "\n Cannot open XML file: [{$path}]\n";
    }

    // Saved even when nothing was collected, as before.
    self::save_to_json_file($taxa_sequences, $this->SAVED_SEQUENCES_FILE);

    if ($path && is_file($path)) {
        unlink($path);
    }
}
/**
 * Reads the resource XML and writes its text data objects to the archive.
 *
 * For every <taxon> that has data objects:
 *  - a \eol_schema\Taxon is built and remembered in $this->taxa (first one
 *    per taxonID wins);
 *  - for each data object of type http://purl.org/dc/dcmitype/Text an
 *    \eol_schema\Agent is written once per distinct agent identifier, and an
 *    "Associations" \eol_schema\MediaResource is written once per distinct
 *    text identifier, unless its page is in the skip list or its
 *    description is empty.
 *
 * The temporary XML file is removed at the end.
 */
private function get_texts()
{
    require_library('connectors/BoldsImagesAPIv2');
    $path = BoldsImagesAPIv2::download_and_extract_remote_file($this->original_resource);

    /* very long text objects, temporarily ignored */
    $problematic_objects = array(
        "http://www.bioimages.org.uk/html/Betula.htm",
        "http://www.bioimages.org.uk/html/Broadleaved_trees.htm",
        "http://www.bioimages.org.uk/html/Fagus.htm",
        "http://www.bioimages.org.uk/html/Pinopsida.htm",
        "http://www.bioimages.org.uk/html/Poaceae.htm",
        "http://www.bioimages.org.uk/html/Quercus.htm",
        "http://www.bioimages.org.uk/html/Salix.htm",
        "http://www.bioimages.org.uk/html/Trees.htm",
    );

    $contents = Functions::lookup_with_cache($path, array('timeout' => 172800, 'download_attempts' => 2, 'delay_in_minutes' => 3));
    $xml = $contents ? simplexml_load_string($contents) : false;

    if (!$contents) {
        echo "\n Down: " . $this->original_resource;
    } elseif ($xml === false) {
        echo "\n Unparseable XML: " . $this->original_resource;
    } else {
        $total = count($xml->taxon);
        $i = 0;
        foreach ($xml->taxon as $t) {
            $i++;
            echo "\n {$i} of {$total}";
            if (count($t->dataObject) == 0) {
                continue;
            }

            $t_dwc = $t->children("http://rs.tdwg.org/dwc/dwcore/");
            $t_dc = $t->children("http://purl.org/dc/elements/1.1/");
            $taxonID = trim((string) $t_dc->identifier);
            $source = self::clean_str("http://www.bioimages.org.uk/html/" . str_replace(" ", "_", Functions::canonical_form($t_dwc->ScientificName)) . ".htm");

            //---------------------------------
            $taxon = new \eol_schema\Taxon();
            $taxon->taxonID = $taxonID;
            $taxon->scientificName = $t_dwc->ScientificName;
            $taxon->kingdom = $t_dwc->Kingdom;
            $taxon->phylum = $t_dwc->Phylum;
            $taxon->class = $t_dwc->Class;
            $taxon->order = $t_dwc->Order;
            $taxon->family = $t_dwc->Family;
            $taxon->furtherInformationURL = $source;
            echo "\n {$taxon->taxonID} - {$taxon->scientificName} [{$source}]";
            if (isset($this->taxa[$taxonID])) {
                echo " -- already exists";
            } else {
                $this->taxa[$taxonID] = $taxon;
            }

            //---------------------------------
            foreach ($t->dataObject as $do) {
                if ($do->dataType != "http://purl.org/dc/dcmitype/Text") {
                    continue;
                }
                $t_dc2 = $do->children("http://purl.org/dc/elements/1.1/");

                //---------------------------
                // The identifier hashes agent name AND role. The previous
                // string "{$do->agent}|{$do->agent}['role']" appended the
                // literal text ['role'], so the role never took part.
                $agent_name = (string) $do->agent;
                $agent_role = (string) $do->agent['role'];
                $r = new \eol_schema\Agent();
                $r->term_name = $do->agent;
                $r->identifier = md5("{$agent_name}|{$agent_role}");
                $r->agentRole = $do->agent['role'];
                $r->term_homepage = "http://www.bioimages.org.uk/index.htm";
                // Strict check: two md5 strings that both look numeric
                // (e.g. "0e...") would otherwise compare equal.
                if (!in_array($r->identifier, $this->resource_agent_ids, true)) {
                    $this->resource_agent_ids[] = $r->identifier;
                    $this->archive_builder->write_object_to_file($r);
                }

                //---------------------------
                $text_identifier = self::clean_str($t_dc2->identifier);
                if (in_array($text_identifier, $this->media_ids)) {
                    continue;
                }
                $this->media_ids[] = $text_identifier;

                $mr = new \eol_schema\MediaResource();
                $mr->agentID = $r->identifier;
                $mr->taxonID = $taxonID;
                $mr->identifier = $text_identifier;
                $mr->type = "http://purl.org/dc/dcmitype/Text"; //$do->dataType;
                $mr->language = "en";
                $mr->format = "text/html"; //$do->mimeType;
                $mr->furtherInformationURL = trim((string) $source);
                if (in_array($mr->furtherInformationURL, $problematic_objects, true)) {
                    continue;
                }
                $mr->CVterm = "http://rs.tdwg.org/ontology/voc/SPMInfoItems#Associations";
                $mr->Owner = "BioImages";
                $mr->title = "Associations";
                $mr->UsageTerms = "http://creativecommons.org/licenses/by-nc-sa/3.0/";
                // $mr->audience = 'Everyone';
                // $mr->accessURI = $source;

                // NOTE(review): utf8_encode() is deprecated since PHP 8.2 and
                // SimpleXML normally hands back UTF-8 already - confirm that
                // this re-encoding is still wanted for this resource.
                $description = trim(self::clean_str(utf8_encode((string) $t_dc2->description)));
                if (!$description) {
                    continue;
                }
                $mr->description = $description;
                $this->archive_builder->write_object_to_file($mr);
            }
        }
    }

    // Only remove (and report) a file that is really there.
    if ($path && is_file($path)) {
        unlink($path);
        echo "\n temporary XML file removed: [{$path}]\n";
    }
}