/**
 * Extracts the archive in $this->dwca_file, passes each supported row type to its
 * handler, finalizes the output archive and removes the extraction directory.
 *
 * Handlers run in this order: taxa rank array, taxa, media documents, references,
 * agents, vernacular names.
 *
 * @return false|null false when the archive has no usable Taxon table; otherwise
 *                    there is no explicit return value.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;

    // The table is looked up under the lower-cased row type. Guard the lookup so a
    // missing table takes the "invalid archive" path without undefined-index warnings.
    $taxon_key = "http://rs.tdwg.org/dwc/terms/taxon";
    $this->fields["taxa"] = isset($tables[$taxon_key][0]) ? $tables[$taxon_key][0]->fields : NULL;
    if (!$this->fields["taxa"]) {
        debug("Invalid archive file. Program will terminate.");
        // Do not leave the extracted files behind when bailing out early.
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    // The Taxon rows are read twice on purpose: once to build the rank array and
    // once to create the taxon instances.
    self::build_taxa_rank_array($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'));
    self::create_instances_from_taxon_object($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'));
    self::get_objects($harvester->process_row_type('http://eol.org/schema/media/Document'));
    self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference'));
    self::get_agents($harvester->process_row_type('http://eol.org/schema/agent/Agent'));
    self::get_vernaculars($harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName'));
    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($temp_dir);
    echo "\n temporary directory removed: " . $temp_dir;
    print_r($this->debug);
}
/**
 * Extracts the archive named in $params["dwca_file"], converts its occurrence and
 * multimedia rows through the class handlers, finalizes the output archive and
 * removes the extraction directory.
 *
 * @param array $params must contain the key "dwca_file".
 * @return false|null false when the archive has no usable occurrence table;
 *                    otherwise there is no explicit return value.
 */
function export_gbif_to_eol($params)
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($params["dwca_file"], "meta.xml");
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;

    // Guard the lookup so a missing occurrence table takes the "invalid archive"
    // path without undefined-index warnings.
    $occurrence_key = "http://rs.tdwg.org/dwc/terms/occurrence";
    $this->fields["occurrence"] = isset($tables[$occurrence_key][0]) ? $tables[$occurrence_key][0]->fields : NULL;
    if (!$this->fields["occurrence"]) {
        debug("Invalid archive file. Program will terminate.");
        // Do not leave the extracted files behind when bailing out early.
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    /* $harvester->process_row_type() - this will convert rows into array. */
    // Debugging aid, disabled:
    // $r = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/occurrence');
    // $r = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Multimedia');
    // print_r($r); exit;

    self::create_instances_from_taxon_object($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/occurrence'));
    self::get_media_objects($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Multimedia'));

    // Row types that are not processed for this export (left disabled):
    // self::get_objects($harvester->process_row_type('http://eol.org/schema/media/Document'));
    // self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference'));
    // self::get_agents($harvester->process_row_type('http://eol.org/schema/agent/Agent'));
    // self::get_vernaculars($harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName'));

    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($temp_dir);
    echo "\n temporary directory removed: " . $temp_dir;
}
/**
 * Runs process_fields() over every table of a resource's local archive, skipping
 * the vernacular-name table.
 *
 * @param mixed  $resource_id    appended to CONTENT_RESOURCE_LOCAL_PATH to locate the archive folder.
 * @param string $file_extension not read by this method; kept for interface compatibility.
 */
function check_unique_ids($resource_id, $file_extension = ".tab")
{
    $reader = new ContentArchiveReader(NULL, CONTENT_RESOURCE_LOCAL_PATH . $resource_id . "/");
    $row_types = array_keys($reader->tables);

    // Disabled alternative: also exclude measurementorfact
    // $row_types = array_diff($row_types, array("http://rs.tdwg.org/dwc/terms/measurementorfact"));

    // exclude vernacular name
    $excluded = array("http://rs.gbif.org/terms/1.0/vernacularname");
    $row_types = array_diff($row_types, $excluded);
    print_r($row_types);

    foreach ($row_types as $row_type) {
        // The last path segment of the row type URI is handed over as the class label.
        self::process_fields($reader->process_row_type($row_type), pathinfo($row_type, PATHINFO_BASENAME));
    }
}
/**
 * Extracts the archive in $this->dwca_file and runs process_fields() over every
 * rowType listed in its meta.xml, restricting each to an allowed field list, then
 * finalizes the output archive and removes the extraction directory.
 *
 * @return false|null false when the archive has no usable Taxon table; otherwise
 *                    there is no explicit return value.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;

    // Guard the lookup so a missing Taxon table takes the "invalid archive" path
    // without undefined-index warnings.
    $taxon_key = "http://rs.tdwg.org/dwc/terms/taxon";
    $this->fields["taxa"] = isset($tables[$taxon_key][0]) ? $tables[$taxon_key][0]->fields : NULL;
    if (!$this->fields["taxa"]) {
        debug("Invalid archive file. Program will terminate.");
        // Do not leave the extracted files behind when bailing out early.
        if ($paths['temp_dir']) {
            recursive_rmdir($paths['temp_dir']);
        }
        return false;
    }

    $row_types = self::get_XML_fields($paths["temp_dir"] . "meta.xml", "rowType");
    /* manual assignment, use this if some referenceID in Measurements don't exist in References.tab
    $row_types = array("http://eol.org/schema/reference/Reference", "http://rs.tdwg.org/dwc/terms/Taxon", "http://rs.tdwg.org/dwc/terms/MeasurementOrFact", "http://rs.tdwg.org/dwc/terms/Occurrence");
    */
    print_r($row_types);

    foreach ($row_types as $row_type) {
        $basename = pathinfo($row_type, PATHINFO_BASENAME);
        if ($basename == "Taxon") {
            $allowed_fields = array("taxonID", "scientificName", "parentNameUsageID", "kingdom", "phylum", "class", "order", "family", "genus", "taxonRank", "furtherInformationURL", "taxonomicStatus", "taxonRemarks", "namePublishedIn", "referenceID");
        } else {
            $allowed_fields = self::get_XML_fields($this->extensions[$basename], "property name");
        }

        // manual adjustment
        // $row_type is a full rowType URI, so it can never equal the bare name;
        // the basename is what has to be compared for this adjustment to apply.
        if ($basename == "VernacularName") {
            if (!is_array($allowed_fields) || !in_array("taxonID", $allowed_fields, true)) {
                $allowed_fields[] = "taxonID";
            }
        }

        // e.g. self::process_fields($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'), "Taxon");
        self::process_fields($harvester->process_row_type($row_type), $basename, $allowed_fields);
    }

    $this->archive_builder->finalize(TRUE);
    recursive_rmdir($paths['temp_dir']); // remove temp dir
    echo "\n temporary directory removed: " . $paths['temp_dir'];
    print_r($this->debug);
}
/**
 * Extracts the archive in $this->dwca_file and passes each row type that yields
 * records to its handler, then finalizes the output archive and removes the
 * extraction directory.
 *
 * Occurrence rows are only handled when $this->params["process occurrence"] is truthy.
 *
 * @return false|null false when the archive has no usable Taxon table; otherwise
 *                    there is no explicit return value.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;

    // Guard the lookup so a missing Taxon table takes the "invalid archive" path
    // without undefined-index warnings.
    $taxon_key = "http://rs.tdwg.org/dwc/terms/taxon";
    $this->fields["taxa"] = isset($tables[$taxon_key][0]) ? $tables[$taxon_key][0]->fields : NULL;
    if (!$this->fields["taxa"]) {
        debug("Invalid archive file. Program will terminate.");
        // Do not leave the extracted files behind when bailing out early.
        if ($paths['temp_dir']) {
            recursive_rmdir($paths['temp_dir']);
        }
        return false;
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference')) {
        self::get_references($records);
    }

    if ($records = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon')) {
        $taxa_id_list = self::get_taxa_id_list($records);
        self::create_instances_from_taxon_object($records, $taxa_id_list);
    }

    // !empty() keeps the same truthiness test but tolerates the option being unset.
    if (!empty($this->params["process occurrence"])) {
        echo "\nProcessed OCCURRENCE\n";
        if ($records = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Occurrence')) {
            $this->uris = self::get_uris();
            self::get_occurrences($records);
        }
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Distribution')) {
        self::get_distributions($records);
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Image')) {
        self::get_images($records); //http://eol.org/content_partners/159/resources/345
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Description')) {
        self::get_descriptions($records); //http://eol.org/content_partners/159/resources/332
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName')) {
        self::get_vernaculars($records);
    }

    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($paths['temp_dir']);
    echo "\n temporary directory removed: " . $paths['temp_dir'];
    print_r($this->debug);
}
/**
 * Merges the per-batch working archives of a resource into one archive by running
 * process_fields() over every table of every batch, then finalizes the builder.
 *
 * @param int   $batches     number of batch folders; they are numbered from 1.
 * @param mixed $resource_id used to build both the batch folder names and the target folder name.
 * @return false|null false as soon as a batch has no usable Taxon table (the builder
 *                    is not finalized in that case); otherwise no explicit return value.
 */
private function combine_all_temp_archives($batches, $resource_id)
{
    $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . '/' . $resource_id . '_working/';
    $builder_options = array('directory_path' => $this->path_to_archive_directory);
    $this->archive_builder = new \eol_schema\ContentArchiveBuilder($builder_options);

    $batch_no = 1;
    while ($batch_no <= $batches) {
        $dir = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . "_" . $batch_no . "_working/";
        $reader = new ContentArchiveReader(NULL, $dir);
        $tables = $reader->tables;

        // take note the index key is all lower case
        $taxon_fields = $tables["http://rs.tdwg.org/dwc/terms/taxon"][0]->fields;
        $this->fields["taxa"] = $taxon_fields;
        if (!$taxon_fields) {
            return false;
        }

        foreach (array_keys($tables) as $row_type) {
            self::process_fields($reader->process_row_type($row_type), pathinfo($row_type, PATHINFO_BASENAME));
        }

        /* debug - uncomment in normal operation
        //delete temp dir and file.tar.gz
        if(is_dir($dir)) recursive_rmdir($dir);
        $file = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . "_" . $i . "_working.tar.gz";
        if(is_file($file)) unlink($file);
        */

        $batch_no++;
    }

    $this->archive_builder->finalize(TRUE);
}
/**
 * Reads the comma-separated file at $this->text_path["obis"]["ranges_OBIS"] line by line.
 *
 * The first line is taken as the header: it is stored in $this->column_labels and
 * a label => position map is built in $this->column_indices. Every later line is
 * passed to process_line_data(). A progress line is echoed every 1000 lines.
 */
public function read_data()
{
    $this->column_labels = array();
    $this->column_indices = array();

    $source = new FileIterator($this->text_path["obis"]["ranges_OBIS"]);
    foreach ($source as $row_number => $raw_line) {
        if ($row_number % 1000 == 0) {
            echo "{$row_number} :: " . time_elapsed() . " :: " . memory_get_usage() . "\n";
        }

        $cells = ContentArchiveReader::line_to_array($raw_line, ",", "\"");

        if ($row_number != 0) {
            $this->process_line_data($cells);
            continue;
        }

        // Header line: remember the labels and where each one sits.
        $this->column_labels = $cells;
        foreach ($cells as $position => $label) {
            $this->column_indices[$label] = $position;
        }
    }
}
/**
 * Builds the taxa list from the archive in DOC_ROOT . "temp/dwca_iabin".
 *
 * Images, references and vernacular names are collected through their handlers and
 * looked up by taxonID; Plinian Core rows are grouped by taxonID. Each Taxon row is
 * then enriched with those and passed to get_iabin_taxa().
 *
 * @return array|false the accumulated taxa, or false when the archive has no usable
 *                     Plinian Core table.
 */
public static function get_all_taxa()
{
    self::$MAPPINGS = self::assign_mappings();
    $all_taxa = array();
    $final_taxa = array();
    $used_collection_ids = array();

    $harvester = new ContentArchiveReader(NULL, DOC_ROOT . "temp/dwca_iabin");
    $tables = $harvester->tables;

    // Guard the lookup so a missing core table takes the "invalid archive" path
    // without undefined-index warnings.
    $core_key = "http://www.pliniancore.org/plic/pcfcore/pliniancore2.3";
    $GLOBALS['fields'] = isset($tables[$core_key][0]) ? $tables[$core_key][0]->fields : NULL;
    if (!$GLOBALS['fields']) {
        debug("Invalid archive file. Program will terminate.");
        return false;
    }

    $images = self::get_images($harvester->process_row_type('http://rs.gbif.org/terms/1.0/image'));
    $references = self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/reference'));
    $vernacular_names = self::get_vernacular_names($harvester->process_row_type('http://rs.gbif.org/terms/1.0/vernacularname'));

    // Group the Plinian Core rows by taxon; one taxon may own several rows.
    $taxon_media = array();
    $media = $harvester->process_row_type('http://www.pliniancore.org/plic/pcfcore/PlinianCore2.3');
    foreach ($media as $m) {
        $taxon_id = $m['http://rs.tdwg.org/dwc/terms/taxonID'];
        @($taxon_media[$taxon_id][] = $m);
    }

    $taxa = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon');
    $i = 0;
    $total = sizeof($taxa);
    foreach ($taxa as $taxon) {
        $i++;
        debug(" {$i} of {$total}");
        $taxon_id = @$taxon['http://rs.tdwg.org/dwc/terms/taxonID'];
        $taxon["id"] = $taxon_id;
        $taxon["image"] = @$images[$taxon_id];
        $taxon["reference"] = @$references[$taxon_id];
        $taxon["vernacular_name"] = @$vernacular_names[$taxon_id];
        // A taxon without Plinian Core rows gets NULL instead of an undefined-index notice.
        $taxon["media"] = isset($taxon_media[$taxon_id]) ? $taxon_media[$taxon_id] : NULL;

        $arr = self::get_iabin_taxa($taxon, $used_collection_ids);
        $page_taxa = $arr[0];
        $used_collection_ids = $arr[1];

        // do in batches to speed it up.
        if ($page_taxa) {
            $all_taxa = array_merge($all_taxa, $page_taxa);
        }
        // ">=" rather than "==": one merge can add several entries and step over
        // exactly 1000, after which an equality test would never flush the buffer.
        if (count($all_taxa) >= 1000) {
            $final_taxa = array_merge($final_taxa, $all_taxa);
            $all_taxa = array();
        }
    }

    // last writes
    $final_taxa = array_merge($final_taxa, $all_taxa);
    return $final_taxa;
}
/**
 * Builds the resource archive from two comma-separated sources.
 *
 * Pass 1 reads self::TAXA_URL: each data line yields a taxon (add_taxon) and an
 * occurrence (add_occurrence), the latter stored in $this->taxon_occurrences under
 * the taxon's taxonID. Pass 2 reads self::DUMP_URL and hands each data line to
 * process_line_data(). In both passes the first line is the header and rebuilds
 * $this->column_labels / $this->column_indices. The builder is finalized at the end.
 */
public function build_archive()
{
    $this->path_to_archive_directory = CONTENT_RESOURCE_LOCAL_PATH . "/{$this->resource_id}/";
    $builder_options = array('directory_path' => $this->path_to_archive_directory);
    $this->archive_builder = new \eol_schema\ContentArchiveBuilder($builder_options);

    // Pass 1: taxa and their occurrences.
    $this->column_labels = array();
    $this->column_indices = array();
    $this->taxon_occurrences = array();
    foreach (new FileIterator(self::TAXA_URL) as $row_number => $raw_line) {
        if ($row_number % 1000 == 0) {
            echo "{$row_number} :: " . time_elapsed() . " :: " . memory_get_usage() . "\n";
        }
        $cells = ContentArchiveReader::line_to_array($raw_line, ",", "\"");
        if ($row_number != 0) {
            $taxon = $this->add_taxon($cells);
            $this->taxon_occurrences[$taxon->taxonID] = $this->add_occurrence($cells, $taxon);
            continue;
        }
        // Header line: remember the labels and where each one sits.
        $this->column_labels = $cells;
        foreach ($cells as $position => $label) {
            $this->column_indices[$label] = $position;
        }
    }

    // Pass 2: the dump file, with its own header.
    $this->column_labels = array();
    $this->column_indices = array();
    foreach (new FileIterator(self::DUMP_URL) as $row_number => $raw_line) {
        if ($row_number % 1000 == 0) {
            echo "{$row_number} :: " . time_elapsed() . " :: " . memory_get_usage() . "\n";
        }
        $cells = ContentArchiveReader::line_to_array($raw_line, ",", "\"");
        if ($row_number != 0) {
            $this->process_line_data($cells);
            continue;
        }
        $this->column_labels = $cells;
        foreach ($cells as $position => $label) {
            $this->column_indices[$label] = $position;
        }
    }

    $this->archive_builder->finalize(true);
}
/**
 * Extracts the given archive and builds the taxa list from it.
 *
 * Images, references and vernacular names are collected through their handlers and
 * looked up by taxonID; one Plinian Core row is kept per taxonID (a later row
 * replaces an earlier one). Each Taxon row is enriched with those and passed to
 * get_inbio_taxa(). The extraction directory is removed before returning.
 *
 * @param string $dwca_file archive location handed to extract_archive_file().
 * @return array|false the accumulated taxa, or false when the archive has no usable
 *                     Plinian Core table.
 */
function get_all_taxa($dwca_file)
{
    self::$MAPPINGS = self::assign_mappings();
    $all_taxa = array();
    $used_collection_ids = array();

    $paths = self::extract_archive_file($dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;

    // Guard the lookup so a missing core table takes the "invalid archive" path
    // without undefined-index warnings.
    $core_key = "http://www.pliniancore.org/plic/pcfcore/pliniancore2.3";
    $GLOBALS['fields'] = isset($tables[$core_key][0]) ? $tables[$core_key][0]->fields : NULL;
    if (!$GLOBALS['fields']) {
        debug("Invalid archive file. Program will terminate.");
        // Do not leave the extracted files behind when bailing out early.
        if ($temp_dir) {
            shell_exec("rm -fr " . escapeshellarg($temp_dir));
        }
        return false;
    }

    $images = self::get_images($harvester->process_row_type('http://rs.gbif.org/terms/1.0/image'));
    $references = self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/reference'));
    $vernacular_names = self::get_vernacular_names($harvester->process_row_type('http://rs.gbif.org/terms/1.0/vernacularname'));

    $taxon_media = array();
    $media = $harvester->process_row_type('http://www.pliniancore.org/plic/pcfcore/PlinianCore2.3');
    foreach ($media as $m) {
        @($taxon_media[$m['http://rs.tdwg.org/dwc/terms/taxonID']] = $m);
    }

    $taxa = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon');
    $i = 0;
    $total = sizeof($taxa);
    foreach ($taxa as $taxon) {
        $i++;
        debug("{$i} of {$total}");
        $taxon_id = @$taxon['http://rs.tdwg.org/dwc/terms/taxonID'];
        $taxon["id"] = $taxon_id;
        $taxon["image"] = @$images[$taxon_id];
        $taxon["reference"] = @$references[$taxon_id];
        $taxon["vernacular_name"] = @$vernacular_names[$taxon_id];
        // A taxon without a Plinian Core row gets NULL instead of an undefined-index notice.
        $taxon["media"] = isset($taxon_media[$taxon_id]) ? $taxon_media[$taxon_id] : NULL;

        $arr = self::get_inbio_taxa($taxon, $used_collection_ids);
        $page_taxa = $arr[0];
        $used_collection_ids = $arr[1];
        if ($page_taxa) {
            $all_taxa = array_merge($all_taxa, $page_taxa);
        }
    }

    // remove tmp dir
    // The path is quoted so that spaces or shell metacharacters in it cannot alter the command.
    if ($temp_dir) {
        shell_exec("rm -fr " . escapeshellarg($temp_dir));
    }
    return $all_taxa;
}