/**
 * Extracts the archive at $this->dwca_file, hands its Taxon, Document, Reference,
 * Agent and VernacularName rows to this connector's handlers, finalizes the
 * archive builder and removes the temporary extraction directory.
 *
 * @return false|null false when the archive exposes no taxon table fields;
 *                    otherwise nothing is returned.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;
    if (!($this->fields["taxa"] = $tables["http://rs.tdwg.org/dwc/terms/taxon"][0]->fields)) {
        debug("Invalid archive file. Program will terminate.");
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    // The Taxon rows are requested once per consumer, in the same order as before.
    self::build_taxa_rank_array($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'));
    self::create_instances_from_taxon_object($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'));
    self::get_objects($harvester->process_row_type('http://eol.org/schema/media/Document'));
    self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference'));
    self::get_agents($harvester->process_row_type('http://eol.org/schema/agent/Agent'));
    self::get_vernaculars($harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName'));
    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($temp_dir);
    echo "\n temporary directory removed: " . $temp_dir;
    print_r($this->debug);
}
/**
 * Extracts the archive named by $params["dwca_file"] and processes its occurrence
 * data according to $params["dataset"] ("NMNH" or "NHM"), then finalizes the
 * archive builder and removes the temporary extraction directory.
 *
 * @param array $params reads "dwca_file", "dataset" and, for the NHM dataset, "location";
 *                      the whole array is also passed to get_uris() and process_row_type().
 * @return false|null false when the archive has no occurrence table fields;
 *                    otherwise nothing is returned.
 */
function export_gbif_to_eol($params)
{
    $this->uris = self::get_uris($params);
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    // "expire_seconds" -- false => won't expire; 0 => expires now
    $paths = $func->extract_archive_file($params["dwca_file"], "meta.xml", array("timeout" => 7200, "expire_seconds" => false));
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $this->harvester = new ContentArchiveReader(NULL, $archive_path);
    if (!@$this->harvester->tables["http://rs.tdwg.org/dwc/terms/occurrence"][0]->fields) {
        debug("Invalid archive file. Program will terminate.");
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    if ($params["dataset"] == "NMNH") {
        self::process_row_type($params);
    } elseif ($params["dataset"] == "NHM") {
        // The NHM handler is given the path of the file inside the extraction directory.
        self::process_row_type_from_NHM($temp_dir . "/" . $params['location']);
    }
    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($temp_dir);
    echo "\n temporary directory removed: " . $temp_dir;
    print_r($this->debug);
}
/**
 * Extracts the archive named by $params["dwca_file"], feeds its occurrence rows
 * and Multimedia rows to this connector's handlers, finalizes the archive builder
 * and removes the temporary extraction directory.
 *
 * @param array $params reads "dwca_file".
 * @return false|null false when the archive has no occurrence table fields;
 *                    otherwise nothing is returned.
 */
function export_gbif_to_eol($params)
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($params["dwca_file"], "meta.xml");
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;
    if (!($this->fields["occurrence"] = $tables["http://rs.tdwg.org/dwc/terms/occurrence"][0]->fields)) {
        debug("Invalid archive file. Program will terminate.");
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    /* $harvester->process_row_type() - this will convert rows into array. */
    // $r = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/occurrence');
    // $r = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Multimedia');
    // print_r($r); exit;

    self::create_instances_from_taxon_object($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/occurrence'));
    self::get_media_objects($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Multimedia'));

    // Extensions currently not processed by this connector:
    // self::get_objects($harvester->process_row_type('http://eol.org/schema/media/Document'));
    // self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference'));
    // self::get_agents($harvester->process_row_type('http://eol.org/schema/agent/Agent'));
    // self::get_vernaculars($harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName'));

    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($temp_dir);
    echo "\n temporary directory removed: " . $temp_dir;
}
/**
 * Builds $this->taxa_id_list from the biogeodb and myspecies taxa lists, extracts
 * the file named by $params["eol_xml_file"], runs update_xml() against the
 * extraction directory and returns its result after deleting that directory.
 *
 * @param array $params reads "eol_xml_file" and "filename"; "path" is set to the
 *                      extraction directory before the array is passed to update_xml().
 * @return mixed whatever update_xml() returns.
 */
function process_xml($params)
{
    // Same call order as before: biogeodb first, then myspecies.
    $biogeodb_taxa = self::get_taxa_list_from_biogeodb();
    $myspecies_taxa = self::get_taxa_list_from_myspecies();
    $this->taxa_id_list = array_merge($biogeodb_taxa, $myspecies_taxa);

    require_library('connectors/INBioAPI');
    $extractor = new INBioAPI();
    $download_options = array("timeout" => 7200, "expire_seconds" => false);
    $extracted = $extractor->extract_archive_file($params["eol_xml_file"], $params["filename"], $download_options);
    print_r($extracted);

    $work_dir = $extracted["temp_dir"];
    $params["path"] = $work_dir;
    $updated_xml = self::update_xml($params);

    // remove temp dir
    recursive_rmdir($work_dir);
    return $updated_xml;
}
/**
 * Extracts the archive at $this->dwca_file and processes each extension that
 * yields rows (references, taxa, optionally occurrences, distributions, images,
 * descriptions, vernacular names), then finalizes the archive builder and removes
 * the temporary extraction directory.
 *
 * Occurrences are only processed when $this->params["process occurrence"] is truthy.
 *
 * @return false|null false when the archive exposes no taxon table fields;
 *                    otherwise nothing is returned.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;
    if (!($this->fields["taxa"] = $tables["http://rs.tdwg.org/dwc/terms/taxon"][0]->fields)) {
        debug("Invalid archive file. Program will terminate.");
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if (!empty($paths['temp_dir'])) {
            recursive_rmdir($paths['temp_dir']);
        }
        return false;
    }

    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference')) {
        self::get_references($records);
    }
    if ($records = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon')) {
        $taxa_id_list = self::get_taxa_id_list($records);
        self::create_instances_from_taxon_object($records, $taxa_id_list);
    }
    if ($this->params["process occurrence"]) {
        // The message is printed before the rows are read, as in the original.
        echo "\nProcessed OCCURRENCE\n";
        if ($records = $harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Occurrence')) {
            $this->uris = self::get_uris();
            self::get_occurrences($records);
        }
    }
    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Distribution')) {
        self::get_distributions($records);
    }
    // http://eol.org/content_partners/159/resources/345
    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Image')) {
        self::get_images($records);
    }
    // http://eol.org/content_partners/159/resources/332
    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/Description')) {
        self::get_descriptions($records);
    }
    if ($records = $harvester->process_row_type('http://rs.gbif.org/terms/1.0/VernacularName')) {
        self::get_vernaculars($records);
    }
    $this->archive_builder->finalize(TRUE);

    // remove temp dir
    recursive_rmdir($paths['temp_dir']);
    echo "\n temporary directory removed: " . $paths['temp_dir'];
    print_r($this->debug);
}
/**
 * Extracts $dwca_file, repairs doubled quotes in the anchor markup of its
 * media.txt, re-packs the archive directory as
 * CONTENT_RESOURCE_LOCAL_PATH/<resource_id>.tar.gz, copies the archive's files
 * into CONTENT_RESOURCE_LOCAL_PATH/<resource_id>/ and removes the extracted
 * archive directory.
 *
 * Nothing happens when the archive cannot be extracted; when media.txt cannot
 * be read only the cleanup runs.
 *
 * @param mixed  $resource_id used to name the .tar.gz and the target folder.
 * @param string $dwca_file   location of the archive to clean.
 * @return void
 */
function clean_media_extension($resource_id, $dwca_file)
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    if ($paths = $func->extract_archive_file($dwca_file, "meta.xml")) {
        print_r($paths);
        $archive_path = $paths['archive_path'];
        $media_file = $archive_path . "media.txt";

        if ($contents = Functions::get_remote_file($media_file, array('timeout' => 172800))) {
            // Applied in order, exactly like three chained str_ireplace() calls.
            $contents = str_ireplace(
                array('<a title=""', '"" href=""', '"">'),
                array('<a title="', '" href="', '">'),
                $contents
            );

            //saving new media.txt
            if (!($WRITE = fopen($media_file, "w"))) {
                debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $media_file);
                // Previously this return skipped the cleanup below and left the
                // extracted archive on disk.
                recursive_rmdir($archive_path);
                return;
            }
            fwrite($WRITE, $contents);
            fclose($WRITE);

            // remove the archive file e.g. plazi.zip
            // (only if it is actually there, so unlink() does not warn)
            $info = pathinfo($dwca_file);
            $downloaded_archive = $archive_path . $info["basename"];
            if (is_file($downloaded_archive)) {
                unlink($downloaded_archive);
            }

            // creating the archive file; arguments are escaped so paths containing
            // spaces or shell metacharacters cannot break or alter the command
            $command_line = "tar -czf " . escapeshellarg(CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".tar.gz")
                . " --directory=" . escapeshellarg($archive_path) . " .";
            shell_exec($command_line);

            // moving files to /resources/
            $dst = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . "/";
            recursive_rmdir(CONTENT_RESOURCE_LOCAL_PATH . $resource_id);
            if (!file_exists(CONTENT_RESOURCE_LOCAL_PATH . $resource_id)) {
                mkdir(CONTENT_RESOURCE_LOCAL_PATH . $resource_id);
            }
            $src = $archive_path;
            $files = glob($archive_path . "*.*");
            foreach ($files as $file) {
                $file_to_go = str_replace($src, $dst, $file);
                copy($file, $file_to_go);
            }
        }

        // remove temp dir
        // NOTE(review): sibling connectors remove $paths['temp_dir']; this one removes
        // 'archive_path' -- confirm the two are the same directory for this resource.
        recursive_rmdir($archive_path);
        echo "\n temporary directory removed: " . $archive_path;
    }
}
/**
 * Loads the URI (and optional citation) lookups, extracts the archive named by
 * $params["dwca_file"], processes its occurrence rows for the "GBIF" and
 * "iDigBio" datasets, finalizes the archive builder and removes the temporary
 * extraction directory.
 *
 * @param array $params reads "uri_file", "dwca_file", "dataset" and optionally
 *                      "citation_file"; "uri_type", "row_type" and "location" are
 *                      set locally before the array is passed on.
 * @return false|null false when the archive has no occurrence table fields;
 *                    otherwise nothing is returned.
 */
function export_gbif_to_eol($params)
{
    $this->uris = self::get_uris($params, $params["uri_file"]);
    // "uri_type" is switched only after the plain URI list has been loaded.
    $params["uri_type"] = "citation";
    if ($file = @$params["citation_file"]) {
        $this->citations = self::get_uris($params, $file);
    }

    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($params["dwca_file"], "meta.xml", array("timeout" => 7200, "expire_seconds" => 0));
    $archive_path = $paths['archive_path'];
    $temp_dir = $paths['temp_dir'];

    $this->harvester = new ContentArchiveReader(NULL, $archive_path);
    if (!@$this->harvester->tables["http://rs.tdwg.org/dwc/terms/occurrence"][0]->fields) {
        echo "\nInvalid archive file. Program will terminate.\n";
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if ($temp_dir) {
            recursive_rmdir($temp_dir);
        }
        return false;
    }

    // Both datasets were handled by two identical branches; they are merged here.
    if ($params["dataset"] == "GBIF" || $params["dataset"] == "iDigBio") {
        $params["row_type"] = "http://rs.tdwg.org/dwc/terms/occurrence";
        $params["location"] = "occurrence.txt";
        self::process_row_type($params);
    }

    /* old ways
    self::process_row_type(array("row_type" => 'http://rs.gbif.org/terms/1.0/Multimedia', "location" => "multimedia.txt"));
    self::create_instances_from_taxon_object($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/occurrence'));
    self::get_media_objects($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Multimedia'));
    self::get_objects($harvester->process_row_type('http://eol.org/schema/media/Document'));
    self::get_references($harvester->process_row_type('http://rs.gbif.org/terms/1.0/Reference'));
    */

    $this->archive_builder->finalize(TRUE);
    recursive_rmdir($temp_dir); // remove temp dir
    print_r($this->debug);
}
/**
 * Converts an EOL XML resource through convert_xml() and finalizes the archive builder.
 *
 * @param array $params      reads "eol_xml_file" (and "filename" for the archive
 *                           case); "path" (and "filename" for the plain-XML case)
 *                           are set here before convert_xml() is called.
 * @param bool  $xml_file_YN falsy: "eol_xml_file" is extracted with
 *                           extract_archive_file(); truthy: it is saved locally as
 *                           a plain XML file.
 * @return void
 */
function export_xml_to_archive($params, $xml_file_YN = false)
{
    if ($xml_file_YN) {
        // Plain XML file: save it locally, convert, then delete the local copy.
        $params['path'] = DOC_ROOT . "tmp/";
        //debug - cache should be 0 zero in normal operation
        $save_options = array('file_extension' => "xml", 'cache' => 0, "timeout" => 7200, "download_attempts" => 2, "delay_in_minutes" => 2);
        $local_xml_file = Functions::save_remote_file_to_local($params['eol_xml_file'], $save_options);
        $params['filename'] = pathinfo($local_xml_file, PATHINFO_BASENAME);
        self::convert_xml($params);
        $this->archive_builder->finalize(TRUE);
        unlink($local_xml_file);
        return;
    }

    // Archived XML: extract, convert from the extraction directory, then delete it.
    require_library('connectors/INBioAPI');
    $extractor = new INBioAPI();
    // "expire_seconds" -- false => won't expire; 0 => expires now
    $extract_options = array("timeout" => 7200, "expire_seconds" => 0);
    $extracted = $extractor->extract_archive_file($params["eol_xml_file"], $params["filename"], $extract_options);
    //debug print_r($extracted);
    $work_dir = $extracted["temp_dir"];
    $params["path"] = $work_dir;
    self::convert_xml($params);
    $this->archive_builder->finalize(TRUE);
    recursive_rmdir($work_dir); // remove temp dir
}
/**
 * Extracts the archive at $this->dwca_file, reads the row types listed in its
 * meta.xml and passes each row type's rows, together with the list of fields
 * allowed for it, to process_fields(). Finally the archive builder is finalized
 * and the temporary extraction directory removed.
 *
 * @return false|null false when the archive exposes no taxon table fields;
 *                    otherwise nothing is returned.
 */
function get_all_taxa()
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($this->dwca_file, "meta.xml");
    $archive_path = $paths['archive_path'];

    $harvester = new ContentArchiveReader(NULL, $archive_path);
    $tables = $harvester->tables;
    if (!($this->fields["taxa"] = $tables["http://rs.tdwg.org/dwc/terms/taxon"][0]->fields)) {
        debug("Invalid archive file. Program will terminate.");
        // Previously this path returned without cleaning up, leaving the extracted
        // files behind. Guarded so an empty path is never handed to recursive_rmdir().
        if (!empty($paths['temp_dir'])) {
            recursive_rmdir($paths['temp_dir']);
        }
        return false;
    }

    // NOTE(review): meta.xml is read from temp_dir while the reader uses archive_path --
    // confirm both point to the same directory for this resource.
    $row_types = self::get_XML_fields($paths["temp_dir"] . "meta.xml", "rowType");
    /* manual assignment, use this if some referenceID in Measurements don't exist in References.tab
    $row_types = array("http://eol.org/schema/reference/Reference", "http://rs.tdwg.org/dwc/terms/Taxon", "http://rs.tdwg.org/dwc/terms/MeasurementOrFact", "http://rs.tdwg.org/dwc/terms/Occurrence");
    */
    print_r($row_types);

    foreach ($row_types as $row_type) {
        $basename = pathinfo($row_type, PATHINFO_BASENAME);
        if ($basename == "Taxon") {
            $allowed_fields = array("taxonID", "scientificName", "parentNameUsageID", "kingdom", "phylum", "class", "order", "family", "genus", "taxonRank", "furtherInformationURL", "taxonomicStatus", "taxonRemarks", "namePublishedIn", "referenceID");
        } else {
            $allowed_fields = self::get_XML_fields($this->extensions[$basename], "property name");
        }

        //manual adjustment
        // Fix: this used to compare $row_type, which in the sample list above is a full
        // URI and so would never equal the bare name. $basename matches both a full URI
        // ending in /VernacularName and the bare name.
        if ($basename == "VernacularName") {
            $allowed_fields[] = "taxonID";
        }

        // e.g. self::process_fields($harvester->process_row_type('http://rs.tdwg.org/dwc/terms/Taxon'), "Taxon");
        self::process_fields($harvester->process_row_type($row_type), $basename, $allowed_fields);
    }
    $this->archive_builder->finalize(TRUE);

    recursive_rmdir($paths['temp_dir']); // remove temp dir
    echo "\n temporary directory removed: " . $paths['temp_dir'];
    print_r($this->debug);
}
namespace php_active_record;

/*
 * connector for INBio
 * Partner provides DWC-A file
 * estimated execution time: 18 minutes
 *
 * Flow: fetch all taxa from the archive, turn them into taxon XML, assign EOL
 * subjects, write the XML to the resource file and flag the resource for harvest.
 */
include_once dirname(__FILE__) . "/../../config/environment.php";

$timestart = time_elapsed();
require_library('connectors/INBioAPI');
$resource_id = 276;

// Alternative archive locations kept for local testing:
// $dwca_file = "http://localhost/~eolit/dwca-inbio-eol.zip"; //zip extracts directly to temp_dir
// $dwca_file = "http://localhost/~eolit/dwca_inbio.zip"; //zip extracts it within a folder inside temp_dir
// $dwca_file = "http://localhost/~eolit/dwca_inbio_small.zip";
// $dwca_file = "http://localhost/~eolit/dwca.tar.gz";
$dwca_file = "http://dl.dropbox.com/u/7597512/INBIO/dwca_inbio.zip";

$func = new INBioAPI();
if ($taxa = $func->get_all_taxa($dwca_file)) {
    $xml = $func->assign_eol_subjects(\SchemaDocument::get_taxon_xml($taxa));

    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $OUT = fopen($resource_path, "w");
    if (!$OUT) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
        // Ends the script here; the elapsed-time lines below are not printed.
        return;
    }
    fwrite($OUT, $xml);
    fclose($OUT);
    Functions::set_resource_status_to_force_harvest($resource_id);
}

$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_time_min = $elapsed_time_sec / 60;
echo "\nelapsed time = {$elapsed_time_sec} seconds";
echo "\nelapsed time = {$elapsed_time_min} minutes";
/*
 * connector for National Museum of Natural History Image Collection
 * estimated execution time: 1.5 mins.
 * Connector reads the XML provided by partner and
 * - sets the image rating.
 * - If needed ingests TypeInformation text dataObjects
 */
include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');

$timestart = time_elapsed();
$resource_id = 120;

// Default location: NMNH Invertebrate Zoology resource
$resource_path = "http://collections.mnh.si.edu/services/eol/nmnh-iz-response.xml.gz";

// A non-empty accesspoint_url stored for this resource overrides the default.
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
$row = $result->fetch_row();
$new_resource_path = $row[0];
if ($new_resource_path != '' && $resource_path != $new_resource_path) {
    $resource_path = $new_resource_path;
}
echo "\n processing resource:\n {$resource_path} \n\n";

$nmnh = new ResourceDataObjectElementsSetting($resource_id, $resource_path, 'http://purl.org/dc/dcmitype/StillImage', 2);
$xml = $nmnh->set_data_object_rating_on_xml_document();

require_library('connectors/INBioAPI');
$xml = INBioAPI::assign_eol_subjects($xml);

$nmnh->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_time_min = $elapsed_time_sec / 60;
$elapsed_time_hr = $elapsed_time_sec / 60 / 60;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";
echo "elapsed time = {$elapsed_time_min} minutes \n";
echo "elapsed time = {$elapsed_time_hr} hours \n";
echo "\n\n Done processing.";