/**
 * Post-processes the LifeDesk EOL XML and writes it out as a resource, then gzips it.
 *
 * Reads the XML from $this->text_path["eol_xml"], passes it through the dc:source
 * replacement helpers (taxon level and dataObject level, with an empty replacement
 * value), runs remove_tags_in_references() on it, saves it through
 * ResourceDataObjectElementsSetting and finally creates "<name>.xml.gz" next to it.
 *
 * @param string $lifedesk_name  name used both as the resource identifier and as the
 *                               base name of the XML file under CONTENT_RESOURCE_LOCAL_PATH
 * @return void
 */
private function update_eol_xml($lifedesk_name)
{
    /*
    taxon = 434
    dwc:ScientificName = 434
    reference = 614
    synonym = 68
    commonName = 2
    dataObjects = 1705
    reference = 0
    texts = 1146
    images = 559
    videos = 0
    sounds = 0
    */
    require_library('ResourceDataObjectElementsSetting');
    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $lifedesk_name . ".xml";
    $func = new ResourceDataObjectElementsSetting($lifedesk_name, $resource_path);
    $xml = file_get_contents($this->text_path["eol_xml"]);
    $xml = $func->replace_taxon_element_value("dc:source", "replace any existing value", "", $xml, false);
    $xml = $func->replace_data_object_element_value("dc:source", "replace any existing value", "", $xml, false);
    $xml = self::remove_tags_in_references($xml);
    $func->save_resource_document($xml);

    // zip the xml
    // Both paths are escaped so that a name containing spaces or shell metacharacters
    // cannot break (or inject into) the command line.
    $command_line = "gzip -c " . escapeshellarg($resource_path) . " >" . escapeshellarg($resource_path . ".gz");
    shell_exec($command_line);
}
/**
 * One-off clean-up for resource 346.
 *
 * Loads the XML of the hard-coded 346.xml.gz resource, hands it to
 * remove_data_object_of_certain_element_value() with mimeType "image/x-adobe-dng",
 * and saves whatever that helper returns as the resource document.
 *
 * @return void
 */
function remove_dataObject()
{
    require_library('ResourceDataObjectElementsSetting');

    $resource_id = 346;
    $setting = new ResourceDataObjectElementsSetting(
        $resource_id,
        "http://localhost/eol_php_code/applications/content_server/resources/346.xml.gz"
    );

    $original_xml = $setting->load_xml_string();
    $filtered_xml = $setting->remove_data_object_of_certain_element_value(
        "mimeType",
        "image/x-adobe-dng",
        $original_xml
    );
    $setting->save_resource_document($filtered_xml);
}
<?php
namespace php_active_record;
/* connector for Royal Botanic Garden Edinburgh: Herbarium Specimen Images (E)
estimated execution time:

A published version of this resource already exists and is set to 'import once'.
This connector only rewrites the local 336.xml: it asks ResourceDataObjectElementsSetting
to set the rating (2) for StillImage data objects and then flags the resource for harvest.
*/

include_once dirname(__FILE__) . "/../../config/environment.php";

$timestart = time_elapsed();
$resource_id = 336;

//--------------
/* set rating to 2 */
require_library('ResourceDataObjectElementsSetting');
$func = new ResourceDataObjectElementsSetting(
    $resource_id,
    CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml",
    'http://purl.org/dc/dcmitype/StillImage',
    2
);
$func->save_resource_document($func->set_data_object_rating_on_xml_document());
//--------------

Functions::set_resource_status_to_force_harvest($resource_id);

// report the run time in seconds, minutes and hours
$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_time_min = $elapsed_time_sec / 60;
$elapsed_time_hr = $elapsed_time_sec / 60 / 60;
echo "\n",
    "elapsed time = {$elapsed_time_sec} seconds \n",
    "elapsed time = " . $elapsed_time_min . " minutes \n",
    "elapsed time = " . $elapsed_time_hr . " hours \n",
    "\n\n Done processing.";
namespace php_active_record;
/* connector for ARKive
estimated execution time:
There is already a published data for this resource. This connector modifies the resource 6.xml in Beast.
Partner is un-responsive for a while so we decided to just adjust the resource XML ourselves.
This connector re-maps text objects titled "Description": the subject
SPMInfoItems#GeneralDescription is replaced with SPMInfoItems#TaxonBiology.
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 6;

/* change subject mapping from #GeneralDescription to #TaxonBiology */
require_library('ResourceDataObjectElementsSetting');

// Default location; overridden by the accesspoint_url registered for the resource, if any.
$resource_path = "http://dl.dropbox.com/u/7597512/resources/6.xml.gz";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard against a failed query or a missing row before reading the column.
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
    if ($resource_path != $new_resource_path && $new_resource_path != '') {
        $resource_path = $new_resource_path;
    }
}
print "\n processing resource:\n {$resource_path} \n\n";

$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();
$xml = $func->replace_data_object_element_value_with_condition(
    "subject",
    "http://rs.tdwg.org/ontology/voc/SPMInfoItems#GeneralDescription",
    "http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology",
    $xml,
    "dc:title",
    "Description"
);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes \n";
echo "\n\n Done processing.";
/* connector for The Biodiversity of Tamborine Mountain
estimated execution time:
Partner provides a list of URLs for its individual species XML.
The connector loops over this list and compiles each XML into 1 final XML for EOL ingestion.
Afterwards it applies a Class fix (WEB-5509) and appends the partner's Vimeo videos (DATA-1592).
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
require_library('connectors/ConabioAPI');
$resource_id = 106;
$func = new ConabioAPI();
$func->combine_all_xmls($resource_id);
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";

// /* working well - replaces Class='Insecta' to 'Reptilia' if Order=='Squamata' --- WEB-5509
require_library('ResourceDataObjectElementsSetting');
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = file_get_contents($resource_path);
$xml = $func->replace_taxon_element_value_with_condition("dwc:Class", "Insecta", "Reptilia", $xml, "dwc:Order", "Squamata");
$func->save_resource_document($xml);
// */

// start - this will get Tamborines videos from Vimeo and append it with the main resource 106.xml (DATA-1592)
$vimeo_temp_files = array(
    CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine1.xml",
    CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine2.xml",
);
Functions::file_rename($resource_path, $vimeo_temp_files[0]);
get_videos_from_vimeo();
Functions::combine_all_eol_resource_xmls($resource_id, CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine*.xml");
// Only delete the temp files that actually exist: if one of the steps above produced
// nothing, a blind unlink() would raise a warning.
foreach ($vimeo_temp_files as $vimeo_temp_file) {
    if (is_file($vimeo_temp_file)) {
        unlink($vimeo_temp_file);
    }
}
// end

// Only publish when a combined file exists and is larger than 1000 bytes.
clearstatcache(true, $resource_path);
if (is_file($resource_path) && filesize($resource_path) > 1000) {
    Functions::set_resource_status_to_force_harvest($resource_id);
    Functions::gzip_resource_xml($resource_id);
}
debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $new_resource_path);
    return;
} // closes an if-block that was opened before this excerpt

// Write the downloaded payload ($new_resource, obtained above this excerpt) to disk.
fwrite($OUT, $new_resource);
fclose($OUT);

// Decompress the file just written (gunzip -f).
shell_exec("gunzip -f " . $new_resource_path);

// From here on work with temp/22.xml
// NOTE(review): presumably this is the file gunzip produced; the earlier value of
// $new_resource_path is not visible in this excerpt - confirm.
$new_resource_path = DOC_ROOT . "temp/22.xml";
$xml = file_get_contents($new_resource_path);
// $xml = str_replace("<dc:description>", "<dc:description><![CDATA[", $xml);
// $xml = str_replace("</dc:description>", "]]></dc:description>", $xml);

// Unwrap attribute-less <a>...</a> elements that contain no nested tags, keeping only their text.
$xml = preg_replace("/<a>([^<]+)<\\/a>/", "\\1", $xml);

// Prepend an XML declaration when the document has none.
if (substr_count($xml, "<?xml") == 0) {
    $xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" . $xml;
}

// Save the cleaned document as the local resource file 22.xml.
$old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . "22.xml";
if (!($OUT = fopen($old_resource_path, "w+"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $old_resource_path);
    return;
}
fwrite($OUT, $xml);
fclose($OUT);

// Remove the temporary copy.
shell_exec("rm " . $new_resource_path);

// Second pass over the saved resource: the document is passed to
// remove_data_object_of_certain_element_value() with element "mediaURL" and the
// value "http://animaldiversity.ummz.umich.edu/", and the result is saved again.
$resource_id = 22;
require_library('ResourceDataObjectElementsSetting');
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml_string = file_get_contents($resource_path);
$xml = $func->remove_data_object_of_certain_element_value("mediaURL", "http://animaldiversity.ummz.umich.edu/", $xml_string);
$func->save_resource_document($xml);

// Flag the resource for harvesting and gzip the resource XML.
Functions::set_resource_status_to_force_harvest($resource_id);
Functions::gzip_resource_xml($resource_id);
estimated execution time: 5 minutes
This script will modify the original Efloras resource (17_orig.xml).
    - change subject GeneralDescription to Morphology
    - remove all references
    - splits the "habitat & distribution" into "habitat" and "distribution", each a <dataObject> of its own
    - then split habitat further into #cyclicity (flowering-time) and #habitat
    - re-map text objects with title 'Comments' to http://www.eol.org/voc/table_of_contents#Notes
    - change schema ver. from 0.1 to 0.3
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 17;

// $resource_path = "http://localhost/~eolit/eol_php_code/applications/content_server/resources/17_test.xml"; //test data
$resource_path = "http://dl.dropbox.com/u/7597512/resources/17_orig.xml.gz";

$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();

//removes the <reference> entries, faster this way than to loop each entry.
// Step 1 strips the attributes from every opening <reference ...> tag so that
// step 2 can match and delete whole <reference>...</reference> elements.
$xml = preg_replace("/<reference (.*?)>/ims", "<reference>", $xml);
$xml = preg_replace("/<reference>(.*?)<\\/reference>/ims", "", $xml);

//replace schema 0.1 to 0.3 because the new resource uses <additionalInformation> element
$xml = str_ireplace("/transfer/content/0.1", "/transfer/content/0.3", $xml);
$xml = str_ireplace("/content_0_1.xsd", "/content_0_3.xsd", $xml);

//re-maps the #GeneralDescription to #Morphology
$xml = str_ireplace("<subject>http://rs.tdwg.org/ontology/voc/SPMInfoItems#GeneralDescription</subject>", "<subject>http://rs.tdwg.org/ontology/voc/SPMInfoItems#Morphology</subject>", $xml);

//splits the "habitat & distribution" into "habitat" and "distribution", each a <dataObject> of its own
// (split_habitat_and_distribution() is defined outside this excerpt)
$xml = split_habitat_and_distribution($xml);

// Open the local resource file 17.xml for writing; the if-body continues past this excerpt.
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
if (!($OUT = fopen($resource_path, "w"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
    return;
- If needed ingests TypeInformation text dataObjects
- replaces wrong mimeType value
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 341;

// Default location; overridden by the accesspoint_url registered for the resource, if any.
$resource_path = "http://collections.mnh.si.edu/services/eol/nmnh-birds-response.xml.gz"; //Birds resource
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard against a failed query or a missing row before reading the column.
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
    if ($resource_path != $new_resource_path && $new_resource_path != '') {
        $resource_path = $new_resource_path;
    }
}
echo "\n processing resource:\n {$resource_path} \n\n";

// Rating pass: the setting object is built for StillImage with rating value 2.
$nmnh = new ResourceDataObjectElementsSetting($resource_id, $resource_path, 'http://purl.org/dc/dcmitype/StillImage', 2);
$xml = $nmnh->set_data_object_rating_on_xml_document();

require_library('connectors/INBioAPI');
$xml = INBioAPI::assign_eol_subjects($xml);

// Normalise the mimeType first so that both spellings are covered by the removal that follows.
$xml = $nmnh->replace_data_object_element_value("mimeType", "audio/wav", "audio/x-wav", $xml);
$xml = $nmnh->remove_data_object_of_certain_element_value("mimeType", "audio/x-wav", $xml); // to exclude <dataObject>'s of this element and value
$nmnh->save_resource_document($xml);

Functions::set_resource_status_to_force_harvest($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes \n";
echo "elapsed time = " . $elapsed_time_sec / 60 / 60 . " hours \n";
echo "\n\n Done processing.";
/**
 * Removes from the local resource XML every <dataObject> whose dc:identifier appears in
 * the remote BHL_images_in_EOLGroup.txt list (a JSON-encoded list of identifiers), then
 * passes the document through delete_taxon_if_no_dataObject() and writes it back to
 * CONTENT_RESOURCE_LOCAL_PATH/<resource_id>.xml.
 *
 * If the identifier list cannot be decoded it is treated as empty; if the resource XML
 * cannot be parsed the function reports it and leaves the resource file untouched.
 *
 * @param int|string $resource_id  resource whose "<resource_id>.xml" file is filtered
 * @return void
 */
function remove_bhl_images_already_existing_in_eol_group($resource_id)
{
    $file = "http://dl.dropbox.com/u/7597512/BHL_images/BHL_images_in_EOLGroup.txt";
    // $file = "http://localhost/cp/BHL/BHL_images/BHL_images_in_EOLGroup.txt";
    $contents = Functions::get_remote_file($file, array('timeout' => 600, 'download_attempts' => 5));
    $do_ids = json_decode($contents, true);
    if (!is_array($do_ids)) {
        // download failed or the payload was not a JSON list: nothing to match against
        $do_ids = array();
    }
    print "\n\n from text file: " . count($do_ids);

    // Index the identifiers by value so each membership test is a hash lookup
    // instead of a scan over the whole list.
    $listed_ids = array();
    foreach ($do_ids as $listed_id) {
        if (is_scalar($listed_id)) {
            $listed_ids[(string) $listed_id] = true;
        }
    }

    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $xml_string = Functions::get_remote_file($resource_path, array('timeout' => 240, 'download_attempts' => 5));
    $xml = simplexml_load_string($xml_string);
    if ($xml === false) {
        print "\n\n could not parse resource XML: {$resource_path}";
        return;
    }

    $i = 0;
    $deleted_ids = array();
    $deleted = 0;
    foreach ($xml->taxon as $taxon) {
        $i++;
        $dwc = $taxon->children("http://rs.tdwg.org/dwc/dwcore/");
        echo "\n[" . $dwc->ScientificName . "]";

        // First pass: only record the positions to delete; removing nodes while
        // iterating over them would disturb the iteration.
        $j = 0;
        $deleted_do_keys = array();
        foreach ($taxon->dataObject as $do) {
            $j++;
            $dc2 = $do->children("http://purl.org/dc/elements/1.1/");
            $do_id = trim($dc2->identifier);
            if (isset($listed_ids[$do_id])) {
                $deleted++;
                $deleted_ids[$do_id] = 1;
                print "\n --- deleting {$do_id}";
                $deleted_do_keys[] = $j - 1;
            }
        }

        // Second pass: remove from the highest index down. Each unset() shifts the
        // indices of the following siblings, so removing in ascending order would
        // delete the wrong <dataObject> elements after the first removal.
        foreach (array_reverse($deleted_do_keys) as $key) {
            unset($xml->taxon[$i - 1]->dataObject[$key]);
        }
    }
    print "\n\n occurrence do_ids: {$i}";
    print "\n\n deleted <dataObject>s: {$deleted}";
    print "\n\n deleted unique do_ids: " . count($deleted_ids);

    $xml_string = $xml->asXML();
    require_library('ResourceDataObjectElementsSetting');
    $xml_string = ResourceDataObjectElementsSetting::delete_taxon_if_no_dataObject($xml_string);

    if (!($WRITE = Functions::file_open($resource_path, "w"))) {
        return;
    }
    fwrite($WRITE, $xml_string);
    fclose($WRITE);
}
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');

$timestart = time_elapsed();
$resource_id = 20;

// Built-in location of the partner XML; a non-empty accesspoint_url from the
// resources table takes precedence over it.
$resource_path = "http://www.pensoft.net/J_FILES/EoLData/ZooKeys.xml";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
$row = $result ? $result->fetch_row() : null;
if ($row) {
    $resource_path_from_registry = $row[0];
    if ($resource_path_from_registry != '' && $resource_path_from_registry != $resource_path) {
        $resource_path = $resource_path_from_registry;
    }
}
echo "\n processing resource: {$resource_path} \n";

// Work on a local copy of the remote document.
$download_options = array('download_wait_time' => 1000000, 'timeout' => 600, 'download_attempts' => 5);
$local_path = Functions::save_remote_file_to_local($resource_path, $download_options);
if ($local_path) {
    $func = new ResourceDataObjectElementsSetting($resource_id, $local_path);

    // get_values() runs before remove_elements(); the first saved document is
    // the output of remove_elements().
    $dataObjects = get_values($local_path);
    $func->save_resource_document(remove_elements($local_path));

    // Second pass works on the saved resource file and feeds the collected values back in.
    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $func->save_resource_document(fill_up_values($resource_path, $dataObjects));

    Functions::set_resource_status_to_force_harvest($resource_id);

    // remove tmp file
    unlink($local_path);
    debug("\n temporary file removed: [{$local_path}]");
}

$elapsed_time_sec = time_elapsed() - $timestart;
$elapsed_time_min = $elapsed_time_sec / 60;
echo "\n",
    "elapsed time = {$elapsed_time_sec} seconds \n",
    "elapsed time = " . $elapsed_time_min . " minutes \n";
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 39;
require_library('ResourceDataObjectElementsSetting');

// Candidate source locations; only the last (uncommented) one is used as the default.
// $resource_path = "http://localhost/~eolit/cp/UnivAlberta/data.xml.gz";
// $resource_path = "http://project.macs.ualberta.ca/services/eol/data.xml.gz";
$resource_path = "https://dl.dropboxusercontent.com/u/7597512/UnivAlberta/data.xml.gz";

// A non-empty accesspoint_url stored for the resource overrides the default above.
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
$row = $result->fetch_row();
// The @ suppresses the notice raised when no row was returned.
$new_resource_path = @$row[0];
if ($resource_path != $new_resource_path && $new_resource_path != '') {
    $resource_path = $new_resource_path;
}
print "\n processing resource:\n {$resource_path} \n\n";

// Load the XML, run it through fix_url_format() (declared below), save it,
// flag the resource for harvest and gzip the resource XML.
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();
$xml = fix_url_format($xml);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
Functions::gzip_resource_xml($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes \n";
echo "\n\n Done processing.";

// Parses the XML string and walks its <taxon> elements; the rest of the body
// (and the return value) lies beyond this excerpt.
function fix_url_format($xml_string) {
    if ($xml = simplexml_load_string($xml_string)) {
        echo "\nfixing URL format " . count($xml->taxon) . "-- please wait...";
        foreach ($xml->taxon as $taxon) {
namespace php_active_record;
/* connector for Natural History Museum Species of the day
estimated execution time: 1 second
There is already a published data for this resource. This connector modifies the resource 281.xml in Beast.
This resource is set to import once.
This connector will replace all <subject> elements to "#TaxonBiology" only if <dc:title> is "Introduction".
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 281;
require_library('ResourceDataObjectElementsSetting');

$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
//if partner will provide an accesspointURL the connector will use that, if not it will use what is uploaded.
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard against a failed query or a missing row before reading the column.
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
    if ($new_resource_path != '') {
        $resource_path = $new_resource_path;
    }
}
print "\n processing resource:\n {$resource_path} \n\n";

$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = Functions::get_remote_file($resource_path);
// Empty "old value" plus the trailing false flag: see the helper for their exact meaning;
// the condition is dc:title == "Introduction".
$xml = $func->replace_data_object_element_value_with_condition(
    "subject",
    "",
    "http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology",
    $xml,
    "dc:title",
    "Introduction",
    false
);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes \n";
echo "\n\n Done processing.";
<?php
namespace php_active_record;
/* connector for Wikimedia Commons (resource 71)
accesspoint_url = "http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2";

Runs the Wikimedia harvest, then post-processes the generated 71.xml by passing it through
remove_data_object_of_certain_element_value() with an empty "dataType" value.
*/

// NOTE(review): the original comment said "2 second wait after every web request", but the
// value is 1000000 - presumably microseconds, i.e. 1 second. Confirm the intended delay.
define('DOWNLOAD_WAIT_TIME', '1000000');
include_once dirname(__FILE__) . "/../../config/environment.php";
// $GLOBALS['ENV_DEBUG'] = false;

// These two constants were fused into one garbled statement; restored as separate defines.
define("WIKI_USER_PREFIX", "http://commons.wikimedia.org/wiki/User:");
define("WIKI_PREFIX", "http://commons.wikimedia.org/wiki/");
require_vendor("wikipedia");

$resource_id = 71;
if (!Functions::can_this_connector_run($resource_id)) {
    return;
}

$w = new WikimediaHarvester(Resource::find($resource_id));
$w->begin_wikimedia_harvest("update_resources/connectors/files/");

sleep(120); // delay 2 mins.

require_library("ResourceDataObjectElementsSetting");
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml_string = file_get_contents($resource_path);
$xml = $func->remove_data_object_of_certain_element_value("dataType", "", $xml_string);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
- If needed ingests TypeInformation text dataObjects
*/

include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 346;

// Default location; overridden by the accesspoint_url registered for the resource, if any.
$resource_path = "http://collections.mnh.si.edu/services/eol/nmnh-botany-response.xml.gz"; //Botany Resource
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard against a failed query or a missing row before reading the column.
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
    if ($resource_path != $new_resource_path && $new_resource_path != '') {
        $resource_path = $new_resource_path;
    }
}
echo "\n processing resource:\n {$resource_path} \n\n";

// Rating pass: the setting object is built for StillImage with rating value 2.
$nmnh = new ResourceDataObjectElementsSetting($resource_id, $resource_path, 'http://purl.org/dc/dcmitype/StillImage', 2);
$xml = $nmnh->set_data_object_rating_on_xml_document();

//manual fix DATA-1189, until partner fixes their data
$xml = str_ireplace("Photograph of Photograph of", "Photograph of", $xml);

//manual fix DATA-1205
// (replace_Indet_sp() and remove_blank_taxon_entry() are defined outside this excerpt)
$xml = replace_Indet_sp($xml);
$xml = remove_blank_taxon_entry($xml);

require_library('connectors/INBioAPI');
$xml = INBioAPI::assign_eol_subjects($xml);

//fix DATA-1420
$xml = $nmnh->remove_data_object_of_certain_element_value("mimeType", "image/x-adobe-dng", $xml);
$nmnh->save_resource_document($xml);

Functions::set_resource_status_to_force_harvest($resource_id);

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds \n";