Пример #1
0
 /**
  * Rebuilds the EOL resource XML for one LifeDesk and writes a gzipped copy next to it.
  *
  * Reads the XML found at $this->text_path["eol_xml"], runs it through the two
  * dc:source replacement calls and remove_tags_in_references(), hands the result
  * to ResourceDataObjectElementsSetting::save_resource_document(), and finally
  * gzips CONTENT_RESOURCE_LOCAL_PATH/<lifedesk_name>.xml into <lifedesk_name>.xml.gz.
  *
  * @param string $lifedesk_name LifeDesk name; used both as the resource identifier
  *                              and as the basename of the resource file.
  * @return void
  */
 private function update_eol_xml($lifedesk_name)
 {
     /*
     Element counts recorded by the original author
     (NOTE(review): presumably from one sample resource - confirm which one):
     taxon = 434
     dwc:ScientificName = 434
     reference = 614
     synonym = 68
     commonName = 2
     dataObjects = 1705
     reference = 0
     texts = 1146
     images = 559
     videos = 0
     sounds = 0
     */
     require_library('ResourceDataObjectElementsSetting');
     $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $lifedesk_name . ".xml";
     $func = new ResourceDataObjectElementsSetting($lifedesk_name, $resource_path);
     $xml = file_get_contents($this->text_path["eol_xml"]);
     if ($xml === false) {
         // The source XML could not be read; do not push a non-string through
         // the replace/save steps below.
         return;
     }
     $xml = $func->replace_taxon_element_value("dc:source", "replace any existing value", "", $xml, false);
     $xml = $func->replace_data_object_element_value("dc:source", "replace any existing value", "", $xml, false);
     $xml = self::remove_tags_in_references($xml);
     $func->save_resource_document($xml);
     // zip the xml; both paths are shell-escaped so names containing spaces or
     // shell metacharacters cannot break (or inject into) the command line
     $command_line = "gzip -c " . escapeshellarg($resource_path) . " > " . escapeshellarg($resource_path . ".gz");
     shell_exec($command_line);
 }
Пример #2
0
/**
 * Loads the XML of resource 346 from the local content server, asks
 * ResourceDataObjectElementsSetting to remove the data objects whose
 * mimeType is "image/x-adobe-dng", and saves the resulting document.
 *
 * @return void
 */
function remove_dataObject()
{
    require_library('ResourceDataObjectElementsSetting');
    $setting = new ResourceDataObjectElementsSetting(
        346,
        "http://localhost/eol_php_code/applications/content_server/resources/346.xml.gz"
    );
    $filtered = $setting->remove_data_object_of_certain_element_value(
        "mimeType",
        "image/x-adobe-dng",
        $setting->load_xml_string()
    );
    $setting->save_resource_document($filtered);
}
Пример #3
0
<?php

namespace php_active_record;

/* connector for Royal Botanic Garden Edinburgh: Herbarium Specimen Images (E)
estimated execution time: 
Published data already exists for this resource, which is set to 'import once'.
The connector modifies the 336.xml in Beast.
*/
// Bootstrap the environment and start the timer.
include_once __DIR__ . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 336;

// Apply a rating of 2 to the StillImage data objects of the local resource XML,
// then save the modified document.
require_library('ResourceDataObjectElementsSetting');
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
$func = new ResourceDataObjectElementsSetting(
    $resource_id,
    $resource_path,
    'http://purl.org/dc/dcmitype/StillImage',
    2
);
$xml = $func->set_data_object_rating_on_xml_document();
$func->save_resource_document($xml);

// Flag the resource for harvesting and report the elapsed time.
Functions::set_resource_status_to_force_harvest($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n",
    "elapsed time = {$elapsed_time_sec} seconds             \n",
    "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n",
    "elapsed time = " . $elapsed_time_sec / 60 / 60 . " hours \n",
    "\n\n Done processing.";
Пример #4
0
namespace php_active_record;

/* connector for ARKive
estimated execution time:
Published data already exists for this resource. This connector modifies the resource 6.xml in Beast.
The partner has been unresponsive for a while, so we decided to adjust the resource XML ourselves.
This connector changes the subject of text objects titled "Description" from #GeneralDescription to #TaxonBiology.
*/
include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 6;
/* change subject mapping from #GeneralDescription to #TaxonBiology */
require_library('ResourceDataObjectElementsSetting');
// Default source; replaced below when the resources table holds a different,
// non-empty accesspoint_url for this resource.
$resource_path = "http://dl.dropbox.com/u/7597512/resources/6.xml.gz";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard the lookup: a failed query or a missing row must not be dereferenced.
$new_resource_path = '';
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
}
if ($resource_path != $new_resource_path && $new_resource_path != '') {
    $resource_path = $new_resource_path;
}
print "\n processing resource:\n {$resource_path} \n\n";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();
// Re-map <subject> from #GeneralDescription to #TaxonBiology, conditioned on dc:title "Description".
$xml = $func->replace_data_object_element_value_with_condition("subject", "http://rs.tdwg.org/ontology/voc/SPMInfoItems#GeneralDescription", "http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology", $xml, "dc:title", "Description");
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n";
echo "\n\n Done processing.";
Пример #5
0
/*connector for The Biodiversity of Tamborine Mountain
estimated execution time: 
The partner provides a list of URLs, one for each individual species XML file.
The connector loops over this list and compiles the XML files into one final XML for EOL ingestion.
*/
// Bootstrap the environment and start the timer.
include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
require_library('connectors/ConabioAPI');
$resource_id = 106;
// Step 1: let ConabioAPI combine the partner's individual XMLs for this resource.
$func = new ConabioAPI();
$func->combine_all_xmls($resource_id);
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
// Step 2 (WEB-5509): replaces Class 'Insecta' with 'Reptilia' where Order is 'Squamata'.
// The original author marked this section as "working well".
require_library('ResourceDataObjectElementsSetting');
// $func is re-used here for a different helper class than in step 1.
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = file_get_contents($resource_path);
$xml = $func->replace_taxon_element_value_with_condition("dwc:Class", "Insecta", "Reptilia", $xml, "dwc:Order", "Squamata");
$func->save_resource_document($xml);
// end of step 2
// Step 3 (DATA-1592): get Tamborine's videos from Vimeo and append them to the main resource 106.xml.
// The main resource is first renamed to temp file 1 so it can be merged via the "temp_vimeo_to_tamborine*.xml" pattern.
Functions::file_rename($resource_path, CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine1.xml");
// NOTE(review): presumably writes temp_vimeo_to_tamborine2.xml, which is deleted below - confirm.
get_videos_from_vimeo();
Functions::combine_all_eol_resource_xmls($resource_id, CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine*.xml");
// Clean up both temporary files.
unlink(CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine1.xml");
unlink(CONTENT_RESOURCE_LOCAL_PATH . "temp_vimeo_to_tamborine2.xml");
// end of step 3
// Only flag the resource for harvesting and gzip it when the resource file is larger than 1000 bytes.
if (filesize($resource_path) > 1000) {
    Functions::set_resource_status_to_force_harvest($resource_id);
    Functions::gzip_resource_xml($resource_id);
}
Пример #6
0
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $new_resource_path);
    return;
}
// Write the downloaded (gzipped) resource to disk and decompress it in place.
fwrite($OUT, $new_resource);
fclose($OUT);
// The path is shell-escaped so spaces or metacharacters in it cannot break the command.
shell_exec("gunzip -f " . escapeshellarg($new_resource_path));
$new_resource_path = DOC_ROOT . "temp/22.xml";
$xml = file_get_contents($new_resource_path);
if ($xml === false) {
    // Decompression or read failed; bail out instead of overwriting 22.xml with an empty document.
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't read file: " . $new_resource_path);
    return;
}
// $xml = str_replace("<dc:description>", "<dc:description><![CDATA[", $xml);
// $xml = str_replace("</dc:description>", "]]></dc:description>", $xml);
// Unwrap attribute-less <a>...</a> elements, keeping only their text.
$xml = preg_replace("/<a>([^<]+)<\\/a>/", "\\1", $xml);
// Prepend an XML declaration when the document has none.
if (substr_count($xml, "<?xml") == 0) {
    $xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" . $xml;
}
$old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . "22.xml";
if (!($OUT = fopen($old_resource_path, "w+"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $old_resource_path);
    return;
}
fwrite($OUT, $xml);
fclose($OUT);
// Remove the temporary copy with unlink() rather than shelling out to `rm`.
if (file_exists($new_resource_path)) {
    unlink($new_resource_path);
}
$resource_id = 22;
require_library('ResourceDataObjectElementsSetting');
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml_string = file_get_contents($resource_path);
// Drop data objects whose mediaURL is "http://animaldiversity.ummz.umich.edu/".
$xml = $func->remove_data_object_of_certain_element_value("mediaURL", "http://animaldiversity.ummz.umich.edu/", $xml_string);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
Functions::gzip_resource_xml($resource_id);
Пример #7
0
estimated execution time: 5 minutes
This script will modify the original Efloras resource (17_orig.xml).
    - change subject GeneralDescript to Morphology
    - remove all references
    - splits the "habitat & distribution" into "habitat" and "distribution", each a <dataObject> of its own
    - then split habitat further into #cyclicity (flowering-time) and #habitat
    - re-map text objects with title 'Comments' to http://www.eol.org/voc/table_of_contents#Notes
    - change schema ver. from 0.1 to 0.3
*/
// Bootstrap the environment, load the helper library and start the timer.
include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 17;
// $resource_path = "http://localhost/~eolit/eol_php_code/applications/content_server/resources/17_test.xml"; //test data
$resource_path = "http://dl.dropbox.com/u/7597512/resources/17_orig.xml.gz";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();
// Remove the <reference> entries with two regex passes; the original author notes this is
// faster than looping over each entry. Pass 1 strips attributes from opening tags so that
// pass 2 can match every <reference>...</reference> pair (non-greedy, across newlines).
$xml = preg_replace("/<reference (.*?)>/ims", "<reference>", $xml);
$xml = preg_replace("/<reference>(.*?)<\\/reference>/ims", "", $xml);
// Switch the schema from 0.1 to 0.3 because the new resource uses the <additionalInformation> element.
$xml = str_ireplace("/transfer/content/0.1", "/transfer/content/0.3", $xml);
$xml = str_ireplace("/content_0_1.xsd", "/content_0_3.xsd", $xml);
// Re-map the #GeneralDescription subject to #Morphology.
$xml = str_ireplace("<subject>http://rs.tdwg.org/ontology/voc/SPMInfoItems#GeneralDescription</subject>", "<subject>http://rs.tdwg.org/ontology/voc/SPMInfoItems#Morphology</subject>", $xml);
// Split "habitat & distribution" into "habitat" and "distribution", each a <dataObject> of its own.
$xml = split_habitat_and_distribution($xml);
// From here on $resource_path points at the local output file, not the remote source.
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
if (!($OUT = fopen($resource_path, "w"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
    return;
Пример #8
0
- If needed ingests TypeInformation text dataObjects
- replaces wrong mimeType value
*/
include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 341;
// Birds resource: default source URL, replaced below when the resources table
// holds a different, non-empty accesspoint_url.
$resource_path = "http://collections.mnh.si.edu/services/eol/nmnh-birds-response.xml.gz";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard the lookup: a failed query or a missing row must not be dereferenced.
$new_resource_path = '';
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
}
if ($resource_path != $new_resource_path && $new_resource_path != '') {
    $resource_path = $new_resource_path;
}
echo "\n processing resource:\n {$resource_path} \n\n";
// Helper configured for StillImage data objects with the value 2 (used by the rating call below).
$nmnh = new ResourceDataObjectElementsSetting($resource_id, $resource_path, 'http://purl.org/dc/dcmitype/StillImage', 2);
$xml = $nmnh->set_data_object_rating_on_xml_document();
require_library('connectors/INBioAPI');
$xml = INBioAPI::assign_eol_subjects($xml);
// Normalise the mimeType "audio/wav" to "audio/x-wav" ...
$xml = $nmnh->replace_data_object_element_value("mimeType", "audio/wav", "audio/x-wav", $xml);
// ... then exclude the <dataObject>s carrying this element and value.
$xml = $nmnh->remove_data_object_of_certain_element_value("mimeType", "audio/x-wav", $xml);
$nmnh->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n";
echo "elapsed time = " . $elapsed_time_sec / 60 / 60 . " hours \n";
echo "\n\n Done processing.";
Пример #9
0
/**
 * Removes from a resource XML every <dataObject> whose dc:identifier is listed
 * in the remote BHL_images_in_EOLGroup.txt file (a JSON list of identifiers),
 * passes the result through
 * ResourceDataObjectElementsSetting::delete_taxon_if_no_dataObject(), and
 * writes the document back to CONTENT_RESOURCE_LOCAL_PATH/<resource_id>.xml.
 *
 * @param int|string $resource_id Resource identifier; also the basename of the XML file.
 * @return void
 */
function remove_bhl_images_already_existing_in_eol_group($resource_id)
{
    $file = "http://dl.dropbox.com/u/7597512/BHL_images/BHL_images_in_EOLGroup.txt";
    // $file = "http://localhost/cp/BHL/BHL_images/BHL_images_in_EOLGroup.txt";
    $contents = Functions::get_remote_file($file, array('timeout' => 600, 'download_attempts' => 5));
    $do_ids = json_decode($contents, true);
    if (!is_array($do_ids)) {
        // Download or decoding failed: treat it as an empty list so that
        // count()/in_array() below do not fail on a non-array value.
        $do_ids = array();
    }
    print "\n\n from text file: " . count($do_ids);
    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $xml_string = Functions::get_remote_file($resource_path, array('timeout' => 240, 'download_attempts' => 5));
    $xml = simplexml_load_string($xml_string);
    if ($xml === false) {
        // Without a parsed document there is nothing to filter or to write back.
        print "\n\n could not parse resource XML: {$resource_path}";
        return;
    }
    $i = 0;
    $deleted_ids = array();
    $deleted = 0;
    foreach ($xml->taxon as $taxon) {
        $i++;
        $dwc = $taxon->children("http://rs.tdwg.org/dwc/dwcore/");
        echo "\n[" . $dwc->ScientificName . "]";
        // Collect the zero-based positions of the data objects to delete; nodes
        // are not removed while the inner iteration is still running.
        $deleted_do_keys = array();
        $j = 0;
        foreach ($taxon->dataObject as $do) {
            $dc2 = $do->children("http://purl.org/dc/elements/1.1/");
            $do_id = trim((string) $dc2->identifier);
            // Loose comparison is kept on purpose: the JSON list may hold
            // identifiers as numbers while the XML always yields strings.
            if (in_array($do_id, $do_ids)) {
                $deleted++;
                $deleted_ids[$do_id] = 1;
                print "\n --- deleting {$do_id}";
                $deleted_do_keys[] = $j;
            }
            $j++;
        }
        // Delete from the highest index down: SimpleXML re-numbers the remaining
        // siblings after each unset(), so ascending deletion would hit the wrong
        // nodes once more than one data object of a taxon is removed.
        foreach (array_reverse($deleted_do_keys) as $key) {
            unset($taxon->dataObject[$key]);
        }
    }
    print "\n\n occurrence do_ids: {$i}";
    print "\n\n deleted <dataObject>s: {$deleted}";
    print "\n\n deleted unique do_ids: " . count($deleted_ids);
    $xml_string = $xml->asXML();
    require_library('ResourceDataObjectElementsSetting');
    $xml_string = ResourceDataObjectElementsSetting::delete_taxon_if_no_dataObject($xml_string);
    if (!($WRITE = Functions::file_open($resource_path, "w"))) {
        return;
    }
    fwrite($WRITE, $xml_string);
    fclose($WRITE);
}
Пример #10
0
*/
// Bootstrap the environment, load the helper library and start the timer.
include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 20;
// Default source URL; replaced below when the resources table holds a different, non-empty accesspoint_url.
$resource_path = "http://www.pensoft.net/J_FILES/EoLData/ZooKeys.xml";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// The lookup is only dereferenced when the query succeeded and returned a row.
if ($result && ($row = $result->fetch_row())) {
    $resource_path_from_registry = $row[0];
    if ($resource_path != $resource_path_from_registry && $resource_path_from_registry != '') {
        $resource_path = $resource_path_from_registry;
    }
}
echo "\n processing resource: {$resource_path} \n";
// Everything below runs only when the remote file could be saved to a local temporary path.
if ($local_path = Functions::save_remote_file_to_local($resource_path, array('download_wait_time' => 1000000, 'timeout' => 600, 'download_attempts' => 5))) {
    $func = new ResourceDataObjectElementsSetting($resource_id, $local_path);
    // Pass 1: collect values from the downloaded file, then save the document returned by remove_elements().
    $dataObjects = get_values($local_path);
    $xml = remove_elements($local_path);
    $func->save_resource_document($xml);
    // Pass 2: feed the collected values back in, reading from CONTENT_RESOURCE_LOCAL_PATH/<id>.xml.
    // NOTE(review): presumably that is the file written by the save just above - confirm.
    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $xml = fill_up_values($resource_path, $dataObjects);
    $func->save_resource_document($xml);
    Functions::set_resource_status_to_force_harvest($resource_id);
    // remove tmp file
    unlink($local_path);
    debug("\n temporary file removed: [{$local_path}]");
}
// Report the elapsed time.
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n";
Пример #11
0
*/
include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 39;
require_library('ResourceDataObjectElementsSetting');
// $resource_path = "http://localhost/~eolit/cp/UnivAlberta/data.xml.gz";
// $resource_path = "http://project.macs.ualberta.ca/services/eol/data.xml.gz";
// Default source URL; replaced below when the resources table holds a different, non-empty accesspoint_url.
$resource_path = "https://dl.dropboxusercontent.com/u/7597512/UnivAlberta/data.xml.gz";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Explicit guard instead of "@" suppression: a failed query or a missing row
// simply leaves the default source in place.
$new_resource_path = '';
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
}
if ($resource_path != $new_resource_path && $new_resource_path != '') {
    $resource_path = $new_resource_path;
}
print "\n processing resource:\n {$resource_path} \n\n";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = $func->load_xml_string();
// Run the document through fix_url_format() before saving it.
$xml = fix_url_format($xml);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
Functions::gzip_resource_xml($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n";
echo "\n\n Done processing.";
function fix_url_format($xml_string)
{
    if ($xml = simplexml_load_string($xml_string)) {
        echo "\nfixing URL format " . count($xml->taxon) . "-- please wait...";
        foreach ($xml->taxon as $taxon) {
Пример #12
0
namespace php_active_record;

/* connector for Natural History Museum Species of the day
estimated execution time: 1 second
Published data already exists for this resource. This connector modifies the resource 281.xml in Beast.
This resource is set to import once.
This connector sets the <subject> element to "#TaxonBiology", but only where <dc:title> is "Introduction".
*/
include_once dirname(__FILE__) . "/../../config/environment.php";
$timestart = time_elapsed();
$resource_id = 281;
require_library('ResourceDataObjectElementsSetting');
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
// If the partner provides an accesspoint URL the connector uses that; otherwise it uses the uploaded file.
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard the lookup: a failed query or a missing row must not be dereferenced.
$new_resource_path = '';
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
}
if ($new_resource_path != '') {
    $resource_path = $new_resource_path;
}
print "\n processing resource:\n {$resource_path} \n\n";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml = Functions::get_remote_file($resource_path);
// Set <subject> to #TaxonBiology, conditioned on dc:title "Introduction".
$xml = $func->replace_data_object_element_value_with_condition("subject", "", "http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology", $xml, "dc:title", "Introduction", false);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes  \n";
echo "\n\n Done processing.";
Пример #13
0
<?php

namespace php_active_record;

/*
accesspoint_url = "http://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-pages-articles.xml.bz2";
*/
// Wait applied after every web request.
// NOTE(review): the original comment says "2 second wait", but the value looks like
// 1,000,000 microseconds (1 second) - confirm the unit against the code that reads it.
define('DOWNLOAD_WAIT_TIME', '1000000');
include_once dirname(__FILE__) . "/../../config/environment.php";
// $GLOBALS['ENV_DEBUG'] = false;
// URL prefixes for Wikimedia Commons user pages and regular pages.
define("WIKI_USER_PREFIX", "http://commons.wikimedia.org/wiki/User:");
define("WIKI_PREFIX", "http://commons.wikimedia.org/wiki/");
require_vendor("wikipedia");
$resource_id = 71;
// Stop here when this connector is not allowed to run for the resource.
if (!Functions::can_this_connector_run($resource_id)) {
    return;
}
$w = new WikimediaHarvester(Resource::find($resource_id));
$w->begin_wikimedia_harvest("update_resources/connectors/files/");
// delay 2 mins.
sleep(120);
require_library("ResourceDataObjectElementsSetting");
$resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
$func = new ResourceDataObjectElementsSetting($resource_id, $resource_path);
$xml_string = file_get_contents($resource_path);
// Remove the data objects whose dataType element has an empty value.
$xml = $func->remove_data_object_of_certain_element_value("dataType", "", $xml_string);
$func->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
Пример #14
0
- If needed ingests TypeInformation text dataObjects
*/
include_once dirname(__FILE__) . "/../../config/environment.php";
require_library('ResourceDataObjectElementsSetting');
$timestart = time_elapsed();
$resource_id = 346;
// Botany resource: default source URL, replaced below when the resources table
// holds a different, non-empty accesspoint_url.
$resource_path = "http://collections.mnh.si.edu/services/eol/nmnh-botany-response.xml.gz";
$result = $GLOBALS['db_connection']->select("SELECT accesspoint_url FROM resources WHERE id={$resource_id}");
// Guard the lookup: a failed query or a missing row must not be dereferenced.
$new_resource_path = '';
if ($result && ($row = $result->fetch_row())) {
    $new_resource_path = $row[0];
}
if ($resource_path != $new_resource_path && $new_resource_path != '') {
    $resource_path = $new_resource_path;
}
echo "\n processing resource:\n {$resource_path} \n\n";
// Helper configured for StillImage data objects with the value 2 (used by the rating call below).
$nmnh = new ResourceDataObjectElementsSetting($resource_id, $resource_path, 'http://purl.org/dc/dcmitype/StillImage', 2);
$xml = $nmnh->set_data_object_rating_on_xml_document();
// manual fix DATA-1189, until the partner fixes their data
$xml = str_ireplace("Photograph of Photograph of", "Photograph of", $xml);
// manual fix DATA-1205
$xml = replace_Indet_sp($xml);
$xml = remove_blank_taxon_entry($xml);
require_library('connectors/INBioAPI');
$xml = INBioAPI::assign_eol_subjects($xml);
// fix DATA-1420: exclude data objects whose mimeType is image/x-adobe-dng
$xml = $nmnh->remove_data_object_of_certain_element_value("mimeType", "image/x-adobe-dng", $xml);
$nmnh->save_resource_document($xml);
Functions::set_resource_status_to_force_harvest($resource_id);
$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = {$elapsed_time_sec} seconds             \n";