/**
 * Validates the XML document at $uri, either against the XSD it declares or
 * for well-formedness only.
 *
 * The schema location is always resolved first, so a document that declares
 * no XSD is reported as an error even when $only_well_formedness is true.
 * Known services.eol.org schema URLs are swapped for the local copies under
 * WEB_ROOT before the schema is applied.
 *
 * @param string $uri                  Location of the XML document to check.
 * @param bool   $only_well_formedness When true the schema is not applied and
 *                                     only errors raised while reading the
 *                                     document are collected.
 * @return bool|array false when $uri is empty, true when no errors were
 *                    collected, otherwise an array of error messages.
 */
public static function validate($uri, $only_well_formedness = false)
{
    if (!$uri) {
        return false;
    }

    // try to find the XSD and fail if it cannot
    $schema_location = self::get_schema_location($uri);
    if (!$schema_location) {
        return array("There was no XSD defined in this XML file");
    }

    // we have had problems in the past with Services being unavailable, so
    // instead of checking there just use the local schemas which are the same
    $local_schema_versions = array('0_1', '0_2', '0_3', '0_3_18', '0_4', '1_0');
    foreach ($local_schema_versions as $version) {
        if ($schema_location == "http://services.eol.org/schema/content_{$version}.xsd") {
            $schema_location = WEB_ROOT . "applications/schema/content_{$version}.xsd";
            break;
        }
    }

    libxml_use_internal_errors(true);
    libxml_clear_errors();

    $reader = new \XMLReader();
    // open() returns false when the source cannot be opened; without this check
    // the failure would surface later as a misleading schema error, or not at all
    if (!$reader->open($uri, 'utf8')) {
        $message = "The XML file could not be opened: {$uri}";
        write_to_resource_harvesting_log($message);
        return array($message);
    }

    if (!$only_well_formedness && !@$reader->setSchema($schema_location)) {
        $reader->close();
        $message = "The specified schema could not be loaded or contained errors: {$schema_location}";
        write_to_resource_harvesting_log($message);
        return array($message);
    }

    // discard anything logged while opening the document / loading the schema
    libxml_clear_errors();
    while (@$reader->read()) {
        // intentionally empty: reading every node loads errors into the libxml error cache
    }

    // collect the errors before releasing the reader
    $errors = self::get_errors();
    $reader->close();

    if ($errors) {
        write_to_resource_harvesting_log(implode(",", $errors));
        return $errors;
    }
    return true;
}
/**
 * Builds $this->dataset_metadata from the archive's dataset/* files and
 * attaches citation text taken from the Catalogue of Life citation page.
 *
 * Apart from resetting $this->dataset_metadata, nothing happens unless the
 * archive contains a dataset/ directory with a col.xml file. Each parsed
 * dataset is stored under its abbreviated name and, when its filename has the
 * form <digits>.xml, is also aliased (by reference) under that numeric ID.
 *
 * Terminates the script (exit) when no citation was found for either
 * "Catalogue of Life" or "FishBase".
 *
 * @return void
 */
private function collect_dataset_attribution()
{
    $this->dataset_metadata = array();

    $dataset_dir = $this->harvest_event->resource->archive_path() . "dataset";
    if (!is_dir($dataset_dir) || !file_exists($dataset_dir . "/col.xml")) {
        return;
    }

    // glob() returns false on error; treat that as "no files"
    $filenames = glob($dataset_dir . "/*") ?: array();
    foreach ($filenames as $filename) {
        // Reset for every file: a name that is not <digits>.xml (e.g. col.xml) has no
        // dataset ID and must not inherit the ID matched in the previous iteration.
        $dataset_id = null;
        if (preg_match("/\\/([0-9]+)\\.xml\$/", $filename, $arr)) {
            $dataset_id = $arr[1];
        }

        $xml = simplexml_load_file($filename);
        if ($xml === false) {
            // not parseable as XML, so there is no metadata to read from it
            continue;
        }

        $title = trim($xml->dataset->title);
        if (preg_match("/^(.*) in the Catalogue of Life/", $title, $arr)) {
            $title = trim($arr[1]);
        }
        // NOTE(review): both arguments read as a single space here, which makes this a
        // no-op; presumably it was meant to normalise a non-breaking or doubled space - confirm
        $title = str_replace(" ", " ", $title);

        $editors = trim($xml->additionalMetadata->metadata->sourceDatabase->authorsAndEditors);
        if (preg_match("/^(.*)\\. For a full list/", $editors, $arr)) {
            $editors = trim($arr[1]);
        }
        if (preg_match("/^(.*); for detailed information/", $editors, $arr)) {
            $editors = trim($arr[1]);
        }
        // NOTE(review): same apparent no-op as for the title above - confirm
        $editors = str_replace(" ", " ", $editors);

        $abbreviatedName = trim($xml->additionalMetadata->metadata->sourceDatabase->abbreviatedName);

        $this->dataset_metadata[$abbreviatedName]['title'] = $title;
        $this->dataset_metadata[$abbreviatedName]['editors'] = $editors;
        $this->dataset_metadata[$abbreviatedName]['abbreviatedName'] = $abbreviatedName;
        $this->dataset_metadata[$abbreviatedName]['datasetID'] = $dataset_id;
        if ($dataset_id !== null) {
            // alias by reference so a citation added under the name is visible under the ID too
            $this->dataset_metadata[$dataset_id] =& $this->dataset_metadata[$abbreviatedName];
        }
    }

    // now go grab the citation information from the COL website
    $url = "http://www.catalogueoflife.org/col/info/cite";
    $options_for_log_harvest = array('resource_id' => $this->harvest_event->resource->id);
    $html = Functions::get_remote_file($url, $options_for_log_harvest);
    preg_match_all("/<p><strong>(.*?)<\\/strong><br\\/>(.*?)<\\/p>/ims", $html, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        $dataset_name = $match[1];
        if (preg_match("/^(.*) via ITIS/", $dataset_name, $arr)) {
            $dataset_name = trim($arr[1]);
        }
        $citation = $match[2];
        if (isset($this->dataset_metadata[$dataset_name])) {
            $this->dataset_metadata[$dataset_name]['citation'] = $citation;
        } elseif ($dataset_name == "Species 2000 Common Names" && isset($this->dataset_metadata["Catalogue of Life"])) {
            // the common-names citation is filed under the Catalogue of Life entry
            $this->dataset_metadata["Catalogue of Life"]['citation'] = $citation;
        }
    }

    // these two citations are used as the signal that the page was fetched and parsed
    if (!isset($this->dataset_metadata["Catalogue of Life"]['citation']) || !isset($this->dataset_metadata["FishBase"]['citation'])) {
        echo "Tried getting attribution for Catalogue of Life datasets, but there was a problem\n";
        write_to_resource_harvesting_log("Tried getting attribution for Catalogue of Life datasets, but there was a problem");
        exit;
    }
}
/**
 * Re-grabs the image behind a data object with a percentage-based crop.
 *
 * Called by a user interaction (custom crop). Per the original note, a missing
 * $h_pct is meant to give a square crop; this method only forwards the value,
 * so that handling presumably lives in grab_file() - confirm there.
 *
 * @param mixed $data_object_id ID passed to DataObject::find().
 * @param mixed $x_pct          Crop origin X, forwarded in 'crop_pct'.
 * @param mixed $y_pct          Crop origin Y, forwarded in 'crop_pct'.
 * @param mixed $w_pct          Crop width, forwarded in 'crop_pct'.
 * @param mixed $h_pct          Crop height, forwarded in 'crop_pct' (may be NULL).
 * @return mixed Whatever grab_file() returns; nothing when the data object is
 *               missing, is not an image, or has no object_cache_url.
 */
public function crop_image_pct($data_object_id, $x_pct, $y_pct, $w_pct, $h_pct = NULL)
{
    $data_object = DataObject::find($data_object_id);

    if (!$data_object) {
        $problem = "ContentManager: Cropping invalid data object ID {$data_object_id}";
        write_to_resource_harvesting_log($problem);
        trigger_error($problem, E_USER_NOTICE);
        return;
    }

    if (!$data_object->is_image() || !$data_object->object_cache_url) {
        return;
    }

    /* The filename extension of the original file is not saved, so until it can
       be read from the database each valid extension is tried in turn. */
    $cache_path = self::cache_num2path($data_object->object_cache_url);
    $local_base = CONTENT_LOCAL_PATH . $cache_path;
    foreach (self::$valid_image_extensions as $extension) {
        $image_url = $local_base . "." . $extension;
        if (is_file($image_url)) {
            break;
        }
    }

    // If we can't find the original download, fall back to the local and then
    // the remote previously saved jpg version as the original (yuck)
    if (!is_file($image_url)) {
        $image_url = $local_base . "_orig.jpg";
    }
    if (!is_file($image_url)) {
        $image_url = "http://content.eol.org/content/" . $cache_path . "_orig.jpg";
    }

    $grab_options = array(
        'crop_pct' => array($x_pct, $y_pct, $w_pct, $h_pct),
        'data_object_id' => $data_object->id,
        'data_object_guid' => $data_object->guid,
    );
    return $this->grab_file($image_url, "image", $grab_options);
}
/**
 * Sends a SPARQL "INSERT DATA INTO <graph>" request to $this->upload_uri,
 * authenticating with HTTP basic auth from $this->username / $this->password.
 *
 * On a cURL error the error, the query and the options are echoed and the
 * failure is written to the resource harvesting log.
 *
 * @param array $options 'data'       => array of statements, joined with " .\n"
 *                                       inside the INSERT DATA braces;
 *                       'graph_name' => name placed between < and > as the
 *                                       target graph.
 * @return string|false|null Response body on success, false when cURL reports
 *                           an error, null when there is no data to insert.
 */
public function insert_data($options = array())
{
    // empty() also covers a missing 'data' key, which the default array() produces
    if (empty($options['data'])) {
        return null;
    }

    $query = self::append_namespaces_to_query();
    // implode() takes the separator first; the reversed legacy order was removed in PHP 8.0
    $query .= " INSERT DATA INTO <" . $options['graph_name'] . "> { " . implode(" .\n", $options['data']) . " }";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $this->upload_uri);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-type: application/sparql-query'));
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
    curl_setopt($ch, CURLOPT_USERPWD, $this->username . ":" . $this->password);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $query);

    $result = curl_exec($ch);
    // read the error state before closing so the handle is released on every path
    $error_number = curl_errno($ch);
    $error_message = curl_error($ch);
    curl_close($ch);

    if ($error_number == 0) {
        return $result;
    }

    echo "\n\n=========================================\n";
    echo 'Curl error: ' . $error_message . "\n\n";
    echo "{$query}\n\n";
    print_r($options);
    print_r(serialize($options['data']));
    echo "===========================================\n\n";
    write_to_resource_harvesting_log("ERROR: Inserting data in virtuoso");
    write_to_resource_harvesting_log('Curl error: ' . $error_message);
    write_to_resource_harvesting_log($query);
    return false;
}
/**
 * Validates one archive row with the eol_schema class matching its row type,
 * updates the running statistics, and files every resulting error or warning
 * under the row's type, file location and line number.
 *
 * @param mixed $row        Row values, read and written by term URI.
 * @param mixed $parameters Read for 'row_type', 'archive_table_definition'
 *                          (its ->location) and 'archive_line_number'; also
 *                          passed on to append_identifier_error().
 * @return void
 */
public function validate_row($row, $parameters)
{
    // progress trace every 10000 calls, only when ENV_DEBUG is set
    static $rows_seen = 0;
    $rows_seen++;
    if ($rows_seen % 10000 == 0 && $GLOBALS['ENV_DEBUG']) {
        echo "{$rows_seen}: " . time_elapsed() . " :: " . memory_get_usage() . "\n";
        write_to_resource_harvesting_log($rows_seen . ": " . time_elapsed() . "::" . memory_get_usage());
    }

    $file_location = $parameters['archive_table_definition']->location;
    $row_type = $parameters['row_type'];
    $new_exceptions = array();

    switch ($row_type) {
        case 'http://eol.org/schema/media/document':
            $license_term = 'http://purl.org/dc/terms/license';
            $usage_terms = 'http://ns.adobe.com/xap/1.0/rights/UsageTerms';

            // a row with neither license field takes the archive resource's license URL, if there is one
            $row_lacks_license = @(!$row[$license_term]) && @(!$row[$usage_terms]);
            if ($row_lacks_license
                && $this->archive_resource
                && $this->archive_resource->license
                && $this->archive_resource->license->source_url
            ) {
                $row[$usage_terms] = $this->archive_resource->license->source_url;
                unset($row[$license_term]);
            }

            $new_exceptions = \eol_schema\MediaResource::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);

            if (!self::any_exceptions_of_type_error($new_exceptions)) {
                // stat name => row field whose value is counted, in reporting order
                $stat_fields = array(
                    'type' => 'http://purl.org/dc/terms/type',
                    'subtype' => 'http://rs.tdwg.org/audubon_core/subtype',
                    'license' => $usage_terms,
                    'subject' => 'http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm',
                    'language' => 'http://purl.org/dc/terms/language',
                    'format' => 'http://purl.org/dc/terms/format',
                );
                foreach ($stat_fields as $stat_name => $field) {
                    if (@($value = $row[$field])) {
                        $this->add_stat($stat_name, $row_type, $file_location, $value);
                    }
                }
            }
            break;

        case 'http://rs.tdwg.org/dwc/terms/taxon':
            $new_exceptions = \eol_schema\Taxon::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://rs.tdwg.org/dwc/terms/taxonID', $parameters, $new_exceptions);
            break;

        case 'http://rs.gbif.org/terms/1.0/vernacularname':
            $new_exceptions = \eol_schema\VernacularName::validate_by_hash($row, $this->skip_warnings);
            if (!self::any_exceptions_of_type_error($new_exceptions)) {
                if (@($value = $row['http://purl.org/dc/terms/language'])) {
                    $this->add_stat('language', $row_type, $file_location, $value);
                }
            }
            break;

        case 'http://eol.org/schema/reference/reference':
            $new_exceptions = \eol_schema\Reference::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);
            break;

        case 'http://eol.org/schema/agent/agent':
            $new_exceptions = \eol_schema\Agent::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);
            break;

        case 'http://rs.tdwg.org/dwc/terms/measurementorfact':
            $new_exceptions = \eol_schema\MeasurementOrFact::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://rs.tdwg.org/dwc/terms/measurementID', $parameters, $new_exceptions);
            break;

        case 'http://eol.org/schema/association':
            $new_exceptions = \eol_schema\Association::validate_by_hash($row, $this->skip_warnings);
            $this->append_identifier_error($row, 'http://eol.org/schema/associationID', $parameters, $new_exceptions);
            break;
    }

    // rows without an error-type exception count towards the per-row-type total
    if (!self::any_exceptions_of_type_error($new_exceptions)) {
        if (!isset($this->stats[$row_type])) {
            $this->stats[$row_type] = array();
        }
        if (!isset($this->stats[$row_type]['Total'])) {
            $this->stats[$row_type]['Total'] = 0;
        }
        $this->stats[$row_type]['Total']++;
    }

    if (!$new_exceptions) {
        return;
    }

    foreach ($new_exceptions as $exception) {
        // stamp each exception with where it came from before filing it
        $exception->file = $file_location;
        $exception->line = $parameters['archive_line_number'];

        $is_error = get_class($exception) == 'eol_schema\\ContentArchiveError';
        if ($is_error) {
            if (!isset($this->errors_by_line[$row_type][$file_location][$exception->line])) {
                $this->errors_by_line[$row_type][$file_location][$exception->line] = array();
            }
            $this->errors_by_line[$row_type][$file_location][$exception->line][] = $exception;
        } elseif (!$this->skip_warnings) {
            // every other exception class is filed as a warning unless warnings are skipped
            if (!isset($this->warnings_by_line[$row_type][$file_location][$exception->line])) {
                $this->warnings_by_line[$row_type][$file_location][$exception->line] = array();
            }
            $this->warnings_by_line[$row_type][$file_location][$exception->line][] = $exception;
        }
    }
}