예제 #1
0
 /**
  * Builds a DataObject from one media row of the archive, stores it through
  * DataObject::find_and_compare(), and links it to the taxa, info item,
  * audience and agents referenced by the row.
  *
  * The row is skipped (returns false) when the validator flagged its line,
  * when none of its taxon IDs were inserted earlier, when it is a Flickr
  * image that is_this_flickr_image_in_inaturalist() reports as present,
  * when its identifier was already inserted, when it has no valid license
  * or no resolvable data type, or when a required field for its data type
  * is missing.
  *
  * @param array $row        Row values keyed by term URI.
  * @param array $parameters Reads 'archive_table_definition' (its ->location)
  *                          and 'archive_line_number'.
  * @return false|null false when the row is skipped; no value (null) once
  *                    the row has been processed.
  */
 public function insert_data_object($row, $parameters)
 {
     self::debug_iterations("Inserting DataObject");
     $this->commit_iterations("DataObject", 20);
     if ($this->archive_validator->has_error_by_line('http://eol.org/schema/media/document', $parameters['archive_table_definition']->location, $parameters['archive_line_number'])) {
         write_to_resource_harvesting_log("ERROR: insert_data_object: has_error_by_line" . ",file_location:" . $parameters['archive_table_definition']->location . ",line_number:" . $parameters['archive_line_number']);
         return false;
     }

     // Collect the stored info of every taxon this row refers to; a row
     // without any known taxon is not inserted.
     $object_taxon_ids = self::get_foreign_keys_from_row($row, 'http://rs.tdwg.org/dwc/terms/taxonID');
     $object_taxon_info = array();
     if ($object_taxon_ids) {
         foreach ($object_taxon_ids as $taxon_id) {
             if ($taxon_info = @$this->taxon_ids_inserted[$taxon_id]) {
                 self::uncompress_array($taxon_info);
                 $object_taxon_info[] = $taxon_info;
             }
         }
     }
     if (!$object_taxon_info) {
         return false;
     }
     if ($this->harvest_event->resource->is_eol_flickr_group() && self::is_this_flickr_image_in_inaturalist($row)) {
         return false;
     }

     $data_object = new DataObject();
     $data_object->identifier = @self::field_decode($row['http://purl.org/dc/terms/identifier']);
     // Each identifier is inserted at most once per run.
     if (isset($this->media_ids_inserted[$data_object->identifier])) {
         return false;
     }
     $data_object->data_type = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://purl.org/dc/terms/type']));
     if ($dt = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://rs.tdwg.org/audubon_core/subtype']))) {
         $data_object->data_subtype_id = $dt->id;
     }
     $data_object->mime_type = MimeType::find_or_create_by_translated_label(@self::field_decode($row['http://purl.org/dc/terms/format']));
     $data_object->object_created_at = @self::field_decode($row['http://ns.adobe.com/xap/1.0/CreateDate']);
     $data_object->object_modified_at = @self::field_decode($row['http://purl.org/dc/terms/modified']);
     $data_object->available_at = @self::field_decode($row['http://purl.org/dc/terms/available']);
     $data_object->object_title = @self::field_decode($row['http://purl.org/dc/terms/title']);
     $data_object->language = Language::find_or_create_for_parser(@self::field_decode($row['http://purl.org/dc/terms/language']));

     // check multiple fields for a value of license
     if (isset($row['http://purl.org/dc/terms/license'])) {
         $license_string = @self::field_decode($row['http://purl.org/dc/terms/license']);
     } else {
         $license_string = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/UsageTerms']);
     }
     // convert British licences to American licenses
     $license_string = str_replace("creativecommons.org/licences/", "creativecommons.org/licenses/", $license_string);
     // Fall back to the license of the resource when the row has none.
     if (!$license_string && $this->harvest_event->resource->license && $this->harvest_event->resource->license->source_url) {
         $license_string = $this->harvest_event->resource->license->source_url;
     }
     if (!$license_string || !\eol_schema\MediaResource::valid_license($license_string)) {
         return false;
     }
     $data_object->license = License::find_or_create_for_parser($license_string);
     $data_object->rights_statement = @self::field_decode($row['http://purl.org/dc/terms/rights']);
     $data_object->rights_holder = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/Owner']);
     $data_object->bibliographic_citation = @self::field_decode($row['http://purl.org/dc/terms/bibliographicCitation']);
     $data_object->source_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/furtherInformationURL']);
     $data_object->derived_from = @self::field_decode($row['http://rs.tdwg.org/ac/terms/derivedFrom']);
     $data_object->description = @self::field_decode($row['http://purl.org/dc/terms/description']);
     // Turn newlines into paragraphs
     $data_object->description = str_replace("\n", "</p><p>", $data_object->description);
     $data_object->object_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/accessURI']);
     $data_object->thumbnail_url = @self::field_decode($row['http://eol.org/schema/media/thumbnailURL']);
     $data_object->location = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/LocationCreated']);
     $data_object->spatial_location = @self::field_decode($row['http://purl.org/dc/terms/spatial']);
     $data_object->latitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#lat']);
     $data_object->longitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#long']);
     $data_object->altitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#alt']);

     $rating = @self::field_decode($row['http://ns.adobe.com/xap/1.0/Rating']);
     // ratings may be 0 to 5
     // TODO: technically 0 means untrusted, and then anywhere from 1-5 is OK.
     // 0.5 for example isn't really valid according to the schema
     if (is_numeric($rating) && $rating > 0 && $rating <= 5) {
         $data_object->data_rating = $rating;
     }
     //TODO - update this
     if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
         $data_object->data_type = DataType::youtube();
         $data_object->data_type_id = DataType::youtube()->id;
     }

     // Take the first available source_url of one of this object's taxa.
     // (The previous guard tested an undefined $taxon_parameters variable,
     // so this fallback could never run.)
     if (!@$data_object->source_url) {
         foreach ($object_taxon_info as $taxon_info) {
             if (!empty($taxon_info['source_url'])) {
                 $data_object->source_url = $taxon_info['source_url'];
                 break;
             }
         }
     }

     /* Checking requirements */
     // The subtype lookup above treats find_or_create_by_schema_value() as
     // able to return a falsy value; without a data type the checks below
     // would call equals() on a non-object, so skip the row instead.
     if (!$data_object->data_type) {
         return false;
     }
     // if text: must have description
     if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
         return false;
     }
     // if image, movie or sound: must have object_url
     if (($data_object->data_type->equals(DataType::video()) || $data_object->data_type->equals(DataType::sound()) || $data_object->data_type->equals(DataType::image())) && !$data_object->object_url) {
         return false;
     }

     /* ADDING THE DATA OBJECT */
     list($data_object, $status) = DataObject::find_and_compare($this->harvest_event->resource, $data_object, $this->content_manager);
     if (@(!$data_object->id)) {
         return false;
     }
     $this->media_ids_inserted[$data_object->identifier] = $data_object->id;
     $this->harvest_event->add_data_object($data_object, $status);

     // Replace the object's taxon links with the ones from this row.
     $data_object->delete_hierarchy_entries();
     $vetted_id = Vetted::unknown()->id;
     $visibility_id = Visibility::preview()->id;
     foreach ($object_taxon_info as $taxon_info) {
         $he_id = $taxon_info['hierarchy_entry_id'];
         $tc_id = $taxon_info['taxon_concept_id'];
         $this->mysqli->insert("INSERT IGNORE INTO data_objects_hierarchy_entries (hierarchy_entry_id, data_object_id, vetted_id, visibility_id) VALUES ({$he_id}, {$data_object->id}, {$vetted_id}, {$visibility_id})");
         $this->mysqli->insert("INSERT IGNORE INTO data_objects_taxon_concepts (taxon_concept_id, data_object_id) VALUES ({$tc_id}, {$data_object->id})");
     }

     // a few things to add after the DataObject is inserted
     // keep track of reference foreign keys
     self::append_foreign_keys_from_row($row, 'http://eol.org/schema/reference/referenceID', $this->media_reference_ids, $data_object->id, $data_object->guid);
     // keep track of agent foreign keys
     self::append_foreign_keys_from_row($row, 'http://eol.org/schema/agent/agentID', $this->media_agent_ids, $data_object->id);

     $data_object->delete_info_items();
     $data_object->delete_table_of_contents();
     if ($subject = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm'])) {
         $info_item = InfoItem::find_or_create_by_schema_value($subject);
         $data_object->add_info_item($info_item->id);
         unset($info_item);
     }
     if ($audience_label = @self::field_decode($row['http://purl.org/dc/terms/audience'])) {
         $audience = Audience::find_or_create_by_translated_label(trim((string) $audience_label));
         $data_object->add_audience($audience->id);
         unset($audience);
     }

     // Gather creator/publisher/contributor agents from the row, then
     // replace the object's agents with them in that order.
     $data_object_parameters = array("agents" => array());
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/creator', 'Creator');
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/publisher', 'Publisher');
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/contributor', 'Contributor');
     $data_object->delete_agents();
     $position = 0;
     foreach ($data_object_parameters['agents'] as &$agent_parameters) {
         $agent = Agent::find_or_create($agent_parameters);
         // Cache the agent's logo the first time it is seen with one.
         if ($agent->logo_url && !$agent->logo_cache_url) {
             if ($logo_cache_url = $this->content_manager->grab_file($agent->logo_url, "partner")) {
                 $agent->logo_cache_url = $logo_cache_url;
                 $agent->save();
             }
         }
         $data_object->add_agent($agent->id, @$agent_parameters['agent_role']->id ?: 0, $position);
         $position++;
     }
     // Break the reference left behind by the by-reference foreach.
     unset($agent_parameters);

     // References of an object are cleared only once per run.
     if (!isset($this->object_references_deleted[$data_object->id])) {
         $data_object->delete_refs();
         $this->object_references_deleted[$data_object->id] = true;
     }

     // add data object info to resource contribution
     if ($status != "Unchanged") {
         $result = $this->mysqli->query("SELECT id, source_url, taxon_concept_id, hierarchy_id, identifier FROM hierarchy_entries inner join  data_objects_hierarchy_entries on hierarchy_entries.id = data_objects_hierarchy_entries.hierarchy_entry_id where data_object_id =" . $data_object->id);
         // Only record a contribution when a linked hierarchy entry was
         // found; otherwise the INSERT would be built from undefined values.
         // The fetched row gets its own variable so $row is not clobbered.
         if ($result && ($entry = $result->fetch_assoc())) {
             $hierarchy_entry_id = $entry["id"];
             $source = "'" . $this->get_hierarchy_entry_outlink($entry["hierarchy_id"], $entry["identifier"], preg_replace('/\'/', "\\'", $entry["source_url"])) . "'";
             // Escape quotes/backslashes so the identifier cannot break the
             // quoted SQL literal.
             $identifier = "'" . addslashes((string) $entry["identifier"]) . "'";
             $taxon_concept_id = $entry["taxon_concept_id"];
             $resource_id = $this->harvest_event->resource_id;
             $this->mysqli->insert("INSERT IGNORE INTO resource_contributions (resource_id, data_object_id, data_point_uri_id, hierarchy_entry_id, taxon_concept_id, source, object_type, identifier, data_object_type) VALUES ({$resource_id}, {$data_object->id}, NULL, {$hierarchy_entry_id}, {$taxon_concept_id}, {$source}, 'data_object', {$identifier}, {$data_object->data_type_id})");
         }
     }
 }
 /**
  * Validates one archive row with the schema class matching its row type,
  * updates the per-row-type statistics for rows without errors, and files
  * every returned exception under its row type, file location and line.
  *
  * @param array $row        Row values keyed by term URI.
  * @param array $parameters Reads 'row_type', 'archive_table_definition'
  *                          (its ->location) and 'archive_line_number'.
  * @return void
  */
 public function validate_row($row, $parameters)
 {
     // Periodic progress output, every 10000 validated rows in debug mode.
     static $rows_validated = 0;
     $rows_validated++;
     if ($rows_validated % 10000 == 0 && $GLOBALS['ENV_DEBUG']) {
         echo "{$rows_validated}: " . time_elapsed() . " :: " . memory_get_usage() . "\n";
         write_to_resource_harvesting_log($rows_validated . ": " . time_elapsed() . "::" . memory_get_usage());
     }

     $file_location = $parameters['archive_table_definition']->location;
     $row_type = $parameters['row_type'];
     $new_exceptions = array();

     switch ($row_type) {
         case 'http://eol.org/schema/media/document':
             $license_field = 'http://purl.org/dc/terms/license';
             $usage_terms_field = 'http://ns.adobe.com/xap/1.0/rights/UsageTerms';
             // A row without any license value takes the license of the
             // archive resource, when there is one, before validation.
             $row_has_license = @$row[$license_field] || @$row[$usage_terms_field];
             if (!$row_has_license && $this->archive_resource && $this->archive_resource->license && $this->archive_resource->license->source_url) {
                 $row[$usage_terms_field] = $this->archive_resource->license->source_url;
                 unset($row[$license_field]);
             }
             $new_exceptions = \eol_schema\MediaResource::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);
             if (!self::any_exceptions_of_type_error($new_exceptions)) {
                 // Stat name => row field whose value is counted.
                 $stat_fields = array(
                     'type' => 'http://purl.org/dc/terms/type',
                     'subtype' => 'http://rs.tdwg.org/audubon_core/subtype',
                     'license' => $usage_terms_field,
                     'subject' => 'http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm',
                     'language' => 'http://purl.org/dc/terms/language',
                     'format' => 'http://purl.org/dc/terms/format',
                 );
                 foreach ($stat_fields as $stat_name => $field_uri) {
                     if (@($value = $row[$field_uri])) {
                         $this->add_stat($stat_name, $row_type, $file_location, $value);
                     }
                 }
             }
             break;

         case 'http://rs.tdwg.org/dwc/terms/taxon':
             $new_exceptions = \eol_schema\Taxon::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://rs.tdwg.org/dwc/terms/taxonID', $parameters, $new_exceptions);
             break;

         case 'http://rs.gbif.org/terms/1.0/vernacularname':
             $new_exceptions = \eol_schema\VernacularName::validate_by_hash($row, $this->skip_warnings);
             if (!self::any_exceptions_of_type_error($new_exceptions)) {
                 if (@($value = $row['http://purl.org/dc/terms/language'])) {
                     $this->add_stat('language', $row_type, $file_location, $value);
                 }
             }
             break;

         case 'http://eol.org/schema/reference/reference':
             $new_exceptions = \eol_schema\Reference::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);
             break;

         case 'http://eol.org/schema/agent/agent':
             $new_exceptions = \eol_schema\Agent::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://purl.org/dc/terms/identifier', $parameters, $new_exceptions);
             break;

         case 'http://rs.tdwg.org/dwc/terms/measurementorfact':
             $new_exceptions = \eol_schema\MeasurementOrFact::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://rs.tdwg.org/dwc/terms/measurementID', $parameters, $new_exceptions);
             break;

         case 'http://eol.org/schema/association':
             $new_exceptions = \eol_schema\Association::validate_by_hash($row, $this->skip_warnings);
             $this->append_identifier_error($row, 'http://eol.org/schema/associationID', $parameters, $new_exceptions);
             break;
     }

     // Rows without errors count towards the total of their row type.
     if (!self::any_exceptions_of_type_error($new_exceptions)) {
         if (!isset($this->stats[$row_type]['Total'])) {
             if (!isset($this->stats[$row_type])) {
                 $this->stats[$row_type] = array();
             }
             $this->stats[$row_type]['Total'] = 0;
         }
         $this->stats[$row_type]['Total']++;
     }

     if (!$new_exceptions) {
         return;
     }
     foreach ($new_exceptions as $exception) {
         // Stamp each exception with where it occurred.
         $exception->file = $parameters['archive_table_definition']->location;
         $exception->line = $parameters['archive_line_number'];

         if (get_class($exception) == 'eol_schema\\ContentArchiveError') {
             if (!isset($this->errors_by_line[$row_type][$file_location][$exception->line])) {
                 $this->errors_by_line[$row_type][$file_location][$exception->line] = array();
             }
             $this->errors_by_line[$row_type][$file_location][$exception->line][] = $exception;
             continue;
         }

         // Everything else is kept as a warning unless warnings are skipped.
         if ($this->skip_warnings) {
             continue;
         }
         if (!isset($this->warnings_by_line[$row_type][$file_location][$exception->line])) {
             $this->warnings_by_line[$row_type][$file_location][$exception->line] = array();
         }
         $this->warnings_by_line[$row_type][$file_location][$exception->line][] = $exception;
     }
 }