Example #1
0
 /**
  * Runs the whole top-images rebuild from start to finish.
  *
  * In order: makes sure the two scratch tables (top_images_tmp and
  * top_unpublished_images_tmp) exist and are empty, loads the candidate image
  * rows from the slave connection into $this->image_data_objects, and then
  * calls the lookup / insert / parent-processing / finalisation methods.
  * Progress, memory usage and elapsed time are echoed along the way.
  */
 public function begin_process()
 {
     // create and flush the temporary tables
     foreach (array('top_images', 'top_unpublished_images') as $table) {
         $this->mysqli->insert("CREATE TABLE IF NOT EXISTS {$table}_tmp LIKE {$table}");
         $this->mysqli->delete("TRUNCATE TABLE {$table}_tmp");
     }
     $this->all_top_images = array();

     // every image object linked to a concept, together with its rating,
     // visibility and vetted state
     $image_type_id = DataType::image()->id;
     $visible_id = Visibility::visible()->id;
     $query = "SELECT DISTINCT dotc.data_object_id, do.data_rating, dohe.visibility_id, dohe.vetted_id\r\n"
         . "            FROM data_objects_taxon_concepts dotc\r\n"
         . "            JOIN data_objects do ON (dotc.data_object_id=do.id)\r\n"
         . "            JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)\r\n"
         . "            WHERE do.data_type_id=" . $image_type_id . "\r\n"
         . "            AND (do.published=1 OR dohe.visibility_id!=" . $visible_id . ")";
     //AND do.id BETWEEN 11407274 AND 11507274

     $kept = 0;
     $this->image_data_objects = array();
     foreach ($this->mysqli_slave->iterate_file($query) as $fields) {
         $object_id = $fields[0];
         $rating = $fields[1];
         $visibility = $fields[2];
         $vetted = $fields[3];
         // rows whose vetted id has no (truthy) sort order are ignored
         $sort_order = @$this->vetted_sort_orders[$vetted];
         if ($sort_order) {
             $this->image_data_objects[$object_id] = array(
                 'data_rating' => $rating,
                 'visibility_id' => $visibility,
                 'vetted_view_order' => $sort_order);
             // progress line every 10000 kept rows (skipped rows are not counted)
             if ($kept % 10000 == 0) {
                 echo "{$kept} " . memory_get_usage() . " " . time_elapsed() . "\n";
             }
             $kept++;
         }
     }

     // Remaining stages, run in this order; each one is announced with its
     // name, the current memory usage and the elapsed time:
     //   lookup_baseline_image_concepts / lookup_hierarchy_entry_ids
     //   insert_baseline_objects - add the objects DIRECTLY linked to concepts
     //   start_process_parents   - search the parents of these concepts
     //   end_load_data           - finalize, clean up, move temp tables to real tables
     $stages = array(
         'lookup_baseline_image_concepts',
         'lookup_hierarchy_entry_ids',
         'insert_baseline_objects',
         'start_process_parents',
         'end_load_data');
     foreach ($stages as $stage) {
         echo "{$stage} " . memory_get_usage() . " " . time_elapsed() . "\n";
         $this->{$stage}();
     }
 }
 /**
  * Builds a DataObject from one row of an archive media table and stores it.
  *
  * The row is skipped (false is returned) when the archive validator flagged
  * the line, when none of the row's taxon IDs were inserted earlier, when the
  * identifier was already handled, when no valid license can be determined,
  * or when the type-specific required field (description for text, object_url
  * for image/video/sound) is missing. Otherwise the object is passed to
  * DataObject::find_and_compare(), linked to its hierarchy entries and taxon
  * concepts, and its info items, audience, agents and references bookkeeping
  * are refreshed. For objects whose status is not "Unchanged" a row is added
  * to resource_contributions.
  *
  * @param array $row        Archive row keyed by term URI.
  * @param array $parameters Reads 'archive_table_definition' (its ->location)
  *                          and 'archive_line_number'.
  * @return false|null false when the row is skipped; no explicit value otherwise.
  */
 public function insert_data_object($row, $parameters)
 {
     self::debug_iterations("Inserting DataObject");
     $this->commit_iterations("DataObject", 20);
     // lines the validator already reported as erroneous are not imported
     if ($this->archive_validator->has_error_by_line('http://eol.org/schema/media/document', $parameters['archive_table_definition']->location, $parameters['archive_line_number'])) {
         write_to_resource_harvesting_log("ERROR: insert_data_object: has_error_by_line" . ",file_location:" . $parameters['archive_table_definition']->location . ",line_number:" . $parameters['archive_line_number']);
         return false;
     }
     // resolve the taxa this object refers to; only taxa inserted earlier count
     $object_taxon_ids = self::get_foreign_keys_from_row($row, 'http://rs.tdwg.org/dwc/terms/taxonID');
     $object_taxon_info = array();
     if ($object_taxon_ids) {
         foreach ($object_taxon_ids as $taxon_id) {
             if ($taxon_info = @$this->taxon_ids_inserted[$taxon_id]) {
                 self::uncompress_array($taxon_info);
                 $object_taxon_info[] = $taxon_info;
             }
         }
     }
     if (!$object_taxon_info) {
         return false;
     }
     if ($this->harvest_event->resource->is_eol_flickr_group() && self::is_this_flickr_image_in_inaturalist($row)) {
         return false;
     }
     $data_object = new DataObject();
     $data_object->identifier = @self::field_decode($row['http://purl.org/dc/terms/identifier']);
     // each identifier is imported at most once per run
     if (isset($this->media_ids_inserted[$data_object->identifier])) {
         return false;
     }
     $data_object->data_type = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://purl.org/dc/terms/type']));
     if ($dt = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://rs.tdwg.org/audubon_core/subtype']))) {
         $data_object->data_subtype_id = $dt->id;
     }
     $data_object->mime_type = MimeType::find_or_create_by_translated_label(@self::field_decode($row['http://purl.org/dc/terms/format']));
     $data_object->object_created_at = @self::field_decode($row['http://ns.adobe.com/xap/1.0/CreateDate']);
     $data_object->object_modified_at = @self::field_decode($row['http://purl.org/dc/terms/modified']);
     $data_object->available_at = @self::field_decode($row['http://purl.org/dc/terms/available']);
     $data_object->object_title = @self::field_decode($row['http://purl.org/dc/terms/title']);
     $data_object->language = Language::find_or_create_for_parser(@self::field_decode($row['http://purl.org/dc/terms/language']));
     // check multiple fields for a value of license
     if (isset($row['http://purl.org/dc/terms/license'])) {
         $license_string = @self::field_decode($row['http://purl.org/dc/terms/license']);
     } else {
         $license_string = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/UsageTerms']);
     }
     // convert British licences to American licenses
     $license_string = str_replace("creativecommons.org/licences/", "creativecommons.org/licenses/", $license_string);
     // fall back to the resource-level license when the row has none
     if (!$license_string && $this->harvest_event->resource->license && $this->harvest_event->resource->license->source_url) {
         $license_string = $this->harvest_event->resource->license->source_url;
     }
     if (!$license_string || !\eol_schema\MediaResource::valid_license($license_string)) {
         return false;
     }
     $data_object->license = License::find_or_create_for_parser($license_string);
     $data_object->rights_statement = @self::field_decode($row['http://purl.org/dc/terms/rights']);
     $data_object->rights_holder = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/Owner']);
     $data_object->bibliographic_citation = @self::field_decode($row['http://purl.org/dc/terms/bibliographicCitation']);
     $data_object->source_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/furtherInformationURL']);
     $data_object->derived_from = @self::field_decode($row['http://rs.tdwg.org/ac/terms/derivedFrom']);
     $data_object->description = @self::field_decode($row['http://purl.org/dc/terms/description']);
     // Turn newlines into paragraphs
     $data_object->description = str_replace("\n", "</p><p>", $data_object->description);
     $data_object->object_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/accessURI']);
     $data_object->thumbnail_url = @self::field_decode($row['http://eol.org/schema/media/thumbnailURL']);
     $data_object->location = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/LocationCreated']);
     $data_object->spatial_location = @self::field_decode($row['http://purl.org/dc/terms/spatial']);
     $data_object->latitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#lat']);
     $data_object->longitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#long']);
     $data_object->altitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#alt']);
     $rating = @self::field_decode($row['http://ns.adobe.com/xap/1.0/Rating']);
     // ratings may be 0 to 5
     // TODO: technically 0 means untrusted, and then anywhere from 1-5 is OK.
     // 0.5 for example isn't really valid according to the schema
     if (is_numeric($rating) && $rating > 0 && $rating <= 5) {
         $data_object->data_rating = $rating;
     }
     //TODO - update this
     if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
         $data_object->data_type = DataType::youtube();
         $data_object->data_type_id = DataType::youtube()->id;
     }
     // take the first available source_url of one of this object's taxa.
     // (The previous guard also tested $taxon_parameters, a variable that does
     // not exist in this method, so this fallback could never run.)
     if (!@$data_object->source_url) {
         foreach ($object_taxon_info as $taxon_info) {
             if ($source_url = @$taxon_info['source_url']) {
                 $data_object->source_url = $source_url;
                 break;
             }
         }
     }
     /* Checking requirements */
     // if text: must have description
     if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
         return false;
     }
     // if image, movie or sound: must have object_url
     if (($data_object->data_type->equals(DataType::video()) || $data_object->data_type->equals(DataType::sound()) || $data_object->data_type->equals(DataType::image())) && !$data_object->object_url) {
         return false;
     }
     /* ADDING THE DATA OBJECT */
     list($data_object, $status) = DataObject::find_and_compare($this->harvest_event->resource, $data_object, $this->content_manager);
     if (@(!$data_object->id)) {
         return false;
     }
     $this->media_ids_inserted[$data_object->identifier] = $data_object->id;
     $this->harvest_event->add_data_object($data_object, $status);
     // re-link the object to every hierarchy entry / concept of its taxa
     $data_object->delete_hierarchy_entries();
     $vetted_id = Vetted::unknown()->id;
     $visibility_id = Visibility::preview()->id;
     foreach ($object_taxon_info as $taxon_info) {
         $he_id = $taxon_info['hierarchy_entry_id'];
         $tc_id = $taxon_info['taxon_concept_id'];
         $this->mysqli->insert("INSERT IGNORE INTO data_objects_hierarchy_entries (hierarchy_entry_id, data_object_id, vetted_id, visibility_id) VALUES ({$he_id}, {$data_object->id}, {$vetted_id}, {$visibility_id})");
         $this->mysqli->insert("INSERT IGNORE INTO data_objects_taxon_concepts (taxon_concept_id, data_object_id) VALUES ({$tc_id}, {$data_object->id})");
     }
     // a few things to add after the DataObject is inserted
     // keep track of reference foreign keys
     self::append_foreign_keys_from_row($row, 'http://eol.org/schema/reference/referenceID', $this->media_reference_ids, $data_object->id, $data_object->guid);
     // keep track of agent foreign keys
     self::append_foreign_keys_from_row($row, 'http://eol.org/schema/agent/agentID', $this->media_agent_ids, $data_object->id);
     $data_object->delete_info_items();
     $data_object->delete_table_of_contents();
     if ($s = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm'])) {
         $ii = InfoItem::find_or_create_by_schema_value($s);
         $data_object->add_info_item($ii->id);
         unset($ii);
     }
     if ($audience_label = @self::field_decode($row['http://purl.org/dc/terms/audience'])) {
         $audience = Audience::find_or_create_by_translated_label(trim((string) $audience_label));
         $data_object->add_audience($audience->id);
         unset($audience);
     }
     // collect creator / publisher / contributor agents from the row, then
     // replace the object's agent list with them (position = $i)
     $data_object_parameters = array("agents" => array());
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/creator', 'Creator');
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/publisher', 'Publisher');
     self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/contributor', 'Contributor');
     $data_object->delete_agents();
     $i = 0;
     foreach ($data_object_parameters['agents'] as $agent_parameters) {
         $agent = Agent::find_or_create($agent_parameters);
         // cache the agent's logo the first time it is seen
         if ($agent->logo_url && !$agent->logo_cache_url) {
             if ($logo_cache_url = $this->content_manager->grab_file($agent->logo_url, "partner")) {
                 $agent->logo_cache_url = $logo_cache_url;
                 $agent->save();
             }
         }
         $data_object->add_agent($agent->id, @$agent_parameters['agent_role']->id ?: 0, $i);
         $i++;
     }
     // references are cleared once per object per run
     if (!isset($this->object_references_deleted[$data_object->id])) {
         $data_object->delete_refs();
         $this->object_references_deleted[$data_object->id] = true;
     }
     // add data object info to resource contribution
     if ($status != "Unchanged") {
         $result = $this->mysqli->query("SELECT id, source_url, taxon_concept_id, hierarchy_id, identifier FROM hierarchy_entries inner join  data_objects_hierarchy_entries on hierarchy_entries.id = data_objects_hierarchy_entries.hierarchy_entry_id where data_object_id =" . $data_object->id);
         // only the first matching hierarchy entry is used
         $entry = $result ? $result->fetch_assoc() : null;
         if ($entry) {
             $hierarchy_entry_id = $entry["id"];
             $source = "'" . $this->get_hierarchy_entry_outlink($entry["hierarchy_id"], $entry["identifier"], preg_replace('/\'/', "\\'", $entry["source_url"])) . "'";
             // NOTE(review): the identifier is interpolated without escaping;
             // a value containing a quote would break this statement - confirm
             // whether the mysqli wrapper offers an escaping helper.
             $identifier = "'" . $entry["identifier"] . "'";
             $taxon_concept_id = $entry["taxon_concept_id"];
             $resource_id = $this->harvest_event->resource_id;
             $this->mysqli->insert("INSERT IGNORE INTO resource_contributions (resource_id, data_object_id, data_point_uri_id, hierarchy_entry_id, taxon_concept_id, source, object_type, identifier, data_object_type) VALUES ({$resource_id}, {$data_object->id}, NULL, {$hierarchy_entry_id}, {$taxon_concept_id}, {$source}, 'data_object', {$identifier}, {$data_object->data_type_id})");
         } else {
             // Without a hierarchy entry the INSERT would be built from
             // undefined values and be malformed, so it is skipped.
             write_to_resource_harvesting_log("ERROR: insert_data_object: no hierarchy entry found for resource contribution" . ",data_object_id:" . $data_object->id);
         }
     }
 }
Example #3
0
 /**
  * Translates one taxon XML element into the array of taxon parameters.
  *
  * The returned array carries the identifier, source URL, the classification
  * ranks, the scientific name and its Name record, rank, timestamps, and the
  * lists "common_names", "synonyms", "agents", "refs" and "data_objects".
  * Each entry of "data_objects" is array(DataObject, parameters) where the
  * parameters hold "agents", "audiences", "info_items" and "refs".
  *
  * When the element has no scientific name the lowest populated rank (genus,
  * family, order, class, phylum, kingdom - tried in that order) is promoted
  * to be the name and blanked in its own slot; when none is populated the
  * method returns without a value.
  *
  * @param SimpleXMLElement $t        The taxon element.
  * @param object           $resource Resource being read; its ->language and
  *                                   ->title are consulted.
  * @return array|null
  */
 public static function read_taxon_xml($t, $resource)
 {
     // Shared reader for <agent> children (used for the taxon and for each data object).
     $read_agents = function ($agent_nodes) {
         $agents = array();
         foreach ($agent_nodes as $agent_node) {
             $agent_name = Functions::import_decode((string) $agent_node);
             if (!$agent_name) {
                 continue;
             }
             $agent_attributes = $agent_node->attributes();
             $agents[] = array(
                 "full_name" => Functions::import_decode((string) $agent_node, 0, 0),
                 "homepage" => @Functions::import_decode($agent_attributes["homepage"]),
                 "logo_url" => @Functions::import_decode($agent_attributes["logoURL"]),
                 "agent_role" => AgentRole::find_or_create_by_translated_label(@trim($agent_attributes["role"])));
         }
         return $agents;
     };
     // Shared reader for <reference> children; also records any identifier attributes.
     $read_references = function ($reference_nodes) {
         $identifier_labels = array("bici", "coden", "doi", "eissn", "handle", "issn", "isbn", "lsid", "oclc", "sici", "url", "urn");
         $references = array();
         foreach ($reference_nodes as $reference_node) {
             $full_reference = Functions::import_decode((string) $reference_node, 0, 0);
             if (!$full_reference) {
                 continue;
             }
             $reference = Reference::find_or_create_by_full_reference($full_reference);
             $references[] = $reference;
             $reference_attributes = $reference_node->attributes();
             foreach ($identifier_labels as $label) {
                 $identifier = @Functions::import_decode($reference_attributes[$label], 0, 0);
                 if ($identifier) {
                     $identifier_type = RefIdentifierType::find_or_create_by_label($label);
                     $reference->add_ref_identifier(@$identifier_type->id ?: 0, $identifier);
                 }
             }
         }
         return $references;
     };

     // namespaced views of the taxon's children
     $t_dc = $t->children("http://purl.org/dc/elements/1.1/");
     $t_dcterms = $t->children("http://purl.org/dc/terms/");
     $t_dwc = $t->children("http://rs.tdwg.org/dwc/dwcore/");

     $taxon_parameters = array(
         "identifier" => Functions::import_decode($t_dc->identifier),
         "source_url" => Functions::import_decode($t_dc->source),
         "kingdom" => Functions::import_decode($t_dwc->Kingdom),
         "phylum" => Functions::import_decode($t_dwc->Phylum),
         "class" => Functions::import_decode($t_dwc->Class),
         "order" => Functions::import_decode($t_dwc->Order),
         "family" => Functions::import_decode($t_dwc->Family),
         "genus" => Functions::import_decode($t_dwc->Genus),
         "scientific_name" => Functions::import_decode($t_dwc->ScientificName),
         "rank" => Rank::find_or_create_by_translated_label(Functions::import_decode($t->rank)),
         "taxon_created_at" => trim($t_dcterms->created),
         "taxon_modified_at" => trim($t_dcterms->modified));

     if ($taxon_parameters["scientific_name"]) {
         $taxon_parameters["name"] = Name::find_or_create_by_string($taxon_parameters["scientific_name"]);
     } else {
         // no scientific name: promote the lowest rank that has a value
         $promoted = false;
         foreach (array("genus", "family", "order", "class", "phylum", "kingdom") as $rank_key) {
             $rank_value = $taxon_parameters[$rank_key];
             if (!$rank_value) {
                 continue;
             }
             $taxon_parameters["scientific_name"] = $rank_value;
             $taxon_parameters["name"] = Name::find_or_create_by_string($rank_value);
             $taxon_parameters[$rank_key] = "";
             $promoted = true;
             break;
         }
         if (!$promoted) {
             return;
         }
     }

     $taxon_parameters["common_names"] = array();
     foreach ($t->commonName as $common_name_node) {
         $common_name = Functions::import_decode((string) $common_name_node);
         if ($common_name) {
             $xml_attributes = $common_name_node->attributes("http://www.w3.org/XML/1998/namespace");
             $taxon_parameters["common_names"][] = array(
                 "name" => $common_name,
                 "language" => Language::find_or_create_for_parser(@Functions::import_decode($xml_attributes["lang"])));
         }
     }

     $taxon_parameters["synonyms"] = array();
     foreach ($t->synonym as $synonym_node) {
         $synonym = Functions::import_decode((string) $synonym_node);
         if ($synonym) {
             $synonym_attributes = $synonym_node->attributes();
             // a synonym without an explicit relationship is a plain 'synonym'
             if (!@$synonym_attributes["relationship"]) {
                 $synonym_attributes["relationship"] = 'synonym';
             }
             $taxon_parameters["synonyms"][] = array(
                 "name" => Name::find_or_create_by_string($synonym),
                 "synonym_relation" => SynonymRelation::find_or_create_by_translated_label(trim($synonym_attributes["relationship"])));
         }
     }

     $taxon_parameters["agents"] = $read_agents($t->agent);
     $taxon_parameters["refs"] = $read_references($t->reference);

     $taxon_parameters["data_objects"] = array();
     foreach ($t->dataObject as $d) {
         $d_dc = $d->children("http://purl.org/dc/elements/1.1/");
         $d_dcterms = $d->children("http://purl.org/dc/terms/");
         $d_geo = $d->children("http://www.w3.org/2003/01/geo/wgs84_pos#");

         $data_object = new DataObject();
         $data_object->identifier = Functions::import_decode($d_dc->identifier);
         $data_object->data_type = DataType::find_or_create_by_schema_value(Functions::import_decode($d->dataType));
         $data_object->mime_type = MimeType::find_or_create_by_translated_label(Functions::import_decode($d->mimeType));
         $data_object->object_created_at = Functions::import_decode($d_dcterms->created);
         $data_object->object_modified_at = Functions::import_decode($d_dcterms->modified);
         $data_object->object_title = Functions::import_decode($d_dc->title, 0, 0);
         $data_object->language = Language::find_or_create_for_parser(Functions::import_decode($d_dc->language));
         $data_object->license = License::find_or_create_for_parser(Functions::import_decode($d->license));
         $data_object->rights_statement = Functions::import_decode($d_dc->rights, 0, 0);
         $data_object->rights_holder = Functions::import_decode($d_dcterms->rightsHolder, 0, 0);
         $data_object->bibliographic_citation = Functions::import_decode($d_dcterms->bibliographicCitation, 0, 0);
         $data_object->source_url = Functions::import_decode($d_dc->source);
         $data_object->description = Functions::import_decode($d_dc->description, 0, 0);
         $data_object->object_url = Functions::import_decode($d->mediaURL);
         $data_object->thumbnail_url = Functions::import_decode($d->thumbnailURL);
         $data_object->location = Functions::import_decode($d->location, 0, 0);

         // optional <additionalInformation>: kept whole, plus rating and subtype
         if (@$d->additionalInformation) {
             $data_object->additional_information = (array) $d->additionalInformation;
         }
         $rating = (string) @$d->additionalInformation->rating;
         if ($rating && is_numeric($rating) && $rating > 0 && $rating <= 5) {
             $data_object->data_rating = $rating;
         }
         if ($subtype = @$d->additionalInformation->subtype) {
             $subtype_data_type = DataType::find_or_create_by_schema_value(Functions::import_decode($subtype));
             if ($subtype_data_type) {
                 $data_object->data_subtype_id = $subtype_data_type->id;
             }
         }

         $data_object_parameters = array();

         // language fallbacks: xml:lang of the description, then the resource's language
         if (!$data_object->language) {
             $description_attributes = $d_dc->description->attributes("http://www.w3.org/XML/1998/namespace");
             $data_object->language = Language::find_or_create_for_parser(@Functions::import_decode($description_attributes["lang"]));
         }
         if (!$data_object->language && $resource->language) {
             $data_object->language = $resource->language;
         }

         //TODO - update this
         if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
             $data_object->data_type = DataType::youtube();
             $data_object->data_type_id = DataType::youtube()->id;
         }

         //take the taxon's source_url if none present
         if (!@$data_object->source_url && @$taxon_parameters["source_url"]) {
             $data_object->source_url = $taxon_parameters["source_url"];
         }

         // Turn newlines into paragraphs
         $data_object->description = str_replace("\n", "</p><p>", $data_object->description);

         /* Checking requirements*/
         //if text: must have description
         if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
             continue;
         }
         //if image, movie or sound: must have object_url
         if (($data_object->data_type->equals(DataType::video()) || $data_object->data_type->equals(DataType::sound()) || $data_object->data_type->equals(DataType::image())) && !$data_object->object_url) {
             continue;
         }

         // coordinates default to 0; if several Points exist the last one wins
         $data_object->latitude = 0;
         $data_object->longitude = 0;
         $data_object->altitude = 0;
         foreach ($d_geo->Point as $point) {
             $point_geo = $point->children("http://www.w3.org/2003/01/geo/wgs84_pos#");
             $data_object->latitude = Functions::import_decode($point_geo->lat);
             $data_object->longitude = Functions::import_decode($point_geo->long);
             $data_object->altitude = Functions::import_decode($point_geo->alt);
         }

         $data_object_parameters["agents"] = $read_agents($d->agent);

         $data_object_parameters["audiences"] = array();
         foreach ($d->audience as $audience_node) {
             $data_object_parameters["audiences"][] = Audience::find_or_create_by_translated_label(trim((string) $audience_node));
         }

         $data_object_parameters["info_items"] = array();
         foreach ($d->subject as $subject_node) {
             $data_object_parameters["info_items"][] = InfoItem::find_or_create_by_schema_value(trim((string) $subject_node));
         }
         // a subject inside additionalInformation replaces the list above
         if ($subject = @$d->additionalInformation->subject) {
             $info_item = InfoItem::find_or_create_by_schema_value(trim((string) $subject));
             if ($info_item) {
                 $data_object_parameters["info_items"] = array($info_item);
             }
         }

         // EXCEPTIONS
         if ($data_object->is_text()) {
             switch ($resource->title) {
                 case "BOLD Systems Resource":
                     // EXCEPTION - overriding the subject for BOLD
                     $data_object_parameters["info_items"] = array(InfoItem::find_or_create_by_schema_value('http://www.eol.org/voc/table_of_contents#Barcode'));
                     break;
                 case "Wikipedia":
                     // EXCEPTION - overriding the subject for Wikipedia
                     $data_object_parameters["info_items"] = array(InfoItem::find_or_create_by_schema_value('http://www.eol.org/voc/table_of_contents#Wikipedia'));
                     break;
                 case "IUCN Red List":
                     if ($data_object->object_title == "IUCNConservationStatus") {
                         // EXCEPTION - overriding the data type for IUCN text
                         $data_object->data_type_id = DataType::iucn()->id;
                         $data_object->data_type = DataType::iucn();
                     }
                     break;
             }
         }

         $data_object_parameters["refs"] = $read_references($d->reference);

         $taxon_parameters["data_objects"][] = array($data_object, $data_object_parameters);
         unset($data_object);
     }
     return $taxon_parameters;
 }
 /**
  * Counts the published, visible, non-image data objects of every taxon
  * concept, batch by batch, and hands each batch to save_category_stats().
  *
  * For each concept and data type (text, video, sound, flash, youtube, iucn)
  * eight values are collected: number of objects in total / trusted /
  * untrusted / unreviewed, and the word counts of their descriptions in the
  * same four groups. Each concept's values are flattened into one
  * tab-separated string of 6 x 8 fields, data types in the order of
  * $data_type_order_in_file; values that were never counted are left empty.
  *
  * A data object is counted once per concept even when the joins return it
  * on several rows.
  *
  * @param int $batch_size Number of taxon concept ids covered per query.
  */
 function get_data_objects_count($batch_size = 100000)
 {
     $image_id = DataType::image()->id;
     $text_id = DataType::text()->id;
     $video_id = DataType::video()->id;
     $sound_id = DataType::sound()->id;
     $flash_id = DataType::flash()->id;
     $youtube_id = DataType::youtube()->id;
     $iucn_id = DataType::iucn()->id;
     $data_type_label = array();
     $data_type_label[$text_id] = 'text';
     $data_type_label[$video_id] = 'video';
     $data_type_label[$sound_id] = 'sound';
     $data_type_label[$flash_id] = 'flash';
     $data_type_label[$youtube_id] = 'youtube';
     $data_type_label[$iucn_id] = 'iucn';
     $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
     // per data type, the fields are written in this order
     $stat_keys = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');
     $trusted_id = Vetted::trusted()->id;
     $untrusted_id = Vetted::untrusted()->id;
     $unreviewed_id = Vetted::unknown()->id;
     $raw_stats = array();
     // only consumed by the (currently disabled) JSON export at the end
     $concept_info_items = array();
     $concept_references = array();
     // the part of the query that does not depend on the batch
     $base_sql = "SELECT  do.guid,\n                            dotc.taxon_concept_id,\n                            do.data_type_id,\n                            doii.info_item_id,\n                            dor.ref_id,\n                            REPLACE(REPLACE(do.description, '\\\\n', ' '), '\\\\r', ' '),\n                            dohe.vetted_id,\n                            do.id\n            FROM data_objects_taxon_concepts dotc\n            STRAIGHT_JOIN data_objects do ON (dotc.data_object_id = do.id)\n            JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)\n            LEFT JOIN data_objects_info_items doii ON (do.id = doii.data_object_id)\n            LEFT JOIN data_objects_refs dor ON (do.id = dor.data_object_id)\n            WHERE do.published = 1 AND dohe.visibility_id = " . Visibility::visible()->id . " AND do.data_type_id != {$image_id}";
     for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
         $this->print_status($i, $batch_size);
         $sql = $base_sql;
         if ($this->test_taxon_concept_ids) {
             $sql .= " AND dotc.taxon_concept_id IN (" . $this->test_taxon_concept_ids . ")";
         } else {
             // BETWEEN is inclusive on both ends; stop one short of the next
             // batch's first id so no concept is processed twice
             $sql .= " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
         }
         $counted_data_objects = array();
         foreach ($this->mysqli_slave->iterate_file($sql) as $row) {
             // column 0 is do.guid, which is not needed here; the remaining
             // columns follow the SELECT list
             $taxon_concept_id = trim($row[1]);
             $data_type_id = trim($row[2]);
             $info_item_id = trim($row[3]);
             $ref_id = trim($row[4]);
             $description = trim($row[5]);
             $vetted_id = trim($row[6]);
             $data_object_id = trim($row[7]);
             // the LEFT JOINs can repeat an object; count it once per concept
             if (isset($counted_data_objects[$taxon_concept_id][$data_object_id])) {
                 continue;
             }
             $counted_data_objects[$taxon_concept_id][$data_object_id] = 1;
             $label = @$data_type_label[$data_type_id];
             $words_count = str_word_count(strip_tags($description), 0);
             // every object goes into 'total', plus the group of its vetted state
             $groups = array('total');
             if ($vetted_id == $trusted_id) {
                 $groups[] = 't';
             } elseif ($vetted_id == $untrusted_id) {
                 $groups[] = 'ut';
             } elseif ($vetted_id == $unreviewed_id) {
                 $groups[] = 'ur';
             }
             foreach ($groups as $group) {
                 // @: the counters are created on first use
                 @$raw_stats[$taxon_concept_id][$label][$group]++;
                 @($raw_stats[$taxon_concept_id][$label][$group . '_w'] += $words_count);
             }
             $concept_info_items[$taxon_concept_id][$info_item_id] = '';
             $concept_references[$taxon_concept_id][$ref_id] = '';
         }
         foreach ($raw_stats as $taxon_concept_id => $stats) {
             # the stats need to go into the file in a certain order to be imported into the MySQL table
             $fields = array();
             foreach ($data_type_order_in_file as $data_type) {
                 foreach ($stat_keys as $stat_key) {
                     $fields[] = isset($stats[$data_type][$stat_key]) ? $stats[$data_type][$stat_key] : '';
                 }
             }
             // all data types are kept (previously each type overwrote the
             // one before it, leaving only the last)
             $raw_stats[$taxon_concept_id] = implode("\t", $fields);
         }
         $this->save_category_stats($raw_stats, "get_data_objects_count");
         $raw_stats = array();
         // test mode queries the fixed id list once
         if ($this->test_taxon_concept_ids) {
             break;
         }
     }
     // $this->save_to_json_file($concept_info_items, "concept_info_items");
     // unset($concept_info_items);
     //
     // $this->save_to_json_file($concept_references, "concept_references");
     // unset($concept_references);
 }
 /**
  * Tallies published, visible data objects per taxon concept.
  *
  * Walks the taxon concept id range in batches. For every row returned it
  * accumulates, per concept and per data type label, the object count and
  * the word count of the description, both overall and split by vetted
  * status (trusted / untrusted / unreviewed). Rows whose data_subtype_id
  * equals the map data type id are counted separately as maps (no word
  * counts). The info item ids and reference ids seen for each concept are
  * collected as well.
  *
  * Results are handed to save_to_json_file() ("concept_info_items",
  * "concept_references", "map_counts") and
  * save_totals_to_cumulative_txt() ("tpm_data_objects").
  *
  * When $GLOBALS['test_taxon_concept_ids'] is set, only those concepts are
  * queried and the query is executed a single time.
  *
  * @param int $batch_size Number of taxon concept ids covered by one query.
  * @return void
  */
 function get_data_objects_count($batch_size = 100000)
 {
     $time_start = time_elapsed();
     $concept_data_object_counts = array();
     $concept_data_object_maps = array();
     $concept_info_items = array();
     $concept_references = array();
     $image_id = DataType::image()->id;
     $map_id = DataType::map()->id;
     $text_id = DataType::text()->id;
     $video_id = DataType::video()->id;
     $sound_id = DataType::sound()->id;
     $flash_id = DataType::flash()->id;
     $youtube_id = DataType::youtube()->id;
     $iucn_id = DataType::iucn()->id;
     $data_type_label = array();
     $data_type_label[$image_id] = 'image';
     $data_type_label[$sound_id] = 'sound';
     $data_type_label[$text_id] = 'text';
     $data_type_label[$video_id] = 'video';
     $data_type_label[$iucn_id] = 'iucn';
     $data_type_label[$flash_id] = 'flash';
     $data_type_label[$youtube_id] = 'youtube';
     $trusted_id = Vetted::trusted()->id;
     $untrusted_id = Vetted::untrusted()->id;
     $unreviewed_id = Vetted::unknown()->id;
     $visible_id = Visibility::visible()->id;
     $test_mode = isset($GLOBALS['test_taxon_concept_ids']);
     for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
         print "\n dataObjects, its infoItems, its references [2 of 14] {$i} \n";
         // The same concept restriction is appended to both halves of the UNION.
         if ($test_mode) {
             $concept_filter = " and dotc.taxon_concept_id IN (" . implode(",", $GLOBALS['test_taxon_concept_ids']) . ")";
         } else {
             // BETWEEN is inclusive on both ends, so the upper bound must stop one
             // short of the next batch's start; otherwise the boundary concept is
             // selected by two consecutive batches and counted twice.
             $concept_filter = " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
         }
         // Objects attached through hierarchy entries. Images are deliberately not
         // excluded here because the map objects have to be counted too.
         $sql = "SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, dohe.vetted_id, do.data_subtype_id\r\n                FROM data_objects_taxon_concepts dotc \r\n                JOIN data_objects do ON dotc.data_object_id = do.id \r\n                LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n                LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n                JOIN data_objects_hierarchy_entries dohe on do.id = dohe.data_object_id\r\n                WHERE do.published=1 AND dohe.visibility_id=" . $visible_id . " AND dohe.vetted_id != {$untrusted_id} ";
         $sql .= $concept_filter;
         // Objects attached through users_data_objects.
         $sql .= "\r\n                UNION\r\n                SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, udo.vetted_id, do.data_subtype_id\r\n                    FROM data_objects_taxon_concepts dotc \r\n                    JOIN data_objects do ON dotc.data_object_id = do.id \r\n                    LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n                    LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n                    JOIN users_data_objects udo on do.id = udo.data_object_id\r\n                    WHERE do.published=1 AND udo.visibility_id=" . $visible_id . "\r\n                ";
         $sql .= $concept_filter;
         $outfile = $this->mysqli_slave->select_into_outfile($sql);
         $FILE = fopen($outfile, "r");
         if (!$FILE) {
             print "!! ERROR: Could not read {$outfile}";
             debug("!! ERROR: Could not read {$outfile}");
             return;
         }
         $num_rows = 0;
         while (!feof($FILE)) {
             if ($line = fgets($FILE)) {
                 $num_rows++;
                 // trim() drops trailing tabs, so a row whose last columns are empty
                 // explodes into fewer than 7 fields; pad to keep every offset defined.
                 $fields = array_pad(explode("\t", trim($line)), 7, '');
                 $tc_id = trim($fields[0]);
                 $data_type_id = trim($fields[1]);
                 $info_item_id = trim($fields[2]);
                 $ref_id = trim($fields[3]);
                 $description = trim($fields[4]);
                 $vetted_id = trim($fields[5]);
                 $data_subtype_id = trim($fields[6]);
                 // Unknown data types are grouped under the empty-string label.
                 $label = isset($data_type_label[$data_type_id]) ? $data_type_label[$data_type_id] : '';
                 // Map the vetted status onto the short key used in the stats arrays.
                 $vetted_key = null;
                 if ($vetted_id == $trusted_id) {
                     $vetted_key = 't';
                 } elseif ($vetted_id == $untrusted_id) {
                     $vetted_key = 'ut';
                 } elseif ($vetted_id == $unreviewed_id) {
                     $vetted_key = 'ur';
                 }
                 if ($data_subtype_id != $map_id) {
                     $words_count = str_word_count(strip_tags($description), 0);
                     // @ silences the notices raised when a counter is created on first use.
                     @$concept_data_object_counts[$tc_id][$label]['total']++;
                     @($concept_data_object_counts[$tc_id][$label]['total_w'] += $words_count);
                     if ($vetted_key !== null) {
                         @$concept_data_object_counts[$tc_id][$label][$vetted_key]++;
                         @($concept_data_object_counts[$tc_id][$label][$vetted_key . '_w'] += $words_count);
                     }
                     // Only the keys matter: they form the set of ids seen for the concept.
                     $concept_info_items[$tc_id][$info_item_id] = '';
                     $concept_references[$tc_id][$ref_id] = '';
                 } else {
                     // Maps are only counted, never word-counted.
                     @$concept_data_object_maps[$tc_id][$label]['total']++;
                     if ($vetted_key !== null) {
                         @$concept_data_object_maps[$tc_id][$label][$vetted_key]++;
                     }
                 }
             }
         }
         fclose($FILE);
         unlink($outfile);
         print "\n num_rows: {$num_rows}";
         if ($test_mode) {
             // The test query does not depend on the batch range; running it again
             // for every batch would add the same rows to the counters repeatedly.
             break;
         }
     }
     self::save_to_json_file($concept_info_items, "concept_info_items");
     unset($concept_info_items);
     self::save_to_json_file($concept_references, "concept_references");
     unset($concept_references);
     //save map data to be accessed later
     self::save_to_json_file($concept_data_object_maps, "map_counts");
     unset($concept_data_object_maps);
     // Flatten each concept's nested stats into one tab-separated string; the
     // data types and the eight columns per type are emitted in a fixed order.
     $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
     foreach ($concept_data_object_counts as $taxon_concept_id => $taxon_object_counts) {
         $new_value = "";
         foreach ($data_type_order_in_file as $data_type) {
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['total'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['t'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['ut'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['ur'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['total_w'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['t_w'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['ut_w'];
             $new_value .= "\t" . @$taxon_object_counts[$data_type]['ur_w'];
         }
         $concept_data_object_counts[$taxon_concept_id] = $new_value;
     }
     print "\n get_data_objects_count():" . (time_elapsed() - $time_start) / 60 . " minutes";
     self::save_totals_to_cumulative_txt($concept_data_object_counts, "tpm_data_objects");
     unset($concept_data_object_counts);
 }
Example #6
0
 /**
  * Counts distinct GUIDs of published data objects that are visible and not
  * untrusted, optionally restricted by data type.
  *
  * The first id in the list selects the mode:
  *  - the map data type id: only objects whose data_subtype_id is the map id
  *    are counted (the data_type_id filter is not applied);
  *  - the image data type id: the data_type_id filter is applied and objects
  *    with a data_subtype_id are excluded;
  *  - anything else: only the data_type_id filter is applied.
  * With no ids, no type restriction is added.
  *
  * @param array|int|string|null $data_type_id Data type ids to count; a single
  *                                            scalar id is treated as a one-element list.
  * @return mixed The `count` column of the result row, or null when the
  *               query fails or returns no row.
  */
 public function count_data_objects($data_type_id = null)
 {
     // Normalise once: null becomes an empty list and a scalar a one-element
     // list, so the argument is never indexed as an array while it is null.
     $data_type_ids = (array) $data_type_id;
     $first_type_id = isset($data_type_ids[0]) ? $data_type_ids[0] : null;
     $map_id = DataType::map()->id;
     $sql = "SELECT COUNT(distinct do.guid) count FROM data_objects do JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id) WHERE do.published=1 AND dohe.visibility_id=" . $this->visible_id . " AND dohe.vetted_id!=" . $this->untrusted_id;
     if ($first_type_id != $map_id) {
         if ($data_type_ids) {
             $sql .= " AND do.data_type_id IN (" . implode(",", $data_type_ids) . ") ";
         }
         if ($first_type_id == DataType::image()->id) {
             // Rows with a data subtype are left out of the image count.
             $sql .= " AND do.data_subtype_id IS NULL";
         }
     } else {
         $sql .= " AND do.data_subtype_id = " . $map_id;
     }
     $result = $this->mysqli_slave->query($sql);
     if ($result && ($row = $result->fetch_assoc())) {
         return $row['count'];
     }
     return null;
 }