/**
 * Stores the database handles, the SPARQL client and a set of frequently used
 * lookup ids on the instance, then calls latest_harvest_event_ids() and
 * worms_latest_harvest_event_id().
 */
public function __construct()
{
    // The master connection is shared by reference with the global handle.
    $this->mysqli =& $GLOBALS['mysqli_connection'];

    // A separate slave connection is loaded only in production when a 'slave'
    // environment is defined; otherwise the slave handle aliases the master.
    $has_dedicated_slave = ($GLOBALS['ENV_NAME'] == 'production') && environment_defined('slave');
    if (!$has_dedicated_slave) {
        $this->mysqli_slave =& $this->mysqli;
    } else {
        $this->mysqli_slave = load_mysql_environment('slave');
    }

    $this->sparql_client = SparqlClient::connection();

    $published_status = TranslatedResourceStatus::find_or_create_by_label('Published');
    $this->published_id = $published_status->id;

    // Vetted states
    $trusted = Vetted::trusted();
    $this->trusted_id = $trusted->id;
    $unknown = Vetted::unknown();
    $this->unknown_id = $unknown->id;
    $untrusted = Vetted::untrusted();
    $this->untrusted_id = $untrusted->id;

    // Visibility states
    $visible = Visibility::visible();
    $this->visible_id = $visible->id;
    $invisible = Visibility::invisible();
    $this->invisible_id = $invisible->id;

    // Curator levels
    $master_curator = CuratorLevel::master_curator();
    $this->master_curator_id = $master_curator->id;
    $full_curator = CuratorLevel::full_curator();
    $this->full_curator_id = $full_curator->id;
    $assistant_curator = CuratorLevel::assistant_curator();
    $this->assistant_curator_id = $assistant_curator->id;
    $this->curator_ids = CuratorLevel::curator_ids();

    $this->data_object_scope = ChangeableObjectType::data_object_scope();

    // Specific content partner / hierarchy referenced by label
    $worms_partner = ContentPartner::find_or_create_by_full_name('World Register of Marine Species');
    $this->worms_content_partner_id = $worms_partner->id;
    $col_hierarchy = Hierarchy::find_or_create_by_label('Species 2000 & ITIS Catalogue of Life: Annual Checklist 2011');
    $this->col_hierarchy_id = $col_hierarchy->id;

    $this->latest_harvest_event_ids();
    $this->worms_latest_harvest_event_id();
}
/**
 * Stores the database handles on the instance and builds the map from vetted
 * id to sort position (trusted = 1, unknown = 2, untrusted = 3).
 */
public function __construct()
{
    // The master connection is shared by reference with the global handle.
    $this->mysqli =& $GLOBALS['db_connection'];

    // A separate slave connection is loaded only in production when a 'slave'
    // environment is defined; otherwise the slave handle aliases the master.
    $has_dedicated_slave = ($GLOBALS['ENV_NAME'] == 'production') && environment_defined('slave');
    if (!$has_dedicated_slave) {
        $this->mysqli_slave =& $this->mysqli;
    } else {
        $this->mysqli_slave = load_mysql_environment('slave');
    }

    $this->vetted_sort_orders = array();
    $vetted_by_rank = array(
        1 => Vetted::trusted(),
        2 => Vetted::unknown(),
        3 => Vetted::untrusted(),
    );
    foreach ($vetted_by_rank as $rank => $vetted) {
        $this->vetted_sort_orders[$vetted->id] = $rank;
    }
}
/**
 * Builds a DataObject from one row of the media archive table, stores it via
 * DataObject::find_and_compare(), and links it to the taxa, agents, info
 * items, audience and resource contribution that the row refers to.
 *
 * @param array $row         Archive row keyed by term URI.
 * @param array $parameters  Reads 'archive_table_definition' (its ->location)
 *                           and 'archive_line_number'.
 * @return false|null        false when the row is rejected or skipped; no
 *                           explicit value (null) once the row was processed.
 */
public function insert_data_object($row, $parameters)
{
    self::debug_iterations("Inserting DataObject");
    $this->commit_iterations("DataObject", 20);

    // Skip rows the archive validator flagged as erroneous.
    if ($this->archive_validator->has_error_by_line('http://eol.org/schema/media/document', $parameters['archive_table_definition']->location, $parameters['archive_line_number'])) {
        write_to_resource_harvesting_log("ERROR: insert_data_object: has_error_by_line" . ",file_location:" . $parameters['archive_table_definition']->location . ",line_number:" . $parameters['archive_line_number']);
        return false;
    }

    // Resolve the row's taxon foreign keys against the taxa inserted earlier.
    $object_taxon_ids = self::get_foreign_keys_from_row($row, 'http://rs.tdwg.org/dwc/terms/taxonID');
    $object_taxon_info = array();
    if ($object_taxon_ids) {
        foreach ($object_taxon_ids as $taxon_id) {
            if ($taxon_info = @$this->taxon_ids_inserted[$taxon_id]) {
                self::uncompress_array($taxon_info);
                $object_taxon_info[] = $taxon_info;
            }
        }
    }
    // An object that matches none of the inserted taxa is not stored.
    if (!$object_taxon_info) {
        return false;
    }

    if ($this->harvest_event->resource->is_eol_flickr_group() && self::is_this_flickr_image_in_inaturalist($row)) {
        return false;
    }

    $data_object = new DataObject();
    $data_object->identifier = @self::field_decode($row['http://purl.org/dc/terms/identifier']);
    // Each identifier is inserted only once per run.
    if (isset($this->media_ids_inserted[$data_object->identifier])) {
        return false;
    }

    $data_object->data_type = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://purl.org/dc/terms/type']));
    if ($dt = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://rs.tdwg.org/audubon_core/subtype']))) {
        $data_object->data_subtype_id = $dt->id;
    }
    $data_object->mime_type = MimeType::find_or_create_by_translated_label(@self::field_decode($row['http://purl.org/dc/terms/format']));
    $data_object->object_created_at = @self::field_decode($row['http://ns.adobe.com/xap/1.0/CreateDate']);
    $data_object->object_modified_at = @self::field_decode($row['http://purl.org/dc/terms/modified']);
    $data_object->available_at = @self::field_decode($row['http://purl.org/dc/terms/available']);
    $data_object->object_title = @self::field_decode($row['http://purl.org/dc/terms/title']);
    $data_object->language = Language::find_or_create_for_parser(@self::field_decode($row['http://purl.org/dc/terms/language']));

    // check multiple fields for a value of license
    if (isset($row['http://purl.org/dc/terms/license'])) {
        $license_string = @self::field_decode($row['http://purl.org/dc/terms/license']);
    } else {
        $license_string = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/UsageTerms']);
    }
    // convert British licences to American licenses
    $license_string = str_replace("creativecommons.org/licences/", "creativecommons.org/licenses/", $license_string);
    // Fall back to the resource-level license when the row carries none.
    if (!$license_string && $this->harvest_event->resource->license && $this->harvest_event->resource->license->source_url) {
        $license_string = $this->harvest_event->resource->license->source_url;
    }
    if (!$license_string || !\eol_schema\MediaResource::valid_license($license_string)) {
        return false;
    }
    $data_object->license = License::find_or_create_for_parser($license_string);

    $data_object->rights_statement = @self::field_decode($row['http://purl.org/dc/terms/rights']);
    $data_object->rights_holder = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/Owner']);
    $data_object->bibliographic_citation = @self::field_decode($row['http://purl.org/dc/terms/bibliographicCitation']);
    $data_object->source_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/furtherInformationURL']);
    $data_object->derived_from = @self::field_decode($row['http://rs.tdwg.org/ac/terms/derivedFrom']);
    $data_object->description = @self::field_decode($row['http://purl.org/dc/terms/description']);
    // Turn newlines into paragraphs
    $data_object->description = str_replace("\n", "</p><p>", $data_object->description);
    $data_object->object_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/accessURI']);
    $data_object->thumbnail_url = @self::field_decode($row['http://eol.org/schema/media/thumbnailURL']);
    $data_object->location = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/LocationCreated']);
    $data_object->spatial_location = @self::field_decode($row['http://purl.org/dc/terms/spatial']);
    $data_object->latitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#lat']);
    $data_object->longitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#long']);
    $data_object->altitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#alt']);

    $rating = @self::field_decode($row['http://ns.adobe.com/xap/1.0/Rating']);
    // ratings may be 0 to 5
    // TODO: technically 0 means untrusted, and then anywhere from 1-5 is OK.
    // 0.5 for example isn't really valid according to the schema
    if (is_numeric($rating) && $rating > 0 && $rating <= 5) {
        $data_object->data_rating = $rating;
    }

    //TODO - update this
    if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
        $data_object->data_type = DataType::youtube();
        $data_object->data_type_id = DataType::youtube()->id;
    }

    // Take the first available source_url of one of this object's taxa.
    // This fallback used to be gated on $taxon_parameters, a variable that is
    // never defined in this method, so it could never run.
    if (!@$data_object->source_url) {
        foreach ($object_taxon_info as $taxon_info) {
            if (!empty($taxon_info['source_url'])) {
                $data_object->source_url = $taxon_info['source_url'];
                break;
            }
        }
    }

    /* Checking requirements */
    // The type lookup is treated as able to return a falsy value (the subtype
    // lookup above already guards for that); without a type the checks below
    // would call ->equals() on a non-object.
    if (!$data_object->data_type) {
        return false;
    }
    // if text: must have description
    if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
        return false;
    }
    // if image, movie or sound: must have object_url
    $needs_object_url = $data_object->data_type->equals(DataType::video())
        || $data_object->data_type->equals(DataType::sound())
        || $data_object->data_type->equals(DataType::image());
    if ($needs_object_url && !$data_object->object_url) {
        return false;
    }

    /* ADDING THE DATA OBJECT */
    list($data_object, $status) = DataObject::find_and_compare($this->harvest_event->resource, $data_object, $this->content_manager);
    if (@(!$data_object->id)) {
        return false;
    }
    $this->media_ids_inserted[$data_object->identifier] = $data_object->id;
    $this->harvest_event->add_data_object($data_object, $status);

    // Re-link the object to every taxon it references, as unknown/preview.
    $data_object->delete_hierarchy_entries();
    $vetted_id = Vetted::unknown()->id;
    $visibility_id = Visibility::preview()->id;
    foreach ($object_taxon_info as $taxon_info) {
        $he_id = $taxon_info['hierarchy_entry_id'];
        $tc_id = $taxon_info['taxon_concept_id'];
        $this->mysqli->insert("INSERT IGNORE INTO data_objects_hierarchy_entries (hierarchy_entry_id, data_object_id, vetted_id, visibility_id) VALUES ({$he_id}, {$data_object->id}, {$vetted_id}, {$visibility_id})");
        $this->mysqli->insert("INSERT IGNORE INTO data_objects_taxon_concepts (taxon_concept_id, data_object_id) VALUES ({$tc_id}, {$data_object->id})");
    }

    // a few things to add after the DataObject is inserted
    // keep track of reference foreign keys
    self::append_foreign_keys_from_row($row, 'http://eol.org/schema/reference/referenceID', $this->media_reference_ids, $data_object->id, $data_object->guid);
    // keep track of agent foreign keys
    self::append_foreign_keys_from_row($row, 'http://eol.org/schema/agent/agentID', $this->media_agent_ids, $data_object->id);

    $data_object->delete_info_items();
    $data_object->delete_table_of_contents();
    if ($s = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm'])) {
        $ii = InfoItem::find_or_create_by_schema_value($s);
        $data_object->add_info_item($ii->id);
        unset($ii);
    }
    if ($a = @self::field_decode($row['http://purl.org/dc/terms/audience'])) {
        $a = Audience::find_or_create_by_translated_label(trim((string) $a));
        $data_object->add_audience($a->id);
        unset($a);
    }

    // Agents named directly in the row, in creator/publisher/contributor order.
    $data_object_parameters = array("agents" => array());
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/creator', 'Creator');
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/publisher', 'Publisher');
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/contributor', 'Contributor');
    $data_object->delete_agents();
    $i = 0;
    foreach ($data_object_parameters['agents'] as &$a) {
        $agent = Agent::find_or_create($a);
        // Cache the agent logo the first time it is seen.
        if ($agent->logo_url && !$agent->logo_cache_url) {
            if ($logo_cache_url = $this->content_manager->grab_file($agent->logo_url, "partner")) {
                $agent->logo_cache_url = $logo_cache_url;
                $agent->save();
            }
        }
        $data_object->add_agent($agent->id, @$a['agent_role']->id ?: 0, $i);
        unset($a);
        $i++;
    }

    // References are cleared once per object per run.
    if (!isset($this->object_references_deleted[$data_object->id])) {
        $data_object->delete_refs();
        $this->object_references_deleted[$data_object->id] = true;
    }

    // add data object info to resource contribution
    if ($status != "Unchanged") {
        $result = $this->mysqli->query("SELECT id, source_url, taxon_concept_id, hierarchy_id, identifier FROM hierarchy_entries inner join data_objects_hierarchy_entries on hierarchy_entries.id = data_objects_hierarchy_entries.hierarchy_entry_id where data_object_id =" . $data_object->id);
        $entry_row = $result ? $result->fetch_assoc() : null;
        // Without a matching hierarchy entry there is nothing valid to insert;
        // building the statement anyway would leave empty VALUES slots.
        if ($entry_row) {
            $hierarchy_entry_id = $entry_row["id"];
            $source = "'" . $this->get_hierarchy_entry_outlink($entry_row["hierarchy_id"], $entry_row["identifier"], preg_replace('/\'/', "\\'", $entry_row["source_url"])) . "'";
            // Escape the identifier so quotes/backslashes cannot break the statement.
            $identifier = "'" . addslashes((string) $entry_row["identifier"]) . "'";
            $taxon_concept_id = $entry_row["taxon_concept_id"];
            $resource_id = $this->harvest_event->resource_id;
            $this->mysqli->insert("INSERT IGNORE INTO resource_contributions (resource_id, data_object_id, data_point_uri_id, hierarchy_entry_id, taxon_concept_id, source, object_type, identifier, data_object_type) VALUES ({$resource_id}, {$data_object->id}, NULL, {$hierarchy_entry_id}, {$taxon_concept_id}, {$source}, 'data_object', {$identifier}, {$data_object->data_type_id})");
        }
    }
}
/**
 * Rebuilds the taxon_concept_names rows for the given taxon concepts.
 *
 * Concepts are processed in batches of 500. For each batch the scientific
 * names (preferred entries, language-0 synonyms and names matching their
 * canonical form) and the common names are collected, written to temporary
 * files, the batch's existing rows are deleted, and both files are bulk
 * loaded into taxon_concept_names.
 *
 * @param int|array $taxon_concept_ids  A single numeric id or a list of ids.
 * @return false|null  false when no ids were given; otherwise no explicit
 *                     value (null), also when a temp file could not be written.
 */
public static function update_taxon_concept_names($taxon_concept_ids)
{
    if (!$taxon_concept_ids) {
        return false;
    }
    if (is_numeric($taxon_concept_ids)) {
        $taxon_concept_ids = array($taxon_concept_ids);
    }

    $mysqli =& $GLOBALS['db_connection'];
    $started_new_transaction = false;
    if (!$mysqli->in_transaction()) {
        $mysqli->begin_transaction();
        $started_new_transaction = true;
    }

    // Lookup ids that do not change between batches or rows.
    $visible_id = Visibility::visible()->id;
    $preview_id = Visibility::preview()->id;
    $excluded_relation_labels = array('genbank common name', 'common name', 'blast name', 'genbank acronym', 'acronym');
    $excluded_relations_sql = "";
    foreach ($excluded_relation_labels as $label) {
        $excluded_relations_sql .= "\n AND s.synonym_relation_id!=" . SynonymRelation::find_or_create_by_translated_label($label)->id;
    }
    $genbank_common_name_id = SynonymRelation::genbank_common_name()->id;
    $common_name_id = SynonymRelation::common_name()->id;
    $contributors_id = Hierarchy::contributors()->id;
    $wikipedia_id = @Hierarchy::wikipedia()->id;
    $ubio_id = @Hierarchy::ubio()->id;
    $trusted_id = Vetted::trusted()->id;
    $unknown_id = Vetted::unknown()->id;
    // Entries count when published+visible or unpublished+preview.
    $entry_state_sql = "((he.published=1 AND he.visibility_id=" . $visible_id . ") OR (he.published=0 AND he.visibility_id=" . $preview_id . "))";

    $batches = array_chunk($taxon_concept_ids, 500);
    foreach ($batches as $batch_ids) {
        // Pause half a second between batches.
        usleep(500000);
        $id_list = implode(",", $batch_ids);

        /* Collect the scientific names */
        $name_ids = array();
        $matching_ids = array();
        $query = "\n (SELECT he.taxon_concept_id, he.id, he.name_id, 'preferred' as type FROM hierarchy_entries he WHERE taxon_concept_id IN (" . $id_list . ") AND " . $entry_state_sql . ")\n UNION\n (SELECT he.taxon_concept_id, s.hierarchy_entry_id, s.name_id, 'synonym' as type\n FROM hierarchy_entries he\n JOIN synonyms s ON (he.id=s.hierarchy_entry_id)\n WHERE he.taxon_concept_id IN (" . $id_list . ")\n AND s.language_id=0" . $excluded_relations_sql . "\n AND " . $entry_state_sql . ")";
        foreach ($mysqli->iterate_file($query) as $row_num => $row) {
            $taxon_concept_id = $row[0];
            $hierarchy_entry_id = $row[1];
            $name_id = $row[2];
            $name_type = $row[3];
            $name_ids[$name_id][$taxon_concept_id] = 1;
            $matching_ids[$taxon_concept_id][$name_id][$hierarchy_entry_id] = $name_type;
        }

        if ($name_ids) {
            //This makes sure we have a scientific name, gets the canonicalFormID
            $query = "SELECT n.id, n_match.id FROM names n JOIN canonical_forms cf ON (n.canonical_form_id=cf.id) JOIN names n_match ON (cf.id=n_match.canonical_form_id) WHERE n.id IN (" . implode(",", array_keys($name_ids)) . ") AND n_match.string=cf.string";
            foreach ($mysqli->iterate_file($query) as $row_num => $row) {
                $original_name_id = $row[0];
                $canonical_name_id = $row[1];
                if ($original_name_id != $canonical_name_id) {
                    // Register the canonical name under hierarchy entry 0,
                    // which is never written as preferred below.
                    foreach ($name_ids[$original_name_id] as $taxon_concept_id => $junk) {
                        $matching_ids[$taxon_concept_id][$canonical_name_id][0] = 1;
                    }
                }
            }
        }

        /* Collect the common names */
        $common_names = array();
        $preferred_in_language = array();
        $query = "SELECT he.taxon_concept_id, he.published, he.visibility_id, s.id, s.hierarchy_id, s.hierarchy_entry_id, s.name_id, s.language_id, s.preferred, s.vetted_id FROM hierarchy_entries he JOIN synonyms s ON (he.id=s.hierarchy_entry_id) JOIN vetted v ON (s.vetted_id=v.id) WHERE he.taxon_concept_id IN (" . $id_list . ") AND s.language_id!=0 AND (s.synonym_relation_id=" . $genbank_common_name_id . " OR s.synonym_relation_id=" . $common_name_id . ") ORDER BY s.language_id, (s.hierarchy_id=" . $contributors_id . ") DESC, v.view_order ASC, s.preferred DESC, s.id DESC";
        foreach ($mysqli->iterate_file($query) as $row_num => $row) {
            $taxon_concept_id = $row[0];
            $published = $row[1];
            $visibility_id = $row[2];
            $synonym_id = $row[3];
            $hierarchy_id = $row[4];
            $hierarchy_entry_id = $row[5];
            $name_id = $row[6];
            $language_id = $row[7];
            $preferred = $row[8];
            $vetted_id = $row[9];

            // skipping Wikipedia common names entirely
            if ($hierarchy_id == $wikipedia_id) {
                continue;
            }
            $curator_name = $hierarchy_id == $contributors_id;
            $ubio_name = $hierarchy_id == $ubio_id;
            // Curator and uBio names are always kept; anything else must
            // come from a published, visible entry.
            if (!($curator_name || $ubio_name || ($published == 1 && $visibility_id == $visible_id))) {
                continue;
            }

            // Only one preferred name per concept and language, and only a
            // trusted/unknown curator name may claim it at this stage.
            if (isset($preferred_in_language[$taxon_concept_id][$language_id])) {
                $preferred = 0;
            }
            if ($preferred && $curator_name && ($vetted_id == $trusted_id || $vetted_id == $unknown_id)) {
                $preferred_in_language[$taxon_concept_id][$language_id] = 1;
            } else {
                $preferred = 0;
            }
            $common_names[$taxon_concept_id][] = array(
                'synonym_id' => $synonym_id,
                'language_id' => $language_id,
                'name_id' => $name_id,
                'hierarchy_entry_id' => $hierarchy_entry_id,
                'preferred' => $preferred,
                'vetted_id' => $vetted_id,
                'is_curator_name' => $curator_name,
            );
        }

        // if there was no preferred name: the first trusted/unknown name in
        // query order becomes preferred for its language
        foreach ($common_names as $taxon_concept_id => $arr) {
            foreach ($arr as $key => $arr2) {
                if (@(!$preferred_in_language[$taxon_concept_id][$arr2['language_id']]) && ($arr2['vetted_id'] == $trusted_id || $arr2['vetted_id'] == $unknown_id)) {
                    $common_names[$taxon_concept_id][$key]['preferred'] = 1;
                    $preferred_in_language[$taxon_concept_id][$arr2['language_id']] = 1;
                }
            }
        }

        /* Build the load-file lines */
        $scientific_lines = array();
        foreach ($matching_ids as $taxon_concept_id => $arr) {
            foreach ($arr as $name_id => $arr2) {
                foreach ($arr2 as $hierarchy_entry_id => $type) {
                    $preferred = ($hierarchy_entry_id && $type == "preferred") ? 1 : 0;
                    $scientific_lines[] = "{$taxon_concept_id}\t{$name_id}\t{$hierarchy_entry_id}\t0\t0\t{$preferred}\n";
                }
            }
        }
        $common_lines = array();
        foreach ($common_names as $taxon_concept_id => $arr) {
            foreach ($arr as $arr2) {
                $common_lines[] = "{$taxon_concept_id}\t{$arr2['name_id']}\t{$arr2['hierarchy_entry_id']}\t{$arr2['language_id']}\t1\t{$arr2['preferred']}\t{$arr2['synonym_id']}\t{$arr2['vetted_id']}\n";
            }
        }

        // Write both files BEFORE deleting anything, so a failure to create
        // a temp file cannot leave the batch deleted but not reloaded.
        $scientific_path = self::write_lines_to_temp_file($scientific_lines);
        $common_path = ($scientific_path === false) ? false : self::write_lines_to_temp_file($common_lines);
        if ($scientific_path === false || $common_path === false) {
            if ($scientific_path !== false) {
                unlink($scientific_path);
            }
            // Do not leave a transaction that this call opened dangling.
            if ($started_new_transaction) {
                $mysqli->end_transaction();
            }
            return;
        }

        $mysqli->delete("DELETE FROM taxon_concept_names WHERE taxon_concept_id IN (" . $id_list . ")");
        /* Insert the scientific names */
        $mysqli->load_data_infile($scientific_path, 'taxon_concept_names');
        unlink($scientific_path);
        /* Insert the common names */
        $mysqli->load_data_infile($common_path, 'taxon_concept_names');
        unlink($common_path);

        unset($matching_ids);
        unset($common_names);
        unset($name_ids);
        unset($preferred_in_language);
        $mysqli->commit();
    }

    if ($started_new_transaction) {
        $mysqli->end_transaction();
    }
}

/**
 * Writes the given lines to a new temporary file and closes it.
 *
 * @param array $lines  Complete lines, each including its trailing newline.
 * @return string|false Path of the written file, or false if it could not be opened.
 */
private static function write_lines_to_temp_file($lines)
{
    $path = temp_filepath();
    $handle = fopen($path, "w+");
    if (!$handle) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $path);
        return false;
    }
    foreach ($lines as $line) {
        fwrite($handle, $line);
    }
    // Close before the file is handed to load_data_infile so the handle is
    // not leaked and all data is on disk.
    fclose($handle);
    return $path;
}
/**
 * Stores one harvested data object for a hierarchy entry.
 *
 * Missing rights/license/language values are filled in from the resource, the
 * object is run through DataObject::find_and_compare(), attached to the
 * hierarchy entry and the resource's harvest event, and - unless the status
 * is "Reused" - its agents, audiences, info items and references are replaced
 * with the ones given in the parameters.
 *
 * @param object $hierarchy_entry  Entry the object is attached to.
 * @param array  $options          [0] => data object, [1] => parameters with
 *                                 'agents', 'audiences' and optionally
 *                                 'info_items' and 'refs'.
 * @return false|null  false when no stored object came back; otherwise no
 *                     explicit value.
 */
function add_data_object($hierarchy_entry, $options)
{
    $incoming = $options[0];
    $parameters = $options[1];

    // Add default values from resource
    $defaulted_fields = array('rights_statement', 'rights_holder', 'license_id', 'language_id');
    foreach ($defaulted_fields as $field) {
        if (@(!$incoming->$field) && $this->resource->$field) {
            $incoming->$field = $this->resource->$field;
        }
    }

    $comparison = DataObject::find_and_compare($this->resource, $incoming, $this->content_manager);
    list($data_object, $status, $existing_data_object) = $comparison;
    $GLOBALS['db_connection']->commit();
    if (@(!$data_object->id)) {
        return false;
    }

    // New objects go in as unknown/preview; an existing object can pass on
    // its best vetted state and its best visibility.
    $vetted_id = Vetted::unknown()->id;
    $visibility_id = Visibility::preview()->id;
    if ($existing_data_object) {
        $best_vetted = $existing_data_object->best_vetted();
        if ($best_vetted) {
            $vetted_id = $best_vetted->id;
        }
        $best_visibility = $existing_data_object->best_visibility();
        // if the existing object is visible - this will go on as preview
        // otherwise this will inherit the visibility (unpublished)
        if ($best_visibility && $best_visibility != Visibility::visible()) {
            $visibility_id = $best_visibility->id;
        }
    }

    // we only delete the object's entries the first time we see it, to allow for multiple taxa per object
    $seen_before = isset($this->harvested_data_object_ids[$data_object->id]);
    if (!$seen_before) {
        $data_object->delete_hierarchy_entries();
    }
    $this->harvested_data_object_ids[$data_object->id] = 1;

    $hierarchy_entry->add_data_object($data_object->id, $vetted_id, $visibility_id);
    $this->resource->harvest_event->add_data_object($data_object, $status);

    // Reused objects keep their existing associations.
    if ($status == "Reused") {
        return;
    }

    /* Agents */
    $position = 0;
    $data_object->delete_agents();
    foreach ($parameters['agents'] as &$agent_params) {
        $agent = Agent::find_or_create($agent_params);
        $logo_needs_caching = $agent->logo_url && !$agent->logo_cache_url;
        if ($logo_needs_caching) {
            $logo_cache_url = $this->content_manager->grab_file($agent->logo_url, "partner");
            if ($logo_cache_url) {
                $agent->logo_cache_url = $logo_cache_url;
                $agent->save();
            }
        }
        $data_object->add_agent($agent->id, @$agent_params['agent_role']->id ?: 0, $position);
        $position++;
    }
    unset($agent_params);

    /* Audiences */
    $data_object->delete_audiences();
    foreach ($parameters['audiences'] as &$audience) {
        $data_object->add_audience($audience->id);
    }
    unset($audience);

    /* Info items */
    $data_object->delete_info_items();
    $data_object->delete_table_of_contents();
    if (@$parameters['info_items']) {
        foreach ($parameters['info_items'] as &$info_item) {
            $data_object->add_info_item($info_item->id);
        }
        unset($info_item);
    }

    /* References: only those with an id are attached and published */
    $data_object->delete_refs();
    if (@$parameters['refs']) {
        foreach ($parameters['refs'] as &$reference) {
            if (@$reference->id) {
                $data_object->add_reference($reference->id);
                $reference->publish();
            }
        }
        unset($reference);
    }
}
/**
 * Returns the total number of words in the published, visible text objects
 * of one taxon concept that are filed under the given chapter.
 *
 * Both partner objects (data_objects_hierarchy_entries) and user objects
 * (users_data_objects) are included. HTML tags are stripped before counting.
 *
 * @param int    $taxon_concept_id
 * @param string $chapter  "brief summary" or "comprehensive description".
 * @return int|null  Word total, or null when no row matched or the chapter
 *                   is not one of the two supported values.
 */
function get_word_count($taxon_concept_id, $chapter)
{
    // Only two chapters are supported. Any other value used to leave the
    // table-of-contents id undefined and produce an invalid query.
    if ($chapter == "brief summary") {
        $toc_label = 'Brief Summary';
    } elseif ($chapter == "comprehensive description") {
        $toc_label = 'Comprehensive Description';
    } else {
        return null;
    }

    $text_id = DataType::find_or_create_by_schema_value('http://purl.org/dc/dcmitype/Text')->id;
    $toc_id = TranslatedTableOfContent::find_or_create_by_label($toc_label)->table_of_contents_id;
    $visible_id = Visibility::visible()->id;
    // The id is interpolated into SQL below, so force it to an integer.
    $taxon_concept_id = (int) $taxon_concept_id;

    $query = "SELECT dotoc.toc_id,do.description, dohe.vetted_id FROM data_objects_taxon_concepts dotc \n JOIN data_objects do ON dotc.data_object_id = do.id LEFT JOIN data_objects_table_of_contents dotoc ON do.id = dotoc.data_object_id \n JOIN data_objects_hierarchy_entries dohe on do.id = dohe.data_object_id\n WHERE do.published = 1 AND dohe.visibility_id =" . $visible_id . " AND do.data_type_id = {$text_id} AND dotc.taxon_concept_id = {$taxon_concept_id} AND dotoc.toc_id = {$toc_id}\n UNION\n SELECT dotoc.toc_id,do.description, udo.vetted_id FROM data_objects_taxon_concepts dotc \n JOIN data_objects do ON dotc.data_object_id = do.id LEFT JOIN data_objects_table_of_contents dotoc ON do.id = dotoc.data_object_id \n JOIN users_data_objects udo on do.id = udo.data_object_id\n WHERE do.published = 1 AND udo.visibility_id =" . $visible_id . " AND do.data_type_id = {$text_id} AND dotc.taxon_concept_id = {$taxon_concept_id} AND dotoc.toc_id = {$toc_id}";

    // Stays null until at least one row has been read, matching the previous
    // "no rows => null" result.
    $total_words = null;
    $result = $this->mysqli_slave->query($query);
    while ($result && ($row = $result->fetch_assoc())) {
        $total_words += str_word_count(strip_tags($row['description']), 0);
    }
    return $total_words;
}
/**
 * Fills $this->user_data_objects_taxa with every (data object id, taxon
 * concept id) pair from users_data_objects whose object is published, whose
 * association is visible and whose vetted state is trusted or unknown.
 * The map has the shape [data_object_id][taxon_concept_id] => true.
 */
private function load_all_user_object_associations()
{
    $this->user_data_objects_taxa = array();

    $accepted_vetted_ids = array(Vetted::trusted()->id, Vetted::unknown()->id);
    $vetted_list = implode(",", $accepted_vetted_ids);
    $visible_id = Visibility::visible()->id;

    $query = "SELECT do.id, udo.taxon_concept_id\n FROM data_objects do\n JOIN users_data_objects udo ON (do.id=udo.data_object_id)\n WHERE do.published = 1\n AND udo.vetted_id IN (" . $vetted_list . ")\n AND udo.visibility_id = " . $visible_id;

    foreach ($this->mysqli_slave->iterate($query) as $association) {
        $data_object_id = $association['id'];
        $taxon_concept_id = $association['taxon_concept_id'];
        $this->user_data_objects_taxa[$data_object_id][$taxon_concept_id] = true;
    }
}
/**
 * Counts the published, visible, non-image data objects of every taxon
 * concept, per data type and vetted state, together with the word counts of
 * their descriptions, and passes the result of each batch to
 * save_category_stats().
 *
 * Each saved value is a tab-separated string holding, for every data type in
 * the order text, video, sound, flash, youtube, iucn, the eight fields
 * total, t, ut, ur, total_w, t_w, ut_w, ur_w (t = trusted, ut = untrusted,
 * ur = unreviewed; *_w = words). Fields with no data are left empty.
 *
 * @param int $batch_size  Number of taxon concept ids covered per query.
 */
function get_data_objects_count($batch_size = 100000)
{
    $image_id = DataType::image()->id;
    $text_id = DataType::text()->id;
    $video_id = DataType::video()->id;
    $sound_id = DataType::sound()->id;
    $flash_id = DataType::flash()->id;
    $youtube_id = DataType::youtube()->id;
    $iucn_id = DataType::iucn()->id;

    $data_type_label = array();
    $data_type_label[$text_id] = 'text';
    $data_type_label[$video_id] = 'video';
    $data_type_label[$sound_id] = 'sound';
    $data_type_label[$flash_id] = 'flash';
    $data_type_label[$youtube_id] = 'youtube';
    $data_type_label[$iucn_id] = 'iucn';
    $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
    $stat_keys_in_file = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');

    $trusted_id = Vetted::trusted()->id;
    $untrusted_id = Vetted::untrusted()->id;
    $unreviewed_id = Vetted::unknown()->id;
    $visible_id = Visibility::visible()->id;

    for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
        $this->print_status($i, $batch_size);

        $sql = "SELECT do.guid,\n dotc.taxon_concept_id,\n do.data_type_id,\n doii.info_item_id,\n dor.ref_id,\n REPLACE(REPLACE(do.description, '\\\\n', ' '), '\\\\r', ' '),\n dohe.vetted_id,\n do.id\n FROM data_objects_taxon_concepts dotc\n STRAIGHT_JOIN data_objects do ON (dotc.data_object_id = do.id)\n JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)\n LEFT JOIN data_objects_info_items doii ON (do.id = doii.data_object_id)\n LEFT JOIN data_objects_refs dor ON (do.id = dor.data_object_id)\n WHERE do.published = 1 AND dohe.visibility_id = " . $visible_id . " AND do.data_type_id != {$image_id}";
        if ($this->test_taxon_concept_ids) {
            $sql .= " AND dotc.taxon_concept_id IN (" . $this->test_taxon_concept_ids . ")";
        } else {
            // BETWEEN is inclusive on both ends, so the upper bound must stop
            // one short of the next batch's first id; otherwise the boundary
            // concept is processed (and saved) by two batches.
            $sql .= " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
        }

        $raw_stats = array();
        $counted_data_objects = array();
        foreach ($this->mysqli_slave->iterate_file($sql) as $row_number => $row) {
            // Column positions follow the SELECT list above: 0 guid,
            // 1 taxon_concept_id, 2 data_type_id, 3 info_item_id, 4 ref_id,
            // 5 description, 6 vetted_id, 7 data object id. The guid,
            // info item and reference columns are not used here.
            $taxon_concept_id = trim($row[1]);
            $data_type_id = trim($row[2]);
            $description = trim($row[5]);
            $vetted_id = trim($row[6]);
            $data_object_id = trim($row[7]);

            // The LEFT JOINs repeat an object once per info item / reference;
            // count every object only once per concept.
            if (isset($counted_data_objects[$taxon_concept_id][$data_object_id])) {
                continue;
            }
            $counted_data_objects[$taxon_concept_id][$data_object_id] = 1;

            $label = @$data_type_label[$data_type_id];
            $words_count = str_word_count(strip_tags($description), 0);

            @$raw_stats[$taxon_concept_id][$label]['total']++;
            @($raw_stats[$taxon_concept_id][$label]['total_w'] += $words_count);

            $vetted_key = null;
            if ($vetted_id == $trusted_id) {
                $vetted_key = 't';
            } elseif ($vetted_id == $untrusted_id) {
                $vetted_key = 'ut';
            } elseif ($vetted_id == $unreviewed_id) {
                $vetted_key = 'ur';
            }
            if ($vetted_key !== null) {
                @$raw_stats[$taxon_concept_id][$label][$vetted_key]++;
                @($raw_stats[$taxon_concept_id][$label][$vetted_key . '_w'] += $words_count);
            }
        }

        foreach ($raw_stats as $taxon_concept_id => $stats) {
            // the stats need to go into the file in a certain order to be imported into the MySQL table
            $fields = array();
            foreach ($data_type_order_in_file as $data_type) {
                foreach ($stat_keys_in_file as $stat_key) {
                    // Every type contributes its eight fields; missing
                    // values become empty strings when joined.
                    $fields[] = @$stats[$data_type][$stat_key];
                }
            }
            $raw_stats[$taxon_concept_id] = implode("\t", $fields);
        }
        $this->save_category_stats($raw_stats, "get_data_objects_count");

        // In test mode the query does not depend on $i, so one pass is enough.
        if ($this->test_taxon_concept_ids) {
            break;
        }
    }
}
/**
 * Counts published, visible data objects per taxon concept, data type and
 * vetted state (with description word counts), and records the info items
 * and references seen per concept.
 *
 * Rows come from partner associations (data_objects_hierarchy_entries,
 * untrusted ones excluded) and from user objects (users_data_objects).
 * Objects whose data_subtype_id equals the map data type id are tallied
 * separately as maps. Results are written with save_to_json_file()
 * ("concept_info_items", "concept_references", "map_counts") and
 * save_totals_to_cumulative_txt() ("tpm_data_objects").
 *
 * NOTE(review): the LEFT JOINs on info items and references yield one row per
 * combination, so an object with several of them looks like it is counted
 * several times - confirm whether that is intended.
 *
 * @param int $batch_size  Number of taxon concept ids covered per query.
 */
function get_data_objects_count($batch_size = 100000)
{
    $time_start = time_elapsed();
    $concept_data_object_counts = array();
    $concept_data_object_maps = array();
    $concept_info_items = array();
    $concept_references = array();

    $image_id = DataType::image()->id;
    $map_id = DataType::map()->id;
    $text_id = DataType::text()->id;
    $video_id = DataType::video()->id;
    $sound_id = DataType::sound()->id;
    $flash_id = DataType::flash()->id;
    $youtube_id = DataType::youtube()->id;
    $iucn_id = DataType::iucn()->id;

    $data_type_label = array();
    $data_type_label[$image_id] = 'image';
    $data_type_label[$sound_id] = 'sound';
    $data_type_label[$text_id] = 'text';
    $data_type_label[$video_id] = 'video';
    $data_type_label[$iucn_id] = 'iucn';
    $data_type_label[$flash_id] = 'flash';
    $data_type_label[$youtube_id] = 'youtube';

    $trusted_id = Vetted::trusted()->id;
    $untrusted_id = Vetted::untrusted()->id;
    $unreviewed_id = Vetted::unknown()->id;
    $visible_id = Visibility::visible()->id;

    $test_mode = isset($GLOBALS['test_taxon_concept_ids']);

    for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
        print "\n dataObjects, its infoItems, its references [2 of 14] {$i} \n";

        if ($test_mode) {
            $concept_filter = " and dotc.taxon_concept_id IN (" . implode(",", $GLOBALS['test_taxon_concept_ids']) . ")";
        } else {
            // BETWEEN is inclusive on both ends; stop one short of the next
            // batch's first id so boundary concepts are not counted twice
            // (the accumulators below span all batches).
            $concept_filter = " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
        }

        // The image data type is deliberately not filtered out: maps have to be counted.
        $sql = "SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, dohe.vetted_id, do.data_subtype_id\r\n FROM data_objects_taxon_concepts dotc \r\n JOIN data_objects do ON dotc.data_object_id = do.id \r\n LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n JOIN data_objects_hierarchy_entries dohe on do.id = dohe.data_object_id\r\n WHERE do.published=1 AND dohe.visibility_id=" . $visible_id . " AND dohe.vetted_id != {$untrusted_id} ";
        $sql .= $concept_filter;
        $sql .= "\r\n UNION\r\n SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, udo.vetted_id, do.data_subtype_id\r\n FROM data_objects_taxon_concepts dotc \r\n JOIN data_objects do ON dotc.data_object_id = do.id \r\n LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n JOIN users_data_objects udo on do.id = udo.data_object_id\r\n WHERE do.published=1 AND udo.visibility_id=" . $visible_id . "\r\n ";
        $sql .= $concept_filter;

        $outfile = $this->mysqli_slave->select_into_outfile($sql);
        $FILE = fopen($outfile, "r");
        if (!$FILE) {
            print "!! ERROR: Could not read {$outfile}";
            debug("!! ERROR: Could not read {$outfile}");
            return;
        }

        $num_rows = 0;
        while (!feof($FILE)) {
            $line = fgets($FILE);
            if (!$line) {
                continue;
            }
            $num_rows++;
            // Pad short rows so every column index below exists.
            $fields = array_pad(explode("\t", trim($line)), 7, '');
            $tc_id = trim($fields[0]);
            $data_type_id = trim($fields[1]);
            $info_item_id = trim($fields[2]);
            $ref_id = trim($fields[3]);
            $description = trim($fields[4]);
            $vetted_id = trim($fields[5]);
            $data_subtype_id = trim($fields[6]);
            $label = @$data_type_label[$data_type_id];

            $vetted_key = null;
            if ($vetted_id == $trusted_id) {
                $vetted_key = 't';
            } elseif ($vetted_id == $untrusted_id) {
                $vetted_key = 'ut';
            } elseif ($vetted_id == $unreviewed_id) {
                $vetted_key = 'ur';
            }

            if ($data_subtype_id != $map_id) {
                $words_count = str_word_count(strip_tags($description), 0);
                @$concept_data_object_counts[$tc_id][$label]['total']++;
                @($concept_data_object_counts[$tc_id][$label]['total_w'] += $words_count);
                if ($vetted_key !== null) {
                    @$concept_data_object_counts[$tc_id][$label][$vetted_key]++;
                    @($concept_data_object_counts[$tc_id][$label][$vetted_key . '_w'] += $words_count);
                }
                $concept_info_items[$tc_id][$info_item_id] = '';
                $concept_references[$tc_id][$ref_id] = '';
            } else {
                // Maps are only counted, never word-counted.
                @$concept_data_object_maps[$tc_id][$label]['total']++;
                if ($vetted_key !== null) {
                    @$concept_data_object_maps[$tc_id][$label][$vetted_key]++;
                }
            }
        }
        fclose($FILE);
        unlink($outfile);
        print "\n num_rows: {$num_rows}";

        // In test mode the query does not depend on $i; running it again for
        // every batch would add the same rows to the totals repeatedly.
        if ($test_mode) {
            break;
        }
    }

    self::save_to_json_file($concept_info_items, "concept_info_items");
    unset($concept_info_items);
    self::save_to_json_file($concept_references, "concept_references");
    unset($concept_references);
    //save map data to be accessed later
    self::save_to_json_file($concept_data_object_maps, "map_counts");
    unset($concept_data_object_maps);

    //convert associative array to a regular array
    $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
    $stat_keys_in_file = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');
    foreach ($concept_data_object_counts as $taxon_concept_id => $taxon_object_counts) {
        // Every field is preceded by a tab, so the value starts with one.
        $new_value = "";
        foreach ($data_type_order_in_file as $data_type) {
            foreach ($stat_keys_in_file as $stat_key) {
                $new_value .= "\t" . @$taxon_object_counts[$data_type][$stat_key];
            }
        }
        $concept_data_object_counts[$taxon_concept_id] = $new_value;
    }

    print "\n get_data_objects_count():" . (time_elapsed() - $time_start) / 60 . " minutes";
    self::save_totals_to_cumulative_txt($concept_data_object_counts, "tpm_data_objects");
    unset($concept_data_object_counts);
}
/**
 * Collects blurb candidates from the two tab-separated dump files under
 * DOC_ROOT/tmp and writes one line per taxon concept to $this->BLURBS_OUT.
 *
 * Steps:
 *  1. Make sure google_blurbs.txt exists, calling download_blurb_info() when
 *     it is missing; return without output if it still cannot be found.
 *  2. Load google_blurbs.txt (columns: data object id, rating, toc id,
 *     vetted id, hierarchy id, taxon concept id, license). Only trusted rows
 *     are kept. Hierarchies listed in $required_toc_id must carry the given
 *     toc id; hierarchy 431 (Wikipedia) is always skipped. Any other
 *     hierarchy passes through with a null attribution.
 *  3. Load google_blurbs_udo.txt, when present (columns: data object id,
 *     rating, toc id, vetted id, taxon concept id, given name, family name,
 *     license). These rows are attributed to "given family" and are not
 *     filtered by vetted id or toc id.
 *  4. Call add_blurb_descriptions(), then write
 *     "taxon_concept_id\tdescription\tattribution\tlicense\n" for every entry
 *     of $this->taxon_concept_blurbs in ascending key order.
 *
 * In both files a row whose second column (rating) is missing, empty or zero
 * is ignored. Rows from the second file overwrite rows from the first that
 * share a data object id.
 *
 * @return void
 */
public function lookup_blurbs()
{
    $blurbs_path = DOC_ROOT . '/tmp/google_blurbs.txt';
    $user_blurbs_path = DOC_ROOT . '/tmp/google_blurbs_udo.txt';

    if (!file_exists($blurbs_path)) {
        $this->download_blurb_info();
    }
    if (!file_exists($blurbs_path)) {
        return;
    }

    $trusted_id = Vetted::trusted()->id;
    $this->all_blurb_info = array();

    // hierarchy id => label written to the attribution column
    $hierarchy_attribution = array(
        120 => 'Animal Diversity Web',
        155 => 'ARKive',
        610 => 'Tree of Life',
        116 => 'BioPedia',
        119 => 'AmphibiaWeb',
        131 => 'Illinois Wildflowers',
        138 => 'USDA',
        140 => 'University of Alberta',
        431 => 'Wikipedia'
    );

    // hierarchy id => the only toc id accepted for that hierarchy
    $required_toc_id = array(
        120 => 267,
        155 => 308,
        610 => 2,
        116 => 308,
        119 => 308,
        131 => 308,
        138 => 308,
        140 => 308
    );

    foreach (new FileIterator($blurbs_path) as $line_number => $line) {
        // pad short rows so missing columns read as null instead of raising notices
        $row = array_pad(explode("\t", $line), 7, null);
        if (!$row[1]) {
            continue;
        }

        $data_object_id = $row[0];
        $data_rating = $row[1];
        $toc_id = $row[2];
        $vetted_id = $row[3];
        $hierarchy_id = $row[4];
        $taxon_concept_id = $row[5];
        $license = $row[6];

        // every object taken from this file must be trusted
        if ($vetted_id != $trusted_id) {
            continue;
        }

        // loose comparisons on purpose: the values read from the file are strings
        foreach ($required_toc_id as $restricted_hierarchy_id => $expected_toc_id) {
            if ($hierarchy_id == $restricted_hierarchy_id && $toc_id != $expected_toc_id) {
                continue 2;
            }
        }

        // Wikipedia objects are excluded altogether
        if ($hierarchy_id == 431) {
            continue;
        }

        $attribution = isset($hierarchy_attribution[$hierarchy_id])
            ? $hierarchy_attribution[$hierarchy_id]
            : null;

        $this->all_blurb_info[$data_object_id] = array(
            'data_object_id' => $data_object_id,
            'data_rating' => $data_rating,
            'toc_id' => $toc_id,
            'vetted_id' => $vetted_id,
            'attribution' => $attribution,
            'taxon_concept_id' => $taxon_concept_id,
            'license' => $license
        );
    }

    // the second dump is optional: skip it quietly when it was not produced
    if (file_exists($user_blurbs_path)) {
        foreach (new FileIterator($user_blurbs_path) as $line_number => $line) {
            $row = array_pad(explode("\t", $line), 8, null);
            if (!$row[1]) {
                continue;
            }

            $data_object_id = $row[0];
            $data_rating = $row[1];
            $toc_id = $row[2];
            $vetted_id = $row[3];
            $taxon_concept_id = $row[4];
            $given_name = $row[5];
            $family_name = $row[6];
            $license = $row[7];

            $this->all_blurb_info[$data_object_id] = array(
                'data_object_id' => $data_object_id,
                'data_rating' => $data_rating,
                'toc_id' => $toc_id,
                'vetted_id' => $vetted_id,
                'attribution' => trim($given_name . " " . $family_name),
                'taxon_concept_id' => $taxon_concept_id,
                'license' => $license
            );
        }
    }

    // NOTE(review): presumably this fills $this->taxon_concept_blurbs from
    // $this->all_blurb_info - confirm against add_blurb_descriptions()
    $this->add_blurb_descriptions();
    if (!$this->taxon_concept_blurbs) {
        return;
    }

    ksort($this->taxon_concept_blurbs);
    foreach ($this->taxon_concept_blurbs as $taxon_concept_id => $blurb_info) {
        fwrite(
            $this->BLURBS_OUT,
            "{$taxon_concept_id}\t" . $blurb_info['description'] . "\t" . $blurb_info['attribution'] . "\t" . $blurb_info['license'] . "\n"
        );
    }
}
/**
 * Prints an HTML table summarising the data objects of one harvest event,
 * broken down by data type and vetted status.
 *
 * The table has a header row (agent link, resource title, publication state,
 * harvest event id), one column group per data type with Unknown / Untrusted /
 * Trusted counts, a per-data-type total row, and a footer with the taxa count
 * and the overall number of data objects.
 *
 * @param int|string $harvest_event_id harvest event whose data objects are counted
 * @param int|float  $taxa_count       number shown as "Taxa count"
 * @param mixed      $published        handed to self::iif() as the condition and
 *                                     printed after "Published: "
 * @param string     $agent_name       link text for the agent
 * @param int|string $agent_id         agent id; 27 switches the report to the
 *                                     single IUCN data type
 * @param int        $ctr              row counter; odd values get an aqua background
 * @param string     $resource_title   printed in square brackets after the agent
 * @return string always the empty string; the report itself is printed
 */
function process_do($harvest_event_id, $taxa_count, $published, $agent_name, $agent_id, $ctr, $resource_title)
{
    if ($agent_id == 27) {
        $datatype = array(1 => array("label" => "IUCN", "id" => "6"));
    } else {
        $datatype = array(
            1 => array("label" => "Image", "id" => "1"),
            2 => array("label" => "Sound", "id" => "2"),
            3 => array("label" => "Text", "id" => "3"),
            4 => array("label" => "Video", "id" => "4"),
            5 => array("label" => "Flash", "id" => "7"),
            6 => array("label" => "YouTube", "id" => "8")
        );
    }

    // NOTE(review): the values returned by Vetted::find() are used as array
    // keys below, so they are assumed to be scalar ids - confirm
    $vetted_type = array(
        1 => array("id" => Vetted::find("unknown"), "label" => "Unknown"),
        2 => array("id" => Vetted::find("untrusted"), "label" => "Untrusted"),
        3 => array("id" => Vetted::find("trusted"), "label" => "Trusted")
    );

    // one empty bucket per (vetted status, data type) so every cell can be counted
    $do = array();
    foreach ($datatype as $type) {
        foreach ($vetted_type as $vetted) {
            $do[$vetted['id']][$type["id"]] = array();
        }
    }

    // the id is only ever an integer key, so cast it before it reaches the SQL
    $event_id = (int) $harvest_event_id;
    $qry = "Select data_objects.id, data_objects.data_type_id, data_objects.vetted_id"
        . " From data_objects_harvest_events"
        . " Inner Join data_objects ON data_objects_harvest_events.data_object_id = data_objects.id"
        . " Where data_objects_harvest_events.harvest_event_id = {$event_id}";
    $result = $this->mysqli->query($qry);
    while ($result && ($row = $result->fetch_assoc())) {
        // keyed by object id so a repeated row is counted once
        $do[$row["vetted_id"]][$row["data_type_id"]][$row["id"]] = true;
    }

    // flat list of cell counts (data type major, vetted status minor) and
    // per-data-type totals, keyed by data type id
    $counts = array();
    $sum = array();
    foreach ($datatype as $type) {
        $sum[$type["id"]] = 0;
        foreach ($vetted_type as $vetted) {
            $cell = count($do[$vetted['id']][$type["id"]]);
            $counts[] = $cell;
            $sum[$type["id"]] += $cell;
        }
    }

    $color = ($ctr % 2 == 0) ? '' : 'aqua';

    print "\n <table bgcolor='{$color}' cellpadding='3' cellspacing='0' border='1'"
        . " style='font-size : x-small; font-family : Arial Narrow;'> \n <tr><td colspan='24'>\n <table>\n <tr><td>\n"
        . " Agent: <a \ntarget='eol' href='http://www.eol.org/administrator/content_partner_report/show/{$agent_id}'>"
        . "{$agent_name}</a>\n [{$resource_title}] \n <font size='2'>"
        . self::iif($published, "Published: {$published}", "-not yet published-")
        . " Harvest event id: {$harvest_event_id}</font>\n </td></tr>\n </table>\n </td></tr> \n <tr align='center'>";

    // data type headings
    foreach ($datatype as $type) {
        print "<td colspan='3'>" . $type["label"] . "</td>";
    }
    print "</tr>";

    // vetted status headings, repeated under every data type
    print "\n <tr align='center'>";
    foreach ($datatype as $type) {
        foreach ($vetted_type as $vetted) {
            print "<td>" . $vetted['label'] . "</td>";
        }
    }
    print "</tr>";

    // individual cell counts
    print "\n <tr align='center'>";
    foreach ($counts as $cell) {
        print "<Td align='right'>" . $cell . "</td>";
    }
    print "</tr>";

    // totals per data type
    print "\n <tr align='center'>";
    foreach ($datatype as $type) {
        print "<td colspan='3' align='right'>" . number_format($sum[$type["id"]]) . "</td>";
    }
    print "</tr>";

    print " \n <tr><td colspan='24'>\n <table> \n <tr><td>Taxa count: </td><td align='right'>"
        . number_format($taxa_count, 0)
        . "</td></tr> \n <tr><td>Data objects: </td><td align='right'>"
        . number_format(array_sum($sum))
        . "</td></tr>\n </table>\n </td></tr> \n </table>";

    return "";
}