/**
 * Ingests one media row of a resource archive as a DataObject.
 *
 * The row is skipped (returns false) when the archive validator reported an
 * error for its line, when none of its taxon IDs were inserted earlier in
 * this harvest, when its identifier was already handled, when it has no
 * valid license, or when it lacks the content required for its data type.
 * Otherwise the object is passed to DataObject::find_and_compare(), which
 * hands back the object to use plus a status string, and the object is then
 * linked to its hierarchy entries / taxon concepts, info item, audience,
 * agents and (unless the status is "Unchanged") a resource_contributions row.
 *
 * @param array $row        Archive row keyed by term URI.
 * @param array $parameters Reads 'archive_table_definition' (object exposing
 *                          ->location) and 'archive_line_number'.
 * @return false|null false when the row is skipped; no value otherwise.
 */
public function insert_data_object($row, $parameters)
{
    self::debug_iterations("Inserting DataObject");
    $this->commit_iterations("DataObject", 20);

    if ($this->archive_validator->has_error_by_line('http://eol.org/schema/media/document', $parameters['archive_table_definition']->location, $parameters['archive_line_number'])) {
        write_to_resource_harvesting_log("ERROR: insert_data_object: has_error_by_line" . ",file_location:" . $parameters['archive_table_definition']->location . ",line_number:" . $parameters['archive_line_number']);
        return false;
    }

    // Collect the stored info of every taxon this object points at; objects
    // without at least one known taxon are not ingested.
    $object_taxon_ids = self::get_foreign_keys_from_row($row, 'http://rs.tdwg.org/dwc/terms/taxonID');
    $object_taxon_info = array();
    if ($object_taxon_ids) {
        foreach ($object_taxon_ids as $taxon_id) {
            if ($taxon_info = @$this->taxon_ids_inserted[$taxon_id]) {
                self::uncompress_array($taxon_info);
                $object_taxon_info[] = $taxon_info;
            }
        }
    }
    if (!$object_taxon_info) {
        return false;
    }

    if ($this->harvest_event->resource->is_eol_flickr_group() && self::is_this_flickr_image_in_inaturalist($row)) {
        return false;
    }

    $data_object = new DataObject();
    $data_object->identifier = @self::field_decode($row['http://purl.org/dc/terms/identifier']);
    // Each media identifier is ingested at most once per harvest.
    if (isset($this->media_ids_inserted[$data_object->identifier])) {
        return false;
    }

    $data_object->data_type = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://purl.org/dc/terms/type']));
    if ($dt = DataType::find_or_create_by_schema_value(@self::field_decode($row['http://rs.tdwg.org/audubon_core/subtype']))) {
        $data_object->data_subtype_id = $dt->id;
    }
    $data_object->mime_type = MimeType::find_or_create_by_translated_label(@self::field_decode($row['http://purl.org/dc/terms/format']));
    $data_object->object_created_at = @self::field_decode($row['http://ns.adobe.com/xap/1.0/CreateDate']);
    $data_object->object_modified_at = @self::field_decode($row['http://purl.org/dc/terms/modified']);
    $data_object->available_at = @self::field_decode($row['http://purl.org/dc/terms/available']);
    $data_object->object_title = @self::field_decode($row['http://purl.org/dc/terms/title']);
    $data_object->language = Language::find_or_create_for_parser(@self::field_decode($row['http://purl.org/dc/terms/language']));

    // check multiple fields for a value of license
    if (isset($row['http://purl.org/dc/terms/license'])) {
        $license_string = @self::field_decode($row['http://purl.org/dc/terms/license']);
    } else {
        $license_string = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/UsageTerms']);
    }
    // convert British licences to American licenses
    $license_string = str_replace("creativecommons.org/licences/", "creativecommons.org/licenses/", $license_string);
    // fall back to the resource-level license when the row has none
    if (!$license_string && $this->harvest_event->resource->license && $this->harvest_event->resource->license->source_url) {
        $license_string = $this->harvest_event->resource->license->source_url;
    }
    if (!$license_string || !\eol_schema\MediaResource::valid_license($license_string)) {
        return false;
    }
    $data_object->license = License::find_or_create_for_parser($license_string);

    $data_object->rights_statement = @self::field_decode($row['http://purl.org/dc/terms/rights']);
    $data_object->rights_holder = @self::field_decode($row['http://ns.adobe.com/xap/1.0/rights/Owner']);
    $data_object->bibliographic_citation = @self::field_decode($row['http://purl.org/dc/terms/bibliographicCitation']);
    $data_object->source_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/furtherInformationURL']);
    $data_object->derived_from = @self::field_decode($row['http://rs.tdwg.org/ac/terms/derivedFrom']);
    $data_object->description = @self::field_decode($row['http://purl.org/dc/terms/description']);
    // Turn newlines into paragraphs
    $data_object->description = str_replace("\n", "</p><p>", $data_object->description);
    $data_object->object_url = @self::field_decode($row['http://rs.tdwg.org/ac/terms/accessURI']);
    $data_object->thumbnail_url = @self::field_decode($row['http://eol.org/schema/media/thumbnailURL']);
    $data_object->location = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/LocationCreated']);
    $data_object->spatial_location = @self::field_decode($row['http://purl.org/dc/terms/spatial']);
    $data_object->latitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#lat']);
    $data_object->longitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#long']);
    $data_object->altitude = @self::field_decode($row['http://www.w3.org/2003/01/geo/wgs84_pos#alt']);

    $rating = @self::field_decode($row['http://ns.adobe.com/xap/1.0/Rating']);
    // ratings may be 0 to 5
    // TODO: technically 0 means untrusted, and then anywhere from 1-5 is OK.
    // 0.5 for example isn't really valid according to the schema
    if (is_numeric($rating) && $rating > 0 && $rating <= 5) {
        $data_object->data_rating = $rating;
    }

    //TODO - update this
    if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
        $data_object->data_type = DataType::youtube();
        $data_object->data_type_id = DataType::youtube()->id;
    }

    // Take the first available source_url of one of this object's taxa.
    // The original guard also tested $taxon_parameters["source_url"], a
    // variable that does not exist in this method, so the fallback could
    // never run; the guard now only checks the object itself.
    if (!@$data_object->source_url) {
        foreach ($object_taxon_info as $taxon_info) {
            if (!empty($taxon_info['source_url'])) {
                $data_object->source_url = $taxon_info['source_url'];
                break;
            }
        }
    }

    /* Checking requirements */
    // if text: must have description
    if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
        return false;
    }
    // if image, movie or sound: must have object_url
    if (($data_object->data_type->equals(DataType::video()) || $data_object->data_type->equals(DataType::sound()) || $data_object->data_type->equals(DataType::image())) && !$data_object->object_url) {
        return false;
    }

    /* ADDING THE DATA OBJECT */
    list($data_object, $status) = DataObject::find_and_compare($this->harvest_event->resource, $data_object, $this->content_manager);
    if (!@$data_object->id) {
        return false;
    }
    $this->media_ids_inserted[$data_object->identifier] = $data_object->id;
    $this->harvest_event->add_data_object($data_object, $status);

    // Re-create the links between the object and each of its taxa.
    $data_object->delete_hierarchy_entries();
    $vetted_id = Vetted::unknown()->id;
    $visibility_id = Visibility::preview()->id;
    foreach ($object_taxon_info as $taxon_info) {
        $he_id = $taxon_info['hierarchy_entry_id'];
        $tc_id = $taxon_info['taxon_concept_id'];
        $this->mysqli->insert("INSERT IGNORE INTO data_objects_hierarchy_entries (hierarchy_entry_id, data_object_id, vetted_id, visibility_id) VALUES ({$he_id}, {$data_object->id}, {$vetted_id}, {$visibility_id})");
        $this->mysqli->insert("INSERT IGNORE INTO data_objects_taxon_concepts (taxon_concept_id, data_object_id) VALUES ({$tc_id}, {$data_object->id})");
    }

    // a few things to add after the DataObject is inserted
    // keep track of reference foreign keys
    self::append_foreign_keys_from_row($row, 'http://eol.org/schema/reference/referenceID', $this->media_reference_ids, $data_object->id, $data_object->guid);
    // keep track of agent foreign keys
    self::append_foreign_keys_from_row($row, 'http://eol.org/schema/agent/agentID', $this->media_agent_ids, $data_object->id);

    $data_object->delete_info_items();
    $data_object->delete_table_of_contents();
    if ($subject = @self::field_decode($row['http://iptc.org/std/Iptc4xmpExt/1.0/xmlns/CVterm'])) {
        $info_item = InfoItem::find_or_create_by_schema_value($subject);
        $data_object->add_info_item($info_item->id);
        unset($info_item);
    }
    if ($audience_label = @self::field_decode($row['http://purl.org/dc/terms/audience'])) {
        $audience = Audience::find_or_create_by_translated_label(trim((string) $audience_label));
        $data_object->add_audience($audience->id);
        unset($audience);
    }

    $data_object_parameters = array("agents" => array());
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/creator', 'Creator');
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/publisher', 'Publisher');
    self::append_agents($row, $data_object_parameters, 'http://purl.org/dc/terms/contributor', 'Contributor');

    $data_object->delete_agents();
    $position = 0;
    foreach ($data_object_parameters['agents'] as $agent_params) {
        $agent = Agent::find_or_create($agent_params);
        // cache the agent's logo the first time it is seen
        if ($agent->logo_url && !$agent->logo_cache_url) {
            if ($logo_cache_url = $this->content_manager->grab_file($agent->logo_url, "partner")) {
                $agent->logo_cache_url = $logo_cache_url;
                $agent->save();
            }
        }
        $data_object->add_agent($agent->id, @$agent_params['agent_role']->id ?: 0, $position);
        $position++;
    }

    // references are only cleared once per object per harvest
    if (!isset($this->object_references_deleted[$data_object->id])) {
        $data_object->delete_refs();
        $this->object_references_deleted[$data_object->id] = true;
    }

    // add data object info to resource contribution
    if ($status != "Unchanged") {
        $result = $this->mysqli->query("SELECT id, source_url, taxon_concept_id, hierarchy_id, identifier FROM hierarchy_entries inner join data_objects_hierarchy_entries on hierarchy_entries.id = data_objects_hierarchy_entries.hierarchy_entry_id where data_object_id =" . $data_object->id);
        // A separate variable is used so the $row parameter is not clobbered.
        $entry = $result ? $result->fetch_assoc() : null;
        if ($entry) {
            $hierarchy_entry_id = $entry["id"];
            $source = "'" . $this->get_hierarchy_entry_outlink($entry["hierarchy_id"], $entry["identifier"], preg_replace('/\'/', "\\'", $entry["source_url"])) . "'";
            // escaped because the identifier is provider-supplied text
            $identifier = "'" . addslashes($entry["identifier"]) . "'";
            $taxon_concept_id = $entry["taxon_concept_id"];
            $resource_id = $this->harvest_event->resource_id;
            $this->mysqli->insert("INSERT IGNORE INTO resource_contributions (resource_id, data_object_id, data_point_uri_id, hierarchy_entry_id, taxon_concept_id, source, object_type, identifier, data_object_type) VALUES ({$resource_id}, {$data_object->id}, NULL, {$hierarchy_entry_id}, {$taxon_concept_id}, {$source}, 'data_object', {$identifier}, {$data_object->data_type_id})");
        } else {
            // Without a hierarchy entry the INSERT would be built from
            // undefined values and be syntactically invalid, so skip it.
            write_to_resource_harvesting_log("ERROR: insert_data_object: no hierarchy entry found for resource contribution" . ",data_object_id:" . $data_object->id);
        }
    }
}
/**
 * Converts one <taxon> XML element into an array of taxon parameters.
 *
 * The result holds the classification fields, the Name and Rank records,
 * common names, synonyms, agents, references and a "data_objects" list in
 * which every entry is array(DataObject, parameters-array).
 *
 * @param SimpleXMLElement $t        The taxon element.
 * @param object           $resource Read for ->language and ->title.
 * @return array|null The taxon parameters, or nothing when neither a
 *                    scientific name nor any classification rank is given.
 */
public static function read_taxon_xml($t, $resource)
{
    $xml_namespace = "http://www.w3.org/XML/1998/namespace";
    $geo_namespace = "http://www.w3.org/2003/01/geo/wgs84_pos#";
    // identifier attributes that may accompany a <reference> element
    $ref_identifier_labels = array("bici", "coden", "doi", "eissn", "handle", "issn", "isbn", "lsid", "oclc", "sici", "url", "urn");

    $taxon_dc = $t->children("http://purl.org/dc/elements/1.1/");
    $taxon_dcterms = $t->children("http://purl.org/dc/terms/");
    $taxon_dwc = $t->children("http://rs.tdwg.org/dwc/dwcore/");

    $taxon_parameters = array();
    $taxon_parameters["identifier"] = Functions::import_decode($taxon_dc->identifier);
    $taxon_parameters["source_url"] = Functions::import_decode($taxon_dc->source);
    $taxon_parameters["kingdom"] = Functions::import_decode($taxon_dwc->Kingdom);
    $taxon_parameters["phylum"] = Functions::import_decode($taxon_dwc->Phylum);
    $taxon_parameters["class"] = Functions::import_decode($taxon_dwc->Class);
    $taxon_parameters["order"] = Functions::import_decode($taxon_dwc->Order);
    $taxon_parameters["family"] = Functions::import_decode($taxon_dwc->Family);
    $taxon_parameters["genus"] = Functions::import_decode($taxon_dwc->Genus);
    $taxon_parameters["scientific_name"] = Functions::import_decode($taxon_dwc->ScientificName);
    $taxon_parameters["rank"] = Rank::find_or_create_by_translated_label(Functions::import_decode($t->rank));
    $taxon_parameters["taxon_created_at"] = trim($taxon_dcterms->created);
    $taxon_parameters["taxon_modified_at"] = trim($taxon_dcterms->modified);

    if ($taxon_parameters["scientific_name"]) {
        $taxon_parameters["name"] = Name::find_or_create_by_string($taxon_parameters["scientific_name"]);
    } else {
        // No scientific name: promote the most specific rank that has a
        // value to be the name, and blank that rank out.
        $promoted = false;
        foreach (array("genus", "family", "order", "class", "phylum", "kingdom") as $rank_key) {
            $rank_value = $taxon_parameters[$rank_key];
            if (!$rank_value) {
                continue;
            }
            $taxon_parameters["scientific_name"] = $rank_value;
            $taxon_parameters["name"] = Name::find_or_create_by_string($rank_value);
            $taxon_parameters[$rank_key] = "";
            $promoted = true;
            break;
        }
        if (!$promoted) {
            return;
        }
    }

    /* Common names */
    $taxon_parameters["common_names"] = array();
    foreach ($t->commonName as $common_name_xml) {
        $common_name = Functions::import_decode((string) $common_name_xml);
        if ($common_name) {
            $lang_attributes = $common_name_xml->attributes($xml_namespace);
            $taxon_parameters["common_names"][] = array(
                "name" => $common_name,
                "language" => Language::find_or_create_for_parser(@Functions::import_decode($lang_attributes["lang"]))
            );
        }
    }

    /* Synonyms */
    $taxon_parameters["synonyms"] = array();
    foreach ($t->synonym as $synonym_xml) {
        $synonym = Functions::import_decode((string) $synonym_xml);
        if ($synonym) {
            $synonym_attributes = $synonym_xml->attributes();
            // a synonym without an explicit relationship is a plain 'synonym'
            if (!@$synonym_attributes["relationship"]) {
                $synonym_attributes["relationship"] = 'synonym';
            }
            $taxon_parameters["synonyms"][] = array(
                "name" => Name::find_or_create_by_string($synonym),
                "synonym_relation" => SynonymRelation::find_or_create_by_translated_label(trim($synonym_attributes["relationship"]))
            );
        }
    }

    /* Taxon agents */
    $taxon_parameters["agents"] = array();
    foreach ($t->agent as $agent_xml) {
        if (!Functions::import_decode((string) $agent_xml)) {
            continue;
        }
        $agent_attributes = $agent_xml->attributes();
        $taxon_parameters["agents"][] = array(
            "full_name" => Functions::import_decode((string) $agent_xml, 0, 0),
            "homepage" => @Functions::import_decode($agent_attributes["homepage"]),
            "logo_url" => @Functions::import_decode($agent_attributes["logoURL"]),
            "agent_role" => AgentRole::find_or_create_by_translated_label(@trim($agent_attributes["role"]))
        );
    }

    /* Taxon references */
    $taxon_parameters["refs"] = array();
    foreach ($t->reference as $reference_xml) {
        $full_reference = Functions::import_decode((string) $reference_xml, 0, 0);
        if (!$full_reference) {
            continue;
        }
        $taxon_ref = Reference::find_or_create_by_full_reference($full_reference);
        $taxon_parameters["refs"][] = $taxon_ref;
        $reference_attributes = $reference_xml->attributes();
        foreach ($ref_identifier_labels as $label) {
            $identifier_value = @Functions::import_decode($reference_attributes[$label], 0, 0);
            if ($identifier_value) {
                $identifier_type = RefIdentifierType::find_or_create_by_label($label);
                $taxon_ref->add_ref_identifier(@$identifier_type->id ?: 0, $identifier_value);
            }
        }
    }

    /* Data objects */
    $taxon_parameters["data_objects"] = array();
    foreach ($t->dataObject as $d) {
        $object_dc = $d->children("http://purl.org/dc/elements/1.1/");
        $object_dcterms = $d->children("http://purl.org/dc/terms/");
        $object_geo = $d->children($geo_namespace);

        $data_object = new DataObject();
        $data_object->identifier = Functions::import_decode($object_dc->identifier);
        $data_object->data_type = DataType::find_or_create_by_schema_value(Functions::import_decode($d->dataType));
        $data_object->mime_type = MimeType::find_or_create_by_translated_label(Functions::import_decode($d->mimeType));
        $data_object->object_created_at = Functions::import_decode($object_dcterms->created);
        $data_object->object_modified_at = Functions::import_decode($object_dcterms->modified);
        $data_object->object_title = Functions::import_decode($object_dc->title, 0, 0);
        $data_object->language = Language::find_or_create_for_parser(Functions::import_decode($object_dc->language));
        $data_object->license = License::find_or_create_for_parser(Functions::import_decode($d->license));
        $data_object->rights_statement = Functions::import_decode($object_dc->rights, 0, 0);
        $data_object->rights_holder = Functions::import_decode($object_dcterms->rightsHolder, 0, 0);
        $data_object->bibliographic_citation = Functions::import_decode($object_dcterms->bibliographicCitation, 0, 0);
        $data_object->source_url = Functions::import_decode($object_dc->source);
        $data_object->description = Functions::import_decode($object_dc->description, 0, 0);
        $data_object->object_url = Functions::import_decode($d->mediaURL);
        $data_object->thumbnail_url = Functions::import_decode($d->thumbnailURL);
        $data_object->location = Functions::import_decode($d->location, 0, 0);

        if (@$d->additionalInformation) {
            $data_object->additional_information = (array) $d->additionalInformation;
        }
        // only ratings above 0 and up to 5 are kept
        $rating = (string) @$d->additionalInformation->rating;
        if ($rating && is_numeric($rating) && $rating > 0 && $rating <= 5) {
            $data_object->data_rating = $rating;
        }
        if (($subtype = @$d->additionalInformation->subtype) && ($subtype_record = DataType::find_or_create_by_schema_value(Functions::import_decode($subtype)))) {
            $data_object->data_subtype_id = $subtype_record->id;
        }

        $data_object_parameters = array();

        // language fallbacks: xml:lang of the description, then the resource
        if (!$data_object->language) {
            $description_attributes = $object_dc->description->attributes($xml_namespace);
            $data_object->language = Language::find_or_create_for_parser(@Functions::import_decode($description_attributes["lang"]));
        }
        if (!$data_object->language && $resource->language) {
            $data_object->language = $resource->language;
        }

        //TODO - update this
        if ($data_object->mime_type && $data_object->mime_type->equals(MimeType::flash()) && $data_object->is_video()) {
            $data_object->data_type = DataType::youtube();
            $data_object->data_type_id = DataType::youtube()->id;
        }

        // take the taxon's source_url if none present
        if (!@$data_object->source_url && @$taxon_parameters["source_url"]) {
            $data_object->source_url = $taxon_parameters["source_url"];
        }

        // Turn newlines into paragraphs
        $data_object->description = str_replace("\n", "</p><p>", $data_object->description);

        /* Checking requirements */
        // if text: must have description
        if ($data_object->data_type->equals(DataType::text()) && !$data_object->description) {
            continue;
        }
        // if image, movie or sound: must have object_url
        $needs_object_url = $data_object->data_type->equals(DataType::video())
            || $data_object->data_type->equals(DataType::sound())
            || $data_object->data_type->equals(DataType::image());
        if ($needs_object_url && !$data_object->object_url) {
            continue;
        }

        // coordinates default to 0 and are overwritten by each geo Point
        $data_object->latitude = 0;
        $data_object->longitude = 0;
        $data_object->altitude = 0;
        foreach ($object_geo->Point as $point_xml) {
            $point_geo = $point_xml->children($geo_namespace);
            $data_object->latitude = Functions::import_decode($point_geo->lat);
            $data_object->longitude = Functions::import_decode($point_geo->long);
            $data_object->altitude = Functions::import_decode($point_geo->alt);
        }

        $data_object_parameters["agents"] = array();
        foreach ($d->agent as $agent_xml) {
            if (!Functions::import_decode((string) $agent_xml)) {
                continue;
            }
            $agent_attributes = $agent_xml->attributes();
            $data_object_parameters["agents"][] = array(
                "full_name" => Functions::import_decode((string) $agent_xml, 0, 0),
                "homepage" => @Functions::import_decode($agent_attributes["homepage"]),
                "logo_url" => @Functions::import_decode($agent_attributes["logoURL"]),
                "agent_role" => AgentRole::find_or_create_by_translated_label(@trim($agent_attributes["role"]))
            );
        }

        $data_object_parameters["audiences"] = array();
        foreach ($d->audience as $audience_xml) {
            $data_object_parameters["audiences"][] = Audience::find_or_create_by_translated_label(trim((string) $audience_xml));
        }

        $data_object_parameters["info_items"] = array();
        foreach ($d->subject as $subject_xml) {
            $data_object_parameters["info_items"][] = InfoItem::find_or_create_by_schema_value(trim((string) $subject_xml));
        }
        // a subject inside additionalInformation replaces the regular subjects
        if (($extra_subject = @$d->additionalInformation->subject) && ($extra_info_item = InfoItem::find_or_create_by_schema_value(trim((string) $extra_subject)))) {
            $data_object_parameters["info_items"] = array($extra_info_item);
        }

        // EXCEPTIONS
        if ($data_object->is_text()) {
            if ($resource->title == "BOLD Systems Resource") {
                // EXCEPTION - overriding the subject for BOLD
                $data_object_parameters["info_items"] = array(InfoItem::find_or_create_by_schema_value('http://www.eol.org/voc/table_of_contents#Barcode'));
            } elseif ($resource->title == "Wikipedia") {
                // EXCEPTION - overriding the subject for Wikipedia
                $data_object_parameters["info_items"] = array(InfoItem::find_or_create_by_schema_value('http://www.eol.org/voc/table_of_contents#Wikipedia'));
            } elseif ($resource->title == "IUCN Red List" && $data_object->object_title == "IUCNConservationStatus") {
                // EXCEPTION - overriding the data type for IUCN text
                $data_object->data_type_id = DataType::iucn()->id;
                $data_object->data_type = DataType::iucn();
            }
        }

        $data_object_parameters["refs"] = array();
        foreach ($d->reference as $reference_xml) {
            $full_reference = Functions::import_decode((string) $reference_xml, 0, 0);
            if (!$full_reference) {
                continue;
            }
            $object_ref = Reference::find_or_create_by_full_reference($full_reference);
            $data_object_parameters["refs"][] = $object_ref;
            $reference_attributes = $reference_xml->attributes();
            foreach ($ref_identifier_labels as $label) {
                $identifier_value = @Functions::import_decode($reference_attributes[$label], 0, 0);
                if ($identifier_value) {
                    $identifier_type = RefIdentifierType::find_or_create_by_label($label);
                    $object_ref->add_ref_identifier(@$identifier_type->id ?: 0, $identifier_value);
                }
            }
        }

        $taxon_parameters["data_objects"][] = array($data_object, $data_object_parameters);
        unset($data_object);
    }

    return $taxon_parameters;
}
/**
 * Counts published, visible data objects (and their word counts) per taxon
 * concept, split by data type and vetted status.
 *
 * Taxon concepts are processed in ID batches. Each batch query is dumped to
 * an outfile and read back line by line as tab-separated fields. Rows whose
 * data subtype is "map" are tallied separately. Afterwards the info items,
 * references and map counts are written with save_to_json_file(), and the
 * per-type counts are flattened to one tab-separated string per concept and
 * passed to save_totals_to_cumulative_txt().
 *
 * NOTE(review): every row returned by the query is counted, so a data object
 * joined to several info items / references contributes once per joined row.
 * Confirm whether that is intended.
 *
 * @param int $batch_size Number of taxon concept IDs covered by each query.
 * @return void
 */
function get_data_objects_count($batch_size = 100000)
{
    $time_start = time_elapsed();
    $concept_data_object_counts = array();
    $concept_data_object_maps = array();
    $concept_info_items = array();
    $concept_references = array();

    $image_id = DataType::image()->id;
    $map_id = DataType::map()->id;
    $text_id = DataType::text()->id;
    $video_id = DataType::video()->id;
    $sound_id = DataType::sound()->id;
    $flash_id = DataType::flash()->id;
    $youtube_id = DataType::youtube()->id;
    $iucn_id = DataType::iucn()->id;

    $data_type_label = array();
    $data_type_label[$image_id] = 'image';
    $data_type_label[$sound_id] = 'sound';
    $data_type_label[$text_id] = 'text';
    $data_type_label[$video_id] = 'video';
    $data_type_label[$iucn_id] = 'iucn';
    $data_type_label[$flash_id] = 'flash';
    $data_type_label[$youtube_id] = 'youtube';

    $trusted_id = Vetted::trusted()->id;
    $untrusted_id = Vetted::untrusted()->id;
    $unreviewed_id = Vetted::unknown()->id;

    $test_mode = isset($GLOBALS['test_taxon_concept_ids']);

    for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
        print "\n dataObjects, its infoItems, its references [2 of 14] {$i} \n";

        // The same concept restriction applies to both halves of the UNION.
        if ($test_mode) {
            $concept_filter = " and dotc.taxon_concept_id IN (" . implode(",", $GLOBALS['test_taxon_concept_ids']) . ")";
        } else {
            // BETWEEN is inclusive on both ends, so the upper bound must stop
            // one short of the next batch's start; otherwise the boundary
            // concept would be counted in two consecutive batches.
            $concept_filter = " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
        }

        $sql = "SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, dohe.vetted_id, do.data_subtype_id\r\n FROM data_objects_taxon_concepts dotc \r\n JOIN data_objects do ON dotc.data_object_id = do.id \r\n LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n JOIN data_objects_hierarchy_entries dohe on do.id = dohe.data_object_id\r\n WHERE do.published=1 AND dohe.visibility_id=" . Visibility::visible()->id . " AND dohe.vetted_id != {$untrusted_id} "; //." AND do.data_type_id <> $image_id "; this has to be removed to count maps
        $sql .= $concept_filter;
        $sql .= "\r\n UNION\r\n SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, udo.vetted_id, do.data_subtype_id\r\n FROM data_objects_taxon_concepts dotc \r\n JOIN data_objects do ON dotc.data_object_id = do.id \r\n LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id \r\n LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id \r\n JOIN users_data_objects udo on do.id = udo.data_object_id\r\n WHERE do.published=1 AND udo.visibility_id=" . Visibility::visible()->id . "\r\n ";
        $sql .= $concept_filter;

        $outfile = $this->mysqli_slave->select_into_outfile($sql);
        $FILE = fopen($outfile, "r");
        if (!$FILE) {
            print "!! ERROR: Could not read {$outfile}";
            debug("!! ERROR: Could not read {$outfile}");
            return;
        }

        $num_rows = 0;
        while (!feof($FILE)) {
            if ($line = fgets($FILE)) {
                $num_rows++;
                $line = trim($line);
                $fields = explode("\t", $line);
                $tc_id = trim($fields[0]);
                $data_type_id = trim($fields[1]);
                $info_item_id = trim($fields[2]);
                $ref_id = trim($fields[3]);
                $description = trim($fields[4]);
                $vetted_id = trim($fields[5]);
                $data_subtype_id = trim($fields[6]);
                $label = @$data_type_label[$data_type_id];

                if ($data_subtype_id != $map_id) {
                    // @ silences the notices raised when a counter is
                    // incremented before it exists
                    $words_count = str_word_count(strip_tags($description), 0);
                    @$concept_data_object_counts[$tc_id][$label]['total']++;
                    @($concept_data_object_counts[$tc_id][$label]['total_w'] += $words_count);
                    if ($vetted_id == $trusted_id) {
                        @$concept_data_object_counts[$tc_id][$label]['t']++;
                        @($concept_data_object_counts[$tc_id][$label]['t_w'] += $words_count);
                    } elseif ($vetted_id == $untrusted_id) {
                        @$concept_data_object_counts[$tc_id][$label]['ut']++;
                        @($concept_data_object_counts[$tc_id][$label]['ut_w'] += $words_count);
                    } elseif ($vetted_id == $unreviewed_id) {
                        @$concept_data_object_counts[$tc_id][$label]['ur']++;
                        @($concept_data_object_counts[$tc_id][$label]['ur_w'] += $words_count);
                    }
                    // used as sets: the key is what matters, the value is unused
                    $concept_info_items[$tc_id][$info_item_id] = '';
                    $concept_references[$tc_id][$ref_id] = '';
                } else {
                    @$concept_data_object_maps[$tc_id][$label]['total']++;
                    if ($vetted_id == $trusted_id) {
                        @$concept_data_object_maps[$tc_id][$label]['t']++;
                    } elseif ($vetted_id == $untrusted_id) {
                        @$concept_data_object_maps[$tc_id][$label]['ut']++;
                    } elseif ($vetted_id == $unreviewed_id) {
                        @$concept_data_object_maps[$tc_id][$label]['ur']++;
                    }
                }
            }
        }
        fclose($FILE);
        unlink($outfile);
        print "\n num_rows: {$num_rows}";

        // In test mode the query does not depend on $i, so running further
        // batches would add the same rows to the totals again.
        if ($test_mode) {
            break;
        }
    }

    self::save_to_json_file($concept_info_items, "concept_info_items");
    unset($concept_info_items);
    self::save_to_json_file($concept_references, "concept_references");
    unset($concept_references);
    //save map data to be accessed later
    self::save_to_json_file($concept_data_object_maps, "map_counts");
    unset($concept_data_object_maps);

    //convert associative array to a regular array
    $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
    $stat_keys_in_order = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');
    foreach ($concept_data_object_counts as $taxon_concept_id => $taxon_object_counts) {
        $new_value = "";
        foreach ($data_type_order_in_file as $data_type) {
            foreach ($stat_keys_in_order as $stat_key) {
                // every value is preceded by a tab; missing counters stay empty
                $new_value .= "\t" . @$taxon_object_counts[$data_type][$stat_key];
            }
        }
        $concept_data_object_counts[$taxon_concept_id] = $new_value;
    }

    print "\n get_data_objects_count():" . (time_elapsed() - $time_start) / 60 . " minutes";
    self::save_totals_to_cumulative_txt($concept_data_object_counts, "tpm_data_objects");
    unset($concept_data_object_counts);
}
/**
 * Counts published, visible, non-image data objects (and their word counts)
 * per taxon concept, split by data type and vetted status.
 *
 * Taxon concepts are processed in ID batches. Within a batch each data
 * object is counted once per taxon concept, and the per-type counters are
 * flattened to one tab-separated string per concept and handed to
 * save_category_stats() before the next batch starts.
 *
 * NOTE(review): $concept_info_items / $concept_references are filled only
 * from the first row seen for each data object and are not saved (the code
 * that saved them is commented out at the end of the method).
 *
 * @param int $batch_size Number of taxon concept IDs covered by each query.
 * @return void
 */
function get_data_objects_count($batch_size = 100000)
{
    $image_id = DataType::image()->id;
    $text_id = DataType::text()->id;
    $video_id = DataType::video()->id;
    $sound_id = DataType::sound()->id;
    $flash_id = DataType::flash()->id;
    $youtube_id = DataType::youtube()->id;
    $iucn_id = DataType::iucn()->id;

    $data_type_label = array();
    $data_type_label[$text_id] = 'text';
    $data_type_label[$video_id] = 'video';
    $data_type_label[$sound_id] = 'sound';
    $data_type_label[$flash_id] = 'flash';
    $data_type_label[$youtube_id] = 'youtube';
    $data_type_label[$iucn_id] = 'iucn';
    $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
    $stat_keys_in_order = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');

    $trusted_id = Vetted::trusted()->id;
    $untrusted_id = Vetted::untrusted()->id;
    $unreviewed_id = Vetted::unknown()->id;

    $raw_stats = array();
    $concept_info_items = array();
    $concept_references = array();

    for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
        $this->print_status($i, $batch_size);

        $sql = "SELECT do.guid,\n dotc.taxon_concept_id,\n do.data_type_id,\n doii.info_item_id,\n dor.ref_id,\n REPLACE(REPLACE(do.description, '\\\\n', ' '), '\\\\r', ' '),\n dohe.vetted_id,\n do.id\n FROM data_objects_taxon_concepts dotc\n STRAIGHT_JOIN data_objects do ON (dotc.data_object_id = do.id)\n JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)\n LEFT JOIN data_objects_info_items doii ON (do.id = doii.data_object_id)\n LEFT JOIN data_objects_refs dor ON (do.id = dor.data_object_id)\n WHERE do.published = 1 AND dohe.visibility_id = " . Visibility::visible()->id . " AND do.data_type_id != {$image_id}";
        if ($this->test_taxon_concept_ids) {
            $sql .= " AND dotc.taxon_concept_id IN (" . $this->test_taxon_concept_ids . ")";
        } else {
            // BETWEEN is inclusive on both ends, so stop one short of the
            // next batch's start to keep batches from overlapping.
            $sql .= " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
        }

        $counted_data_objects = array();
        foreach ($this->mysqli_slave->iterate_file($sql) as $row_number => $row) {
            // Column 0 is do.guid (not used here); the remaining indexes
            // follow the SELECT list above.
            $taxon_concept_id = trim($row[1]);
            $data_type_id = trim($row[2]);
            $info_item_id = trim($row[3]);
            $ref_id = trim($row[4]);
            $description = trim($row[5]);
            $vetted_id = trim($row[6]);
            $data_object_id = trim($row[7]);

            // the joins can return one object several times; count it once
            if (isset($counted_data_objects[$taxon_concept_id][$data_object_id])) {
                continue;
            }
            $counted_data_objects[$taxon_concept_id][$data_object_id] = 1;

            $label = @$data_type_label[$data_type_id];
            $words_count = str_word_count(strip_tags($description), 0);
            // @ silences the notices raised when a counter is incremented
            // before it exists
            @$raw_stats[$taxon_concept_id][$label]['total']++;
            @($raw_stats[$taxon_concept_id][$label]['total_w'] += $words_count);
            if ($vetted_id == $trusted_id) {
                @$raw_stats[$taxon_concept_id][$label]['t']++;
                @($raw_stats[$taxon_concept_id][$label]['t_w'] += $words_count);
            } elseif ($vetted_id == $untrusted_id) {
                @$raw_stats[$taxon_concept_id][$label]['ut']++;
                @($raw_stats[$taxon_concept_id][$label]['ut_w'] += $words_count);
            } elseif ($vetted_id == $unreviewed_id) {
                @$raw_stats[$taxon_concept_id][$label]['ur']++;
                @($raw_stats[$taxon_concept_id][$label]['ur_w'] += $words_count);
            }
            $concept_info_items[$taxon_concept_id][$info_item_id] = '';
            $concept_references[$taxon_concept_id][$ref_id] = '';
        }

        foreach ($raw_stats as $taxon_concept_id => $stats) {
            # the stats need to go into the file in a certain order to be imported into the MySQL table
            # all data types are collected; previously each type overwrote the one before it,
            # leaving only the last type in the saved value
            $values = array();
            foreach ($data_type_order_in_file as $data_type) {
                foreach ($stat_keys_in_order as $stat_key) {
                    $values[] = @$stats[$data_type][$stat_key];
                }
            }
            // tab-separated, no leading tab; missing counters stay empty
            $raw_stats[$taxon_concept_id] = implode("\t", $values);
        }
        $this->save_category_stats($raw_stats, "get_data_objects_count");
        $raw_stats = array();

        // the test query does not depend on $i, so one pass is enough
        if ($this->test_taxon_concept_ids) {
            break;
        }
    }
    // $this->save_to_json_file($concept_info_items, "concept_info_items");
    // unset($concept_info_items);
    //
    // $this->save_to_json_file($concept_references, "concept_references");
    // unset($concept_references);
}
/**
 * Collects the site-wide EOL statistics into one associative array, stores
 * them as a single new row in the `eol_statistics` table and prints them.
 *
 * Each key of the array is used verbatim as a column name of the INSERT, so
 * keys must stay in sync with the table definition. The elapsed time of every
 * section is printed (labelled in minutes) as a progress/timing indicator.
 *
 * @return void
 */
public function save_eol_stats()
{
    $stats = array();

    // Overall Statistics
    $time_start = time_elapsed();
    // Number of members
    $stats['members_count'] = $this->members_count();
    // Number of communities
    $stats['communities_count'] = $this->communities_count();
    // Number of collections
    $stats['collections_count'] = $this->collections_count();
    // Total number of pages
    $stats['pages_count'] = $this->pages_count();
    // as currently reported on home page; assume this means pages with at least one data object
    $stats['pages_with_content'] = $this->pages_with_content();
    $stats['pages_with_text'] = $this->pages_with_text();
    $stats['pages_with_image'] = $this->pages_with_image();
    $stats['pages_with_map'] = $this->pages_with_map();
    $stats['pages_with_video'] = $this->pages_with_video();
    $stats['pages_with_sound'] = $this->pages_with_sound();
    // Derived counts reuse the values gathered above instead of querying again,
    // so they always agree with the stored 'pages_with_*' figures.
    $stats['pages_without_text'] = $stats['pages_count'] - $stats['pages_with_text'];
    $stats['pages_without_image'] = $stats['pages_count'] - $stats['pages_with_image'];
    $stats['pages_with_image_no_text'] = $this->pages_with_image_no_text();
    $stats['pages_with_text_no_image'] = $this->pages_with_text_no_image();
    // base pages - pages without any data objects; base pages may have references and BHL/content partner links
    $stats['base_pages'] = $this->pages_without_content_with_other_info();
    print "\n Overall stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Trusted Content Statistics - note change in terminology, phasing out vetted in favor of trusted
    $time_start = time_elapsed();
    // Number of pages with at least one trusted data object
    $stats['pages_with_at_least_a_trusted_object'] = $this->pages_with_at_least_a_trusted_object();
    $stats['pages_with_at_least_a_curatorial_action'] = $this->pages_curated();
    print "\n Trusted content stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // BHL Statistics
    $time_start = time_elapsed();
    $stats['pages_with_BHL_links'] = $this->pages_with_BHL_links();
    $stats['pages_with_BHL_links_no_text'] = $this->pages_with_BHL_links_no_text();
    $stats['pages_with_BHL_links_only'] = $this->pages_with_BHL_links_only();
    print "\n BHL stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Content Partner Statistics
    $time_start = time_elapsed();
    // Number of publicly listed partners - as shown on home page (This includes all published partners
    // and a few partners that have been listed although they are not yet sharing content, e.g., some international partners)
    $stats['content_partners'] = $this->content_partners();
    // Number of partners with published resources
    $stats['content_partners_with_published_resources'] = $this->content_partners_with_published_resources();
    // Number of partners with published trusted resources
    $stats['content_partners_with_published_trusted_resources'] = $this->content_partners_with_published_resources(1);
    // Total number of published resources
    $stats['published_resources'] = $this->published_resources();
    // Number of published trusted resources
    $stats['published_trusted_resources'] = $this->published_resources("1");
    // Number of published unreviewed resources
    $stats['published_unreviewed_resources'] = $this->published_resources("0");
    // Number of resources published for the first time in the last 30 days
    $stats['newly_published_resources_in_the_last_30_days'] = $this->published_resources_in_the_last_n_days(30);
    print "\n Content partner stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Page Richness Statistics
    $time_start = time_elapsed();
    // % of all pages (total number of taxon concepts) that are rich - with a score of 40 or more
    $stats['rich_pages'] = $this->rich_pages();
    $hotlist_taxon_concept_ids = self::get_collections_taxon_concept_ids(array(self::HOTLIST_COLLECTION_ID));
    $stats['hotlist_pages'] = count($hotlist_taxon_concept_ids);
    // % pages on the hotlist that are rich - the hotlist is the collection self::HOTLIST_COLLECTION_ID
    $stats['rich_hotlist_pages'] = $this->get_rich_pages($hotlist_taxon_concept_ids);
    $redhotlist_taxon_concept_ids = self::get_collections_taxon_concept_ids(
        array(self::REDHOTLIST_PENDING_COLLECTION_ID, self::REDHOTLIST_COLLECTION_ID)
    );
    $stats['redhotlist_pages'] = count($redhotlist_taxon_concept_ids);
    // % pages on the redhotlist that are rich - the redhotlist is the combined list of taxa of these two collections
    $stats['rich_redhotlist_pages'] = $this->get_rich_pages($redhotlist_taxon_concept_ids);
    // % of all pages that are not rich but have at least some content (score 10-39)
    $stats['pages_with_score_10_to_39'] = $this->not_so_rich_pages();
    // % of all pages that are base-like pages (score <10)
    $stats['pages_with_score_less_than_10'] = $this->not_rich_pages();
    print "\n Page richness stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Curatorial Stats
    $time_start = time_elapsed();
    // Return values are discarded here; presumably these calls prime cached
    // activity-id lists used by the curator queries below - TODO confirm.
    $this->data_object_curation_activity_ids();
    $this->name_curation_activity_ids();
    $this->taxa_curation_activity_ids();
    $this->curation_activity_ids();
    // Number of registered assistant curators
    $stats['curators_assistant'] = $this->curators($this->assistant_curator_id);
    // Number of registered full curators
    $stats['curators_full'] = $this->curators($this->full_curator_id);
    // Number of registered master curators
    $stats['curators_master'] = $this->curators($this->master_curator_id);
    // Number of registered curators
    $stats['curators'] = $stats['curators_assistant'] + $stats['curators_full'] + $stats['curators_master'];
    $stats['active_curators'] = count($this->curators_active());
    // number of pages curated by active curators
    // NOTE(review): reads the $this->curators_active property, presumably
    // populated by the curators_active() call just above - confirm.
    $stats['pages_curated_by_active_curators'] = $this->pages_curated($this->curators_active);
    $stats['objects_curated_in_the_last_30_days'] = $this->objects_curated_in_the_last_n_days(30);
    $stats['curator_actions_in_the_last_30_days'] = $this->curator_actions_in_the_last_n_days(30);
    print "\n Curatorial Stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // LifeDesk stats
    $time_start = time_elapsed();
    $stats['lifedesk_taxa'] = $this->lifedesk_taxa();
    $stats['lifedesk_data_objects'] = $this->lifedesk_data_objects();
    print "\n LifeDesk stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Marine stats
    $time_start = time_elapsed();
    $stats['marine_pages'] = $this->marine_pages();
    $stats['marine_pages_in_col'] = $this->marine_pages_in_col();
    $stats['marine_pages_with_objects'] = $this->marine_pages_with_objects();
    $stats['marine_pages_with_objects_vetted'] = $this->marine_pages_with_objects($this->trusted_id);
    print "\n Marine stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // User-submitted text
    $time_start = time_elapsed();
    // Number of user submitted text (published)
    $stats['udo_published'] = $this->udo_published();
    // Number of text objects added by curators - assistant, full, or master curators
    $stats['udo_published_by_curators'] = $this->udo_published_by_curators();
    // Number of text objects added by non-curators
    $stats['udo_published_by_non_curators'] = $stats['udo_published'] - $stats['udo_published_by_curators'];
    print "\n UDO stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Data Object Statistics
    $time_start = time_elapsed();
    $stats['data_objects'] = $this->count_data_objects();
    $stats['data_objects_texts'] = $this->count_data_objects(array(DataType::text()->id));
    $stats['data_objects_images'] = $this->count_data_objects(array(DataType::image()->id));
    // videos are counted across the video, flash and youtube data types
    $stats['data_objects_videos'] = $this->count_data_objects(
        array(DataType::video()->id, DataType::flash()->id, DataType::youtube()->id)
    );
    $stats['data_objects_sounds'] = $this->count_data_objects(array(DataType::sound()->id));
    $stats['data_objects_maps'] = $this->count_data_objects(array(DataType::map()->id));
    $stats['data_objects_trusted'] = count($this->count_data_objects_vettedness_list($this->trusted_id));
    $stats['data_objects_unreviewed'] = count($this->count_data_objects_vettedness_list($this->unknown_id));
    $stats['data_objects_untrusted'] = count($this->count_data_objects_vettedness_list($this->untrusted_id));
    $stats['data_objects_trusted_or_unreviewed_but_hidden'] = count($this->data_objects_trusted_or_unreviewed_but_hidden_list());
    print "\n Data object stats: " . ((time_elapsed() - $time_start) / 60) . " minutes";

    // Structured data totals (this section is not timed)
    $stats['total_triples'] = $this->total_triples();
    $stats['total_occurrences'] = $this->total_occurrences();
    $stats['total_measurements'] = $this->total_measurements();
    $stats['total_associations'] = $this->total_associations();
    $stats['total_measurement_types'] = $this->total_measurement_types();
    $stats['total_association_types'] = $this->total_association_types();
    $stats['total_taxa_with_data'] = $this->total_taxa_with_data();
    $stats['total_user_added_data'] = $this->total_user_added_data();
    $stats['created_at'] = date('Y-m-d H:i:s');

    // Keys become the column list, values the quoted value list. The separator
    // must be the FIRST argument of implode(): the legacy (array, separator)
    // order was removed in PHP 8.0 and throws a TypeError there.
    $column_list = implode(",", array_keys($stats));
    $value_list = implode("','", $stats);
    $this->mysqli->insert("INSERT INTO eol_statistics (" . $column_list . ") VALUES ('" . $value_list . "')");
    print_r($stats);
}