/**
 * Tallies, per taxon concept, the published and visible data objects (and the
 * word counts of their descriptions) by data type and vetted status, and
 * collects the info item ids and reference ids attached to those objects.
 *
 * Taxon concepts are processed in id ranges of $batch_size between
 * $this->min_taxon_concept_id and $this->max_taxon_concept_id. When
 * $GLOBALS['test_taxon_concept_ids'] is set, only those ids are queried and a
 * single pass is made. Rows whose data_subtype_id equals the map data type id
 * are tallied separately as maps.
 *
 * Results are handed to save_to_json_file() ("concept_info_items",
 * "concept_references", "map_counts") and to save_totals_to_cumulative_txt()
 * ("tpm_data_objects").
 *
 * NOTE(review): counts are incremented once per result row; the LEFT JOINs on
 * info items and references can yield several rows for one data object, so the
 * totals may be higher than the number of distinct objects -- confirm whether
 * that is intended.
 *
 * @param int $batch_size Number of taxon concept ids covered by each query.
 * @return void Returns early, without saving anything, if a query outfile cannot be opened.
 */
function get_data_objects_count($batch_size = 100000)
 {
     $time_start = time_elapsed();
     $concept_data_object_counts = array();
     $concept_data_object_maps = array();
     $concept_info_items = array();
     $concept_references = array();

     $map_id = DataType::map()->id;
     // data type id => label used as the key in the tally arrays
     $data_type_label = array();
     $data_type_label[DataType::image()->id] = 'image';
     $data_type_label[DataType::sound()->id] = 'sound';
     $data_type_label[DataType::text()->id] = 'text';
     $data_type_label[DataType::video()->id] = 'video';
     $data_type_label[DataType::iucn()->id] = 'iucn';
     $data_type_label[DataType::flash()->id] = 'flash';
     $data_type_label[DataType::youtube()->id] = 'youtube';

     $trusted_id = Vetted::trusted()->id;
     $untrusted_id = Vetted::untrusted()->id;
     $unreviewed_id = Vetted::unknown()->id;
     $visible_id = Visibility::visible()->id;
     $test_mode = isset($GLOBALS['test_taxon_concept_ids']);

     for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
         print "\n dataObjects, its infoItems, its references [2 of 14] {$i} \n";

         if ($test_mode) {
             $taxon_filter = " and dotc.taxon_concept_id IN (" . implode(",", $GLOBALS['test_taxon_concept_ids']) . ")";
         } else {
             // BETWEEN is inclusive on both ends, so the upper bound must stop one short of
             // the next batch's first id; otherwise the boundary concept is counted twice.
             $taxon_filter = " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
         }

         // First SELECT: objects linked through hierarchy entries (untrusted ones excluded).
         // Second SELECT: user-submitted objects. Images are not filtered out here because
         // maps are identified by their data_subtype_id and still have to be counted.
         $sql = "SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, dohe.vetted_id, do.data_subtype_id
                 FROM data_objects_taxon_concepts dotc
                 JOIN data_objects do ON dotc.data_object_id = do.id
                 LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id
                 LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id
                 JOIN data_objects_hierarchy_entries dohe on do.id = dohe.data_object_id
                 WHERE do.published=1 AND dohe.visibility_id={$visible_id} AND dohe.vetted_id != {$untrusted_id} {$taxon_filter}
                 UNION
                 SELECT dotc.taxon_concept_id tc_id, do.data_type_id, doii.info_item_id, dor.ref_id, do.description, udo.vetted_id, do.data_subtype_id
                 FROM data_objects_taxon_concepts dotc
                 JOIN data_objects do ON dotc.data_object_id = do.id
                 LEFT JOIN data_objects_info_items doii ON do.id = doii.data_object_id
                 LEFT JOIN data_objects_refs dor ON do.id = dor.data_object_id
                 JOIN users_data_objects udo on do.id = udo.data_object_id
                 WHERE do.published=1 AND udo.visibility_id={$visible_id} {$taxon_filter}";

         $outfile = $this->mysqli_slave->select_into_outfile($sql);
         $FILE = fopen($outfile, "r");
         if (!$FILE) {
             print "!! ERROR: Could not read {$outfile}";
             debug("!! ERROR: Could not read {$outfile}");
             return;
         }

         $num_rows = 0;
         while (($line = fgets($FILE)) !== false) {
             $num_rows++;
             // Pad to the seven selected columns so a short line cannot cause undefined offsets.
             $fields = array_pad(explode("\t", trim($line)), 7, '');
             $tc_id = trim($fields[0]);
             $data_type_id = trim($fields[1]);
             $info_item_id = trim($fields[2]);
             $ref_id = trim($fields[3]);
             $description = trim($fields[4]);
             $vetted_id = trim($fields[5]);
             $data_subtype_id = trim($fields[6]);
             // Data types without a label (e.g. map) yield null here.
             $label = @$data_type_label[$data_type_id];

             // Key suffix for the vetted status of this row; null when it matches none of them.
             if ($vetted_id == $trusted_id) {
                 $vetted_key = 't';
             } elseif ($vetted_id == $untrusted_id) {
                 $vetted_key = 'ut';
             } elseif ($vetted_id == $unreviewed_id) {
                 $vetted_key = 'ur';
             } else {
                 $vetted_key = null;
             }

             // The @ operator lets the counters spring into existence on first use; counters
             // that are never touched stay unset and are written out as empty fields below.
             if ($data_subtype_id != $map_id) {
                 $words_count = str_word_count(strip_tags($description), 0);
                 @$concept_data_object_counts[$tc_id][$label]['total']++;
                 @($concept_data_object_counts[$tc_id][$label]['total_w'] += $words_count);
                 if ($vetted_key !== null) {
                     @$concept_data_object_counts[$tc_id][$label][$vetted_key]++;
                     @($concept_data_object_counts[$tc_id][$label][$vetted_key . '_w'] += $words_count);
                 }
                 // Used as sets: only the keys matter.
                 $concept_info_items[$tc_id][$info_item_id] = '';
                 $concept_references[$tc_id][$ref_id] = '';
             } else {
                 @$concept_data_object_maps[$tc_id][$label]['total']++;
                 if ($vetted_key !== null) {
                     @$concept_data_object_maps[$tc_id][$label][$vetted_key]++;
                 }
             }
         }
         fclose($FILE);
         unlink($outfile);
         print "\n num_rows: {$num_rows}";

         // The test query does not depend on $i; running it again for every batch would
         // add the same rows to the tallies once per batch.
         if ($test_mode) {
             break;
         }
     }

     self::save_to_json_file($concept_info_items, "concept_info_items");
     unset($concept_info_items);
     self::save_to_json_file($concept_references, "concept_references");
     unset($concept_references);
     // save map data to be accessed later
     self::save_to_json_file($concept_data_object_maps, "map_counts");
     unset($concept_data_object_maps);

     // Flatten each concept's tallies into one tab-prefixed string, in fixed column order.
     $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
     $stat_keys = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');
     foreach ($concept_data_object_counts as $taxon_concept_id => $taxon_object_counts) {
         $new_value = "";
         foreach ($data_type_order_in_file as $data_type) {
             foreach ($stat_keys as $stat_key) {
                 $new_value .= "\t" . @$taxon_object_counts[$data_type][$stat_key];
             }
         }
         $concept_data_object_counts[$taxon_concept_id] = $new_value;
     }
     print "\n get_data_objects_count():" . (time_elapsed() - $time_start) / 60 . " minutes";
     self::save_totals_to_cumulative_txt($concept_data_object_counts, "tpm_data_objects");
     unset($concept_data_object_counts);
 }
 /**
  * Tallies, per taxon concept, the published and visible non-image data objects
  * (and the word counts of their descriptions) by data type and vetted status.
  *
  * Taxon concepts are processed in id ranges of $batch_size between
  * $this->min_taxon_concept_id and $this->max_taxon_concept_id; each batch is
  * passed to save_category_stats() as a map of taxon concept id => tab-separated
  * stats string. When $this->test_taxon_concept_ids is set, only those ids are
  * queried and a single pass is made.
  *
  * Each data object is counted once per taxon concept, even though the LEFT
  * JOINs on info items and references can return several rows for it.
  *
  * @param int $batch_size Number of taxon concept ids covered by each query.
  * @return void
  */
 function get_data_objects_count($batch_size = 100000)
 {
     $image_id = DataType::image()->id;
     // data type id => label used as the key in the tally array
     $data_type_label = array();
     $data_type_label[DataType::text()->id] = 'text';
     $data_type_label[DataType::video()->id] = 'video';
     $data_type_label[DataType::sound()->id] = 'sound';
     $data_type_label[DataType::flash()->id] = 'flash';
     $data_type_label[DataType::youtube()->id] = 'youtube';
     $data_type_label[DataType::iucn()->id] = 'iucn';
     $data_type_order_in_file = array("text", "video", "sound", "flash", "youtube", "iucn");
     $stat_keys = array('total', 't', 'ut', 'ur', 'total_w', 't_w', 'ut_w', 'ur_w');
     $trusted_id = Vetted::trusted()->id;
     $untrusted_id = Vetted::untrusted()->id;
     $unreviewed_id = Vetted::unknown()->id;
     $visible_id = Visibility::visible()->id;
     $raw_stats = array();
     // Collected for the (currently disabled) JSON export at the end of this method.
     $concept_info_items = array();
     $concept_references = array();
     for ($i = $this->min_taxon_concept_id; $i <= $this->max_taxon_concept_id; $i += $batch_size) {
         $this->print_status($i, $batch_size);
         $sql = "SELECT  do.guid,
                         dotc.taxon_concept_id,
                         do.data_type_id,
                         doii.info_item_id,
                         dor.ref_id,
                         REPLACE(REPLACE(do.description, '\\\\n', ' '), '\\\\r', ' '),
                         dohe.vetted_id,
                         do.id
             FROM data_objects_taxon_concepts dotc
             STRAIGHT_JOIN data_objects do ON (dotc.data_object_id = do.id)
             JOIN data_objects_hierarchy_entries dohe ON (do.id=dohe.data_object_id)
             LEFT JOIN data_objects_info_items doii ON (do.id = doii.data_object_id)
             LEFT JOIN data_objects_refs dor ON (do.id = dor.data_object_id)
             WHERE do.published = 1 AND dohe.visibility_id = {$visible_id} AND do.data_type_id != {$image_id}";
         if ($this->test_taxon_concept_ids) {
             $sql .= " AND dotc.taxon_concept_id IN (" . $this->test_taxon_concept_ids . ")";
         } else {
             // BETWEEN is inclusive on both ends, so the upper bound must stop one short of
             // the next batch's first id; otherwise the boundary concept is processed twice.
             $sql .= " AND dotc.taxon_concept_id BETWEEN {$i} AND " . ($i + $batch_size - 1);
         }
         $counted_data_objects = array();
         foreach ($this->mysqli_slave->iterate_file($sql) as $row_number => $row) {
             // Column 0 is do.guid, which is not needed here; the remaining columns follow
             // the SELECT list above. Assumes $row is indexed in SELECT order.
             $taxon_concept_id = trim($row[1]);
             $data_type_id = trim($row[2]);
             $info_item_id = trim($row[3]);
             $ref_id = trim($row[4]);
             $description = trim($row[5]);
             $vetted_id = trim($row[6]);
             $data_object_id = trim($row[7]);

             // Record these for every row: the extra rows of an already counted object are
             // exactly the ones carrying its additional info items and references.
             $concept_info_items[$taxon_concept_id][$info_item_id] = '';
             $concept_references[$taxon_concept_id][$ref_id] = '';

             // Count each data object only once per taxon concept.
             if (isset($counted_data_objects[$taxon_concept_id][$data_object_id])) {
                 continue;
             }
             $counted_data_objects[$taxon_concept_id][$data_object_id] = 1;

             $label = @$data_type_label[$data_type_id];
             $words_count = str_word_count(strip_tags($description), 0);
             // The @ operator lets the counters spring into existence on first use; counters
             // that are never touched stay unset and are written out as empty fields below.
             @$raw_stats[$taxon_concept_id][$label]['total']++;
             @($raw_stats[$taxon_concept_id][$label]['total_w'] += $words_count);
             if ($vetted_id == $trusted_id) {
                 @$raw_stats[$taxon_concept_id][$label]['t']++;
                 @($raw_stats[$taxon_concept_id][$label]['t_w'] += $words_count);
             } elseif ($vetted_id == $untrusted_id) {
                 @$raw_stats[$taxon_concept_id][$label]['ut']++;
                 @($raw_stats[$taxon_concept_id][$label]['ut_w'] += $words_count);
             } elseif ($vetted_id == $unreviewed_id) {
                 @$raw_stats[$taxon_concept_id][$label]['ur']++;
                 @($raw_stats[$taxon_concept_id][$label]['ur_w'] += $words_count);
             }
         }
         foreach ($raw_stats as $taxon_concept_id => $stats) {
             # the stats need to go into the file in a certain order to be imported into the MySQL table
             $fields = array();
             foreach ($data_type_order_in_file as $data_type) {
                 foreach ($stat_keys as $stat_key) {
                     // Append to the list for every data type; assigning a fresh value per
                     // data type would keep only the last one.
                     $fields[] = isset($stats[$data_type][$stat_key]) ? $stats[$data_type][$stat_key] : '';
                 }
             }
             $raw_stats[$taxon_concept_id] = implode("\t", $fields);
         }
         $this->save_category_stats($raw_stats, "get_data_objects_count");
         $raw_stats = array();
         // The test query does not depend on $i, so one pass is enough.
         if ($this->test_taxon_concept_ids) {
             break;
         }
     }
     // $this->save_to_json_file($concept_info_items, "concept_info_items");
     // unset($concept_info_items);
     //
     // $this->save_to_json_file($concept_references, "concept_references");
     // unset($concept_references);
 }
Example #3
0
 /**
  * Gathers the site-wide statistics by calling the individual counting methods
  * of this class, inserts them as a single row into the eol_statistics table
  * (one column per key of the collected array) and prints the collected values.
  *
  * Elapsed time is printed after each group of statistics.
  *
  * @return void
  */
 public function save_eol_stats()
 {
     $stats = array();
     // Overall Statistics
     $time_start = time_elapsed();
     // Number of members
     $stats['members_count'] = $this->members_count();
     // Number of communities
     $stats['communities_count'] = $this->communities_count();
     // Number of collections
     $stats['collections_count'] = $this->collections_count();
     // Total number of pages
     $stats['pages_count'] = $this->pages_count();
     // as currently reported on home page; assume this means pages with at least one data object
     $stats['pages_with_content'] = $this->pages_with_content();
     $stats['pages_with_text'] = $this->pages_with_text();
     $stats['pages_with_image'] = $this->pages_with_image();
     $stats['pages_with_map'] = $this->pages_with_map();
     $stats['pages_with_video'] = $this->pages_with_video();
     $stats['pages_with_sound'] = $this->pages_with_sound();
     // Derived from the counts already collected above rather than querying again.
     $stats['pages_without_text'] = $stats['pages_count'] - $stats['pages_with_text'];
     $stats['pages_without_image'] = $stats['pages_count'] - $stats['pages_with_image'];
     $stats['pages_with_image_no_text'] = $this->pages_with_image_no_text();
     $stats['pages_with_text_no_image'] = $this->pages_with_text_no_image();
     // base pages - pages without any data objects; base pages may have references and BHL/content partner links
     $stats['base_pages'] = $this->pages_without_content_with_other_info();
     print "\n Overall stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Trusted Content Statistics - note change in terminology, phasing out vetted in favor of trusted
     $time_start = time_elapsed();
     // Number of pages with at least one trusted data object
     $stats['pages_with_at_least_a_trusted_object'] = $this->pages_with_at_least_a_trusted_object();
     $stats['pages_with_at_least_a_curatorial_action'] = $this->pages_curated();
     print "\n Trusted content stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // BHL Statistics
     $time_start = time_elapsed();
     $stats['pages_with_BHL_links'] = $this->pages_with_BHL_links();
     $stats['pages_with_BHL_links_no_text'] = $this->pages_with_BHL_links_no_text();
     $stats['pages_with_BHL_links_only'] = $this->pages_with_BHL_links_only();
     print "\n BHL stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Content Partner Statistics
     $time_start = time_elapsed();
     // Number of publicly listed partners - as shown on home page (This includes all published partners
     // and a few partners that have been listed although they are not yet sharing content, e.g., some international partners)
     $stats['content_partners'] = $this->content_partners();
     // Number of partners with published resources
     $stats['content_partners_with_published_resources'] = $this->content_partners_with_published_resources();
     // Number of partners with published trusted resources
     $stats['content_partners_with_published_trusted_resources'] = $this->content_partners_with_published_resources(1);
     // Total number of published resources
     $stats['published_resources'] = $this->published_resources();
     // Number of published trusted resources
     $stats['published_trusted_resources'] = $this->published_resources("1");
     // Number of published unreviewed resources
     $stats['published_unreviewed_resources'] = $this->published_resources("0");
     // Number of resources published for the first time in the last 30 days
     $stats['newly_published_resources_in_the_last_30_days'] = $this->published_resources_in_the_last_n_days(30);
     print "\n Content partner stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Page Richness Statistics
     $time_start = time_elapsed();
     // % of all pages (total number of taxon concepts) that are rich - with a score of 40 or more
     $stats['rich_pages'] = $this->rich_pages();
     $hotlist_taxon_concept_ids = self::get_collections_taxon_concept_ids(array(self::HOTLIST_COLLECTION_ID));
     $stats['hotlist_pages'] = count($hotlist_taxon_concept_ids);
     // % pages on the hotlist that are rich - the hotlist taxa come from the HOTLIST_COLLECTION_ID collection
     $stats['rich_hotlist_pages'] = $this->get_rich_pages($hotlist_taxon_concept_ids);
     $redhotlist_taxon_concept_ids = self::get_collections_taxon_concept_ids(array(self::REDHOTLIST_PENDING_COLLECTION_ID, self::REDHOTLIST_COLLECTION_ID));
     $stats['redhotlist_pages'] = count($redhotlist_taxon_concept_ids);
     // % pages on the redhotlist that are rich - the redhotlist is the combined list of taxa of these two collections
     $stats['rich_redhotlist_pages'] = $this->get_rich_pages($redhotlist_taxon_concept_ids);
     // % of all pages that are not rich but have at least some content (score 10-39)
     $stats['pages_with_score_10_to_39'] = $this->not_so_rich_pages();
     // % of all pages that are base-like pages (score <10)
     $stats['pages_with_score_less_than_10'] = $this->not_rich_pages();
     print "\n Page richness stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Curatorial Stats
     $time_start = time_elapsed();
     // Return values are not used here.
     // NOTE(review): presumably these calls prepare state read by the curation counts below -- confirm.
     $this->data_object_curation_activity_ids();
     $this->name_curation_activity_ids();
     $this->taxa_curation_activity_ids();
     $this->curation_activity_ids();
     // Number of registered assistant curators
     $stats['curators_assistant'] = $this->curators($this->assistant_curator_id);
     // Number of registered full curators
     $stats['curators_full'] = $this->curators($this->full_curator_id);
     // Number of registered master curators
     $stats['curators_master'] = $this->curators($this->master_curator_id);
     // Number of registered curators
     $stats['curators'] = $stats['curators_assistant'] + $stats['curators_full'] + $stats['curators_master'];
     $stats['active_curators'] = count($this->curators_active());
     // number of pages curated by active curators
     // NOTE(review): this reads the curators_active property, not the curators_active() method called
     // above; presumably that call populates the property -- confirm.
     $stats['pages_curated_by_active_curators'] = $this->pages_curated($this->curators_active);
     $stats['objects_curated_in_the_last_30_days'] = $this->objects_curated_in_the_last_n_days(30);
     $stats['curator_actions_in_the_last_30_days'] = $this->curator_actions_in_the_last_n_days(30);
     print "\n Curatorial Stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // LifeDesk stats
     $time_start = time_elapsed();
     $stats['lifedesk_taxa'] = $this->lifedesk_taxa();
     $stats['lifedesk_data_objects'] = $this->lifedesk_data_objects();
     print "\n LifeDesk stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Marine stats
     $time_start = time_elapsed();
     $stats['marine_pages'] = $this->marine_pages();
     $stats['marine_pages_in_col'] = $this->marine_pages_in_col();
     $stats['marine_pages_with_objects'] = $this->marine_pages_with_objects();
     $stats['marine_pages_with_objects_vetted'] = $this->marine_pages_with_objects($this->trusted_id);
     print "\n Marine stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // User-submitted text
     $time_start = time_elapsed();
     // Number of user submitted text (published)
     $stats['udo_published'] = $this->udo_published();
     // Number of text objects added by curators - assistant, full, or master curators
     $stats['udo_published_by_curators'] = $this->udo_published_by_curators();
     // Number of text objects added by non-curators
     $stats['udo_published_by_non_curators'] = $stats['udo_published'] - $stats['udo_published_by_curators'];
     print "\n UDO stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     //Data Object Statistics
     $time_start = time_elapsed();
     $stats['data_objects'] = $this->count_data_objects();
     $stats['data_objects_texts'] = $this->count_data_objects(array(DataType::text()->id));
     $stats['data_objects_images'] = $this->count_data_objects(array(DataType::image()->id));
     $stats['data_objects_videos'] = $this->count_data_objects(array(DataType::video()->id, DataType::flash()->id, DataType::youtube()->id));
     $stats['data_objects_sounds'] = $this->count_data_objects(array(DataType::sound()->id));
     $stats['data_objects_maps'] = $this->count_data_objects(array(DataType::map()->id));
     $stats['data_objects_trusted'] = count($this->count_data_objects_vettedness_list($this->trusted_id));
     $stats['data_objects_unreviewed'] = count($this->count_data_objects_vettedness_list($this->unknown_id));
     $stats['data_objects_untrusted'] = count($this->count_data_objects_vettedness_list($this->untrusted_id));
     $stats['data_objects_trusted_or_unreviewed_but_hidden'] = count($this->data_objects_trusted_or_unreviewed_but_hidden_list());
     print "\n Data object stats: " . (time_elapsed() - $time_start) / 60 . " minutes";
     // Structured data totals (not timed)
     $stats['total_triples'] = $this->total_triples();
     $stats['total_occurrences'] = $this->total_occurrences();
     $stats['total_measurements'] = $this->total_measurements();
     $stats['total_associations'] = $this->total_associations();
     $stats['total_measurement_types'] = $this->total_measurement_types();
     $stats['total_association_types'] = $this->total_association_types();
     $stats['total_taxa_with_data'] = $this->total_taxa_with_data();
     $stats['total_user_added_data'] = $this->total_user_added_data();
     $stats['created_at'] = date('Y-m-d H:i:s');
     // The array keys double as the column names of eol_statistics. implode() takes the
     // separator first; the reversed (array, separator) order is rejected by PHP 8.
     $columns = implode(",", array_keys($stats));
     $values = implode("','", $stats);
     $this->mysqli->insert("INSERT INTO eol_statistics (" . $columns . ") VALUES ('" . $values . "')");
     print_r($stats);
 }