/**
 * Collect the good-word suggestions made for a project and pair them with
 * how often each suggested word occurs in the project text.
 *
 * Words already on the project's good or bad word lists are dropped from
 * the results.
 *
 * @param string $projectid  ID of the project whose suggestions are loaded.
 * @param mixed  $timeCutoff Passed through unchanged to
 *                           load_project_good_word_suggestions().
 *
 * @return array A 7-element list, in this order:
 *   0. all suggestions  => frequency in the project text (sorted by
 *      frequency descending, then case-insensitively by word)
 *   1. all suggestions  => result of generate_frequencies()
 *   2. per round: suggestions => frequency in the project text (same sort)
 *   3. per round: result of generate_frequencies()
 *   4. list of the rounds that had suggestions
 *   5. per round: number of pages that carried suggestions
 *   6. list of messages (errors encountered while loading)
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();

    // load the suggestions
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        // anything that is not an array is treated as an error description
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    // load project good words; the loader hands back a string (an error
    // message) instead of an array when it fails, so report it and carry
    // on with an empty list rather than feeding a string to array_merge()
    $project_good_words = load_project_good_words($projectid);
    if (is_string($project_good_words)) {
        $messages[] = $project_good_words;
        $project_good_words = array();
    }

    // load project bad words (same error convention as above)
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }

    // words already on either list are never reported as suggestions;
    // build the exclusion list once instead of once per round
    $excluded_words = array_merge($project_good_words, $project_bad_words);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));

    // array to hold all words
    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_freq = array();
    $round_suggestions_w_occurrences = array();

    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        foreach ($pageArray as $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);

            // add the words to the combined array too
            $all_suggestions = array_merge($all_suggestions, $words);

            // count the page; a round only gets an entry once it has
            // at least one page
            if (!isset($round_page_count[$round])) {
                $round_page_count[$round] = 0;
            }
            $round_page_count[$round]++;
        }

        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $excluded_words);

        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);

        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key(
            $all_words_w_freq,
            array_flip($round_suggestions)
        );

        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);

        // sort the list by frequency, then by word
        array_multisort(
            array_values($round_suggestions_w_freq[$round]), SORT_DESC,
            array_map('strtolower', array_keys($round_suggestions_w_freq[$round])), SORT_ASC,
            $round_suggestions_w_freq[$round]
        );
    }

    // now, remove any words that are already on the project's
    // good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $excluded_words);

    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);

    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a value in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key(
        $all_words_w_freq,
        array_flip($all_suggestions)
    );

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($all_suggestions_w_freq), SORT_DESC,
        array_map('strtolower', array_keys($all_suggestions_w_freq)), SORT_ASC,
        $all_suggestions_w_freq
    );

    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);

    return array(
        $all_suggestions_w_freq,
        $all_suggestions_w_occurrences,
        $round_suggestions_w_freq,
        $round_suggestions_w_occurrences,
        $rounds,
        $round_page_count,
        $messages,
    );
}
// Only one cutoff-controlled table is shown on this page.
$instances = 1;

// Cutoff percentages offered to the user: 0, 10, 20, ... 90.
$cutoffOptions = range(0, 90, 10);

// Cutoff percentage the page starts out with.
$initialFreq = getInitialCutoff(50, $cutoffOptions, $percent_changed);

// Supporting markup first (script and stylesheet), then the page header:
// intro text, instructions, any warnings/errors and the download text.
echo_cutoff_script($cutoffOptions, $instances);
echo_word_freq_style();

echo "<p>{$page_text}</p>";

echo_page_instruction_text("bad", $format);

echo_any_warnings_errors($messages);

echo_download_text($projectid, $format);

// Explain the current cutoff and list the alternatives.
$cutoff_text = sprintf(
    _("Words with fewer than <b><span id='current_cutoff'>%d</span>%%</b> of the instances changed are not shown. Other cutoff options are available: %s"),
    $initialFreq,
    get_cutoff_string($cutoffOptions, "%")
);
echo "<p>{$cutoff_text}</p>\n";

$project_good_words = load_project_good_words($projectid);

$word_checkbox = build_checkbox_array($percent_changed);

$context_array = build_context_array_links($instances_left, $projectid);

// Per-word data: the total number of instances (changed + remaining)
// and a note for words that are on the project's good word list.
$word_notes = array();
$instances_total = array();
foreach ($instances_left as $word => $freq) {
    $instances_total[$word] = $instances_changed[$word] + $freq;

    // The comparison is deliberately loose: PHP turns all-numeric array
    // keys into integers, so $word may be an int here.
    if (in_array($word, $project_good_words)) {
        $word_notes[$word] = _("On project GWL");
    }
}

// "[[TITLE]]" / "[[STYLE]]" entries are stored alongside the per-word data.
$word_notes["[[TITLE]]"] = _("Notes");
$percent_changed["[[TITLE]]"] = _("% Changed");
$percent_changed["[[STYLE]]"] = "text-align: right;";
$instances_changed_to["[[TITLE]]"] = _("Last changed to");
/**
 * Write this holder's project settings to the `projects` table and bring
 * the project's side files in line with them.
 *
 * When $this->projectid is set, the existing row is updated, the edit is
 * logged with the list of changed fields, and the stored MARC record is
 * updated. Otherwise a new project is created: a fresh project ID is
 * generated, a row is inserted in state PROJ_NEW, the creation is logged,
 * the project directory is made, the MARC record is initialised (when an
 * original one was supplied) and the good/bad word lists are created
 * (copied from the cloned project, if any).
 *
 * In both cases the Dublin Core file is (re)generated afterwards and, if
 * $this->posted is set, the project is transitioned to
 * PROJ_SUBMIT_PG_POSTED.
 *
 * Returns nothing. Every failure path ends the script via die()/exit
 * after printing a message.
 *
 * NOTE(review): apart from the members wrapped in addslashes() below, the
 * members of $this are interpolated into the SQL as-is; confirm they are
 * validated before this method is called.
 */
function save_to_db()
{
    global $projects_dir, $pguser;

    // An empty posted number is written as SQL NULL, anything else as a
    // quoted value.
    $postednum_str = $this->postednum == "" ? "NULL" : "'{$this->postednum}'";

    // Call addslashes() on any members of $this that might contain
    // single-quotes/apostrophes (because they are unescaped, and
    // would otherwise break the query).
    // This fragment is shared by the UPDATE and the INSERT below.
    $common_project_settings = "\n t_last_edit = UNIX_TIMESTAMP(),\n up_projectid = '{$this->up_projectid}',\n nameofwork = '"
        . addslashes($this->nameofwork)
        . "',\n authorsname = '"
        . addslashes($this->authorsname)
        . "',\n language = '{$this->language}',\n genre = '{$this->genre}',\n difficulty = '{$this->difficulty_level}',\n special_code = '{$this->special_code}',\n clearance = '"
        . addslashes($this->clearance)
        . "',\n comments = '"
        . addslashes($this->comments)
        . "',\n image_source = '{$this->image_source}',\n scannercredit = '"
        . addslashes($this->scannercredit)
        . "',\n checkedoutby = '{$this->checkedoutby}',\n postednum = {$postednum_str},\n image_preparer = '{$this->image_preparer}',\n text_preparer = '{$this->text_preparer}',\n extra_credits = '"
        . addslashes($this->extra_credits)
        . "',\n deletion_reason= '"
        . addslashes($this->deletion_reason)
        . "'\n ";

    // Optional "username = ...," fragment; stays empty when the project
    // manager is not to be touched.
    // NOTE(review): all three assignments below write the literal
    // '******' as the username, which looks like a masked value rather
    // than the intended one - confirm what belongs here.
    $pm_setter = '';
    if (user_is_a_sitemanager()) {
        // can change PM
        $pm_setter = " username = '******',";
    } else {
        if (isset($this->clone_projectid)) {
            // cloning a project. The PM should be the same as
            // that of the project being cloned, if the user
            // isn't an SA
            $res = mysql_query("\n SELECT username\n FROM projects\n WHERE projectid='{$this->clone_projectid}'\n ") or die(mysql_error());
            // NOTE(review): $projectmanager is fetched here but is not
            // used by the assignment that follows.
            list($projectmanager) = mysql_fetch_row($res);
            $pm_setter = " username = '******',";
        }
    }

    if (isset($this->projectid)) {
        // We are updating an already-existing project.

        // needn't change $pm_setter, as there is no change if the user
        // isn't an SA

        // find out what we are changing from
        $old_pih = new ProjectInfoHolder();
        $fatal_error = $old_pih->set_from_db(TRUE, $this->projectid);
        if ($fatal_error != '') {
            $fatal_error = _('site error') . ': ' . $fatal_error;
            echo "<br><center><font size='+1' color='#ff0000'><b>{$fatal_error}</b></font></center>";
            exit;
        }
        $changed_fields = get_changed_fields($this, $old_pih);

        // We're particularly interested in knowing
        // when the project comments change.
        if (!in_array('comments', $changed_fields)) {
            // no change
            $tlcc_setter = '';
        } else {
            // changed!
            $tlcc_setter = 't_last_change_comments = UNIX_TIMESTAMP(),';
        }

        // We also want to know if the edit is resulting in the project
        // effectively being checked out to a new PPer
        if ($old_pih->state == PROJ_POST_FIRST_CHECKED_OUT && in_array('checkedoutby', $changed_fields)) {
            $md_setter = 'modifieddate = UNIX_TIMESTAMP(),';
            $PPer_checkout = TRUE;
        } else {
            $md_setter = '';
            $PPer_checkout = FALSE;
        }

        // Update the projects database with the updated info
        mysql_query("\n UPDATE projects SET\n {$pm_setter}\n {$tlcc_setter}\n {$md_setter}\n {$common_project_settings}\n WHERE projectid='{$this->projectid}'\n ") or die(mysql_error());

        // The names of the changed fields, space-separated, are recorded
        // with the 'edit' event.
        $details1 = implode(' ', $changed_fields);
        if ($details1 == '') {
            // There are no changed fields.
            // Don't just save '' for the details1 column,
            // because then do_history() won't be able to distinguish
            // this case (no changed fields) from old cases
            // (edit occurred before we started recording changed fields).
            // Instead, use a special value.
            $details1 = 'NONE';
        }
        $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'edit', $details1);
        if (!empty($e)) {
            die($e);
        }

        if ($PPer_checkout) {
            // we fake the project transition...
            // (logged as a transition from PROJ_POST_FIRST_CHECKED_OUT to
            // that same state, with the new 'checkedoutby' value attached)
            $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'transition', PROJ_POST_FIRST_CHECKED_OUT, PROJ_POST_FIRST_CHECKED_OUT, $this->checkedoutby);
            if (!empty($e)) {
                die($e);
            }
        }

        // Update the MARC record with any info we've received.
        $project = new Project($this->projectid);
        $marc_record = $project->load_marc_record();
        $this->update_marc_record_from_post($marc_record);
        $project->save_marc_record($marc_record);
    } else {
        // We are creating a new project
        $this->projectid = uniqid("projectID"); // The project ID

        // Neither branch above chose a PM setter, so supply one here.
        if ('' == $pm_setter) {
            $pm_setter = "username = '******',";
        }

        // Insert a new row into the projects table
        mysql_query("\n INSERT INTO projects\n SET\n projectid = '{$this->projectid}',\n {$pm_setter}\n state = '" . PROJ_NEW . "',\n modifieddate = UNIX_TIMESTAMP(),\n t_last_change_comments = UNIX_TIMESTAMP(),\n {$common_project_settings}\n ") or die(mysql_error());

        $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'creation');
        if (!empty($e)) {
            die($e);
        }

        $e = project_allow_pages($this->projectid);
        if (!empty($e)) {
            die($e);
        }

        // Make a directory in the projects_dir for this project
        mkdir("{$projects_dir}/{$this->projectid}", 0777) or die("System error: unable to mkdir '{$projects_dir}/{$this->projectid}'");
        // The explicit chmod repeats the 0777 mode on the new directory.
        // NOTE(review): presumably because mkdir()'s mode is subject to
        // the process umask - confirm.
        chmod("{$projects_dir}/{$this->projectid}", 0777);

        // Do MARC record manipulations
        $project = new Project($this->projectid);
        $marc_record = new MARCRecord();

        // Save original MARC record, if provided
        // NOTE(review): unserialize() is unsafe on untrusted data; verify
        // where original_marc_array_encd comes from.
        $yaz_array = unserialize(base64_decode($this->original_marc_array_encd));
        if ($yaz_array !== FALSE) {
            $marc_record->load_yaz_array($yaz_array);
            $project->init_marc_record($marc_record);

            // Update the MARC record with data from POST
            $this->update_marc_record_from_post($marc_record);
            $project->save_marc_record($marc_record);
        }
        // When nothing could be unserialized, $marc_record is left as the
        // freshly constructed MARCRecord and nothing is saved for it here.

        // Create the project's 'good word list' and 'bad word list'.
        if (isset($this->clone_projectid)) {
            // We're creating a project via cloning.
            // Copy the original project's word-lists.
            $good_words = load_project_good_words($this->clone_projectid);
            if (is_string($good_words)) {
                // It's an error message.
                // Show it and continue with an empty list.
                echo "{$good_words}<br>\n";
                $good_words = array();
            }

            $bad_words = load_project_bad_words($this->clone_projectid);
            if (is_string($bad_words)) {
                // It's an error message.
                // Show it and continue with an empty list.
                echo "{$bad_words}<br>\n";
                $bad_words = array();
            }
        } else {
            // We're creating a project by means other than cloning
            // (from_nothing, from_marc_record, from_uberproject).
            // Initialize its GWL and BWL to empty.
            $good_words = array();
            $bad_words = array();
        }

        save_project_good_words($this->projectid, $good_words);
        save_project_bad_words($this->projectid, $bad_words);
    }

    // Create/update the Dublin Core file for the project.
    // When we get here, the project's database entry has been fully
    // updated, so we can create a Project object and allow it
    // to pull the relevant fields from the database.
    $project = new Project($this->projectid);
    $project->create_dc_xml_oai($marc_record);

    // If the project has been posted to PG, make the appropriate transition.
    if ($this->posted) {
        $err = project_transition($this->projectid, PROJ_SUBMIT_PG_POSTED, $pguser);
        if ($err != '') {
            echo "{$err}<br>\n";
            exit;
        }
    }
}
/**
 * Carry out the requested page action.
 *
 * "show" prints, inside a <pre> block, how many of the projects matching
 * $language have each word on their good or bad word list, most frequent
 * first, stopping at the first word whose share drops below $cutoff.
 * "list" prints nothing and just asks the caller to display the list.
 *
 * The script is terminated with die() on an unknown action, language
 * match type or list type, and when the project query fails.
 *
 * @param string $action     "show" or "list".
 * @param string $list_type  "good" or "bad" (only used by "show").
 * @param string $language   Language to match projects against.
 * @param mixed  $cutoff     Minimum percentage of projects (with word
 *                           lists) a word must appear in to be printed.
 * @param string $lang_match "exact", "primary" (prefix match) or "any"
 *                           (substring match).
 *
 * @return bool TRUE when the caller should display the list ("list"
 *              action), FALSE otherwise.
 */
function _handle_action($action, $list_type, $language, $cutoff, $lang_match)
{
    $display_list = FALSE;

    switch ($action) {
        case "show":
            $word_freq = array();
            $total_projects = 0;
            $total_projects_with_words = 0;
            $unreadable_lists = 0;

            // $language ends up inside a quoted SQL string, so escape it.
            // (LIKE wildcards in it are left alone, as before.)
            $sql_language = mysql_real_escape_string($language);

            // figure out what kind of language matching we're going to use
            $where_clause = "";
            switch ($lang_match) {
                case "exact":
                    $where_clause = "language = '{$sql_language}'";
                    break;
                case "primary":
                    $where_clause = "language like '{$sql_language}%'";
                    break;
                case "any":
                    $where_clause = "language like '%{$sql_language}%'";
                    break;
                default:
                    die("Unknown language match used: {$lang_match}");
            }

            // Validate the list type once, before querying, instead of
            // once per project: a bad value is now rejected even when
            // no project matches the language.
            if ($list_type != "good" && $list_type != "bad") {
                die("Unknown list type: {$list_type}");
            }

            // loop through all projects that use $language
            $res = mysql_query("\n SELECT projectid\n FROM projects\n WHERE {$where_clause}\n ") or die(mysql_error());

            while (list($projectid) = mysql_fetch_row($res)) {
                if ($list_type == "good") {
                    $words = load_project_good_words($projectid);
                } else {
                    $words = load_project_bad_words($projectid);
                }

                $total_projects++;

                // The loaders return an error message (a string) instead
                // of an array when they fail. Such a project has no usable
                // word list: don't iterate the string and don't count it
                // as a project with words.
                if (is_string($words)) {
                    $unreadable_lists++;
                    continue;
                }

                foreach ($words as $word) {
                    if (isset($word_freq[$word])) {
                        $word_freq[$word]++;
                    } else {
                        $word_freq[$word] = 1;
                    }
                }

                if (count($words)) {
                    $total_projects_with_words++;
                }
            }
            mysql_free_result($res);

            // sort the results, most frequent word first
            arsort($word_freq);

            // show the results
            // NOTE(review): $language and the words are printed without
            // HTML escaping; confirm they are safe to echo.
            echo "<pre>";
            echo _("Language") . ": {$language}<br>";
            echo sprintf(_("Language match type: %s"), $lang_match) . "<br>";
            echo sprintf(_("Word list type: %s"), $list_type) . "<br>";
            echo sprintf(_("Cutoff percentage: %d%%"), $cutoff) . "<br>";
            echo sprintf(_("Total projects matching language: %d"), $total_projects) . "<br>";
            echo sprintf(_("Total projects with word lists: %d"), $total_projects_with_words) . "<br>";
            if ($unreadable_lists > 0) {
                echo sprintf(_("Projects whose word list could not be loaded: %d"), $unreadable_lists) . "<br>";
            }
            echo "<br>";
            echo _("Note: Percentages are calculated as frequency over the total number of projects with word lists.") . "<br>";
            echo "<br>";

            echo sprintf("%20s %5s %s<br>", _("Word"), _("Count"), _("Frequency"));

            // $word_freq is only non-empty when at least one project had
            // words, so the division below cannot be by zero.
            foreach ($word_freq as $word => $freq) {
                $percentage = $freq / $total_projects_with_words * 100;

                // the list is sorted by descending count, so every
                // remaining word is below the cutoff as well
                if ($percentage < $cutoff) {
                    break;
                }

                echo sprintf("%20s %5d (%-3.2f%%)<br>", $word, $freq, $percentage);
            }
            echo "</pre>";
            break;

        case "list":
            $display_list = TRUE;
            break;

        default:
            die("Invalid action encountered.");
    }

    return $display_list;
}
/**
 * Merge the WordCheck data of one project into another: the good word
 * list, the bad word list and the good-word suggestions file.
 *
 * The source project's data is left untouched. We're assuming the
 * projects are in unavailable or waiting, so no file locking is done.
 *
 * A word list is only saved when both the source and the destination list
 * could be loaded, so a failed load never overwrites the destination.
 *
 * @param string $from_id ID of the project whose data is merged in.
 * @param string $to_id   ID of the project that receives the data.
 *
 * @return array List of error messages; empty when everything succeeded.
 */
function merge_wordcheck_files($from_id, $to_id)
{
    global $projects_dir;

    $errors = array();

    // good words
    // (the loaders return an error message string instead of an array
    // when they fail)
    $from_words = load_project_good_words($from_id);
    $to_words = load_project_good_words($to_id);
    if (is_string($from_words)) {
        $errors[] = $from_words;
    } elseif (is_string($to_words)) {
        $errors[] = $to_words;
    } else {
        save_project_good_words($to_id, array_merge($to_words, $from_words));
    }

    // bad words
    $from_words = load_project_bad_words($from_id);
    $to_words = load_project_bad_words($to_id);
    if (is_string($from_words)) {
        $errors[] = $from_words;
    } elseif (is_string($to_words)) {
        $errors[] = $to_words;
    } else {
        save_project_bad_words($to_id, array_merge($to_words, $from_words));
    }

    // suggestions
    // the file format is complicated and may change
    // so we take the sledgehammer approach, as suggested by cpeel...
    $from_path = "{$projects_dir}/{$from_id}/good_word_suggestions.txt";
    $to_path = "{$projects_dir}/{$to_id}/good_word_suggestions.txt";

    if (!is_file($from_path)) {
        // The file does not exist.
        // Treat that the same as if it existed and was empty.
        $from_suggs = "";
    } else {
        $from_suggs = file_get_contents($from_path);
        if ($from_suggs === FALSE) {
            $errors[] = "Unable to read '{$from_path}'";
            $from_suggs = "";
        }
    }

    // Appending gives the same result as reading the destination file and
    // rewriting it with the source content tacked on (the file is created
    // if it does not exist yet), but it cannot lose the existing content
    // when the destination cannot be read.
    if (file_put_contents($to_path, $from_suggs, FILE_APPEND) === FALSE) {
        $errors[] = "Unable to write '{$to_path}'";
    }

    return $errors;
}
/**
 * Fill this object's word-list members from the project's word files.
 *
 * For each requested list, the modification time of the word file is
 * stored in gwl_timestamp / bwl_timestamp and the words, joined with
 * newlines, in good_words / bad_words. If a list cannot be loaded, the
 * corresponding member is set to '' and the error message is collected.
 *
 * @param bool $load_good_words Whether to load the good word list.
 * @param bool $load_bad_words  Whether to load the bad word list.
 *
 * @return array Error messages from the loaders; empty on success.
 */
function set_from_files($load_good_words = true, $load_bad_words = true)
{
    $errors = array();

    if ($load_good_words) {
        $good_file = get_project_word_file($this->projectid, "good");
        $this->gwl_timestamp = $good_file->mod_time;

        // a string result is an error message, an array is the word list
        $loaded_good = load_project_good_words($this->projectid);
        if (!is_string($loaded_good)) {
            $this->good_words = implode("\n", $loaded_good);
        } else {
            $errors[] = $loaded_good;
            $this->good_words = '';
        }
    }

    if ($load_bad_words) {
        $bad_file = get_project_word_file($this->projectid, "bad");
        $this->bwl_timestamp = $bad_file->mod_time;

        // same convention as for the good words
        $loaded_bad = load_project_bad_words($this->projectid);
        if (!is_string($loaded_bad)) {
            $this->bad_words = implode("\n", $loaded_bad);
        } else {
            $errors[] = $loaded_bad;
            $this->bad_words = '';
        }
    }

    return $errors;
}