/**
 * Collect the good-word suggestions for a project, overall and per round.
 *
 * Suggestions already on the project's good or bad word list are dropped.
 * The remaining words are paired with their frequency in the latest project
 * text and sorted by frequency (descending), then by lowercased word.
 *
 * @param mixed $projectid  project whose suggestions are loaded
 * @param mixed $timeCutoff passed through to load_project_good_word_suggestions()
 *
 * @return array a 7-element array:
 *   0 - all suggestions: word => frequency in the project text
 *   1 - all suggestions: result of generate_frequencies() on the suggestions
 *   2 - per round: round => (word => frequency in the project text)
 *   3 - per round: round => result of generate_frequencies()
 *   4 - list of rounds (keys of element 2)
 *   5 - per round: round => number of pages with suggestions
 *   6 - list of messages for the caller to display
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();

    // load the suggestions; anything other than an array is an error value
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    // load project good words
    // (the loader hands back an error string on failure; report it and
    // carry on with an empty list rather than feeding a string to array_merge)
    $project_good_words = load_project_good_words($projectid);
    if (is_string($project_good_words)) {
        $messages[] = $project_good_words;
        $project_good_words = array();
    }

    // load project bad words (same error convention as above)
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }

    // words that must never be offered as suggestions
    $words_already_listed = array_merge($project_good_words, $project_bad_words);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));

    // array to hold all words
    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_freq = array();
    $round_suggestions_w_occurrences = array();

    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        foreach ($pageArray as $page => $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);

            // add the words to the combined array too
            $all_suggestions = array_merge($all_suggestions, $words);

            // count the pages that contributed suggestions in this round
            if (isset($round_page_count[$round])) {
                $round_page_count[$round]++;
            } else {
                $round_page_count[$round] = 1;
            }
        }

        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $words_already_listed);

        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);

        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key(
            $all_words_w_freq,
            array_flip($round_suggestions)
        );

        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);

        // sort the list by frequency, then by word
        array_multisort(
            array_values($round_suggestions_w_freq[$round]), SORT_DESC,
            array_map('strtolower', array_keys($round_suggestions_w_freq[$round])), SORT_ASC,
            $round_suggestions_w_freq[$round]
        );
    }

    // now, remove any words that are already on the project's good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $words_already_listed);

    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);

    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a key in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key($all_words_w_freq, array_flip($all_suggestions));

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($all_suggestions_w_freq), SORT_DESC,
        array_map('strtolower', array_keys($all_suggestions_w_freq)), SORT_ASC,
        $all_suggestions_w_freq
    );

    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);

    return array(
        $all_suggestions_w_freq,
        $all_suggestions_w_occurrences,
        $round_suggestions_w_freq,
        $round_suggestions_w_occurrences,
        $rounds,
        $round_page_count,
        $messages
    );
}
/**
 * Find the site "possible bad words" that occur in a project's latest text.
 *
 * Words already on the project's bad word list are excluded. The result is
 * sorted by frequency (descending), then by lowercased word.
 *
 * @param mixed $projectid project to analyse
 *
 * @return array a 2-element array:
 *   0 - word => frequency in the project text
 *   1 - list of messages for the caller to display
 */
function _get_word_list($projectid)
{
    $messages = array();

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));

    // load site word lists for project languages
    $site_possible_bad_words = load_site_possible_bad_words_given_project($projectid);

    // load the project's bad word list; the loader hands back an error
    // string on failure, which must not be passed to array_diff()
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }

    // now, remove any words that are already on the project's bad word list
    $site_possible_bad_words = array_diff($site_possible_bad_words, $project_bad_words);

    // $site_possible_bad_words doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in that list.
    $bad_words_w_freq = array_intersect_key($all_words_w_freq, array_flip($site_possible_bad_words));

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($bad_words_w_freq);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($bad_words_w_freq), SORT_DESC,
        array_map('strtolower', array_keys($bad_words_w_freq)), SORT_ASC,
        $bad_words_w_freq
    );

    return array($bad_words_w_freq, $messages);
}
/**
 * Find words that were changed between the OCR text and the latest text.
 *
 * The OCR text and the latest text of the project are written to temporary
 * files and compared with the external `wdiff` tool. Every word-for-word
 * replacement found is recorded as a possible scanno, then words that
 * WordCheck would flag, or that are already on the site or project bad word
 * lists, are removed.
 *
 * The script is terminated with die() if wdiff cannot be run (exit code 127)
 * or reports an error (exit code 2).
 *
 * @param mixed $projectid project to analyse
 *
 * @return array a 5-element array:
 *   0 - word => percentage of its instances that were changed ("%0.2f"),
 *       sorted by percentage (descending), then by lowercased word
 *   1 - word => frequency of the word in the latest project text
 *   2 - messages (as returned by get_bad_words_for_text(), plus any
 *       error from loading the project bad word list)
 *   3 - OCR word => the last correction seen for it
 *   4 - OCR word => number of times it was corrected
 */
function _get_word_list($projectid)
{
    global $aspell_temp_dir;

    $ocr_filename = "{$aspell_temp_dir}/{$projectid}_ocr.txt";
    $latest_filename = "{$aspell_temp_dir}/{$projectid}_latest.txt";

    $messages = array();

    // get the OCR text
    // Note: If the code is changed to allow selecting of arbitrary round text
    //       instead of just the OCR round, edit_project_word_lists.php should
    //       be updated to allow this page to be accessed for those type-in
    //       projects with no OCR text.
    $pages_res = page_info_query($projectid, '[OCR]', 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($ocr_filename, $all_page_text);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($latest_filename, $all_page_text);

    $all_words_w_freq = get_distinct_words_in_text($all_page_text);

    // clean up unused variables
    unset($all_page_text);

    // make external call to wdiff
    // (file names are escaped so that nothing in the path or project id
    // can be interpreted by the shell)
    $wdiff_output = array();
    $wdiff_command = "wdiff -3 " . escapeshellarg($ocr_filename) . " " . escapeshellarg($latest_filename);
    exec($wdiff_command, $wdiff_output, $return_code);

    // clean up the temporary files
    // (done before the exit-code checks so a wdiff failure doesn't leave
    // them behind)
    if (is_file($ocr_filename)) {
        unlink($ocr_filename);
    }
    if (is_file($latest_filename)) {
        unlink($latest_filename);
    }

    // check to see if wdiff wasn't found to execute
    if ($return_code == 127) {
        die("Error invoking wdiff to do the diff analysis. Perhaps it is not installed.");
    }
    if ($return_code == 2) {
        die("Error reported from wdiff while attempting to do the diff analysis.");
    }

    // specify the separator between the wdiff segments
    $separator = '======================================================================';

    $possible_scannos_w_correction = array();
    $possible_scannos_w_count = array();

    // parse the incoming data one segment at a time
    // from the original datastream to conserve memory
    $lineIndex = 0;
    $totalLines = count($wdiff_output);
    while ($lineIndex < $totalLines) {
        // pull the next segment
        // (strictly less-than: index $totalLines is one past the last line)
        $segment = "";
        while ($lineIndex < $totalLines) {
            $line = $wdiff_output[$lineIndex];
            $lineIndex++;
            if ($line == $separator) {
                break;
            }
            $segment .= "{$line}\n";
        }

        // note that we're handling the case where two adjacent
        // words are updated
        $ocr_words = $latest_words = array();

        // pull out the original word(s)
        if (preg_match("/\\[-(.*?)-\\]/", $segment, $matches)) {
            $ocr_words = $matches[1];
            $ocr_words = get_all_words_in_text($ocr_words);
        }

        // if we don't have any ocr_words (probably because
        // the correction spanned lines) then don't bother
        // continuing with this segment
        if (!count($ocr_words)) {
            continue;
        }

        // pull out the replacement(s)
        if (preg_match("/{\\+(.*?)\\+}/", $segment, $matches)) {
            $latest_words = $matches[1];
            $latest_words = get_all_words_in_text($latest_words);
        }

        // if the number of words isn't the same between the two
        // bail since we don't handle that case yet
        $word_count = count($ocr_words);
        if ($word_count != count($latest_words)) {
            continue;
        }

        // process the words, handles multi-words strings
        for ($index = 0; $index < $word_count; $index++) {
            $ocr_word = $ocr_words[$index];
            $latest_word = $latest_words[$index];

            // if the words are the same or one of them empty, skip it
            if ($ocr_word == $latest_word || empty($ocr_word) || empty($latest_word)) {
                continue;
            }

            $possible_scannos_w_correction[$ocr_word] = $latest_word;
            if (isset($possible_scannos_w_count[$ocr_word])) {
                $possible_scannos_w_count[$ocr_word]++;
            } else {
                $possible_scannos_w_count[$ocr_word] = 1;
            }
        }
    }

    // $wdiff_output can be very large
    // so unset it here to be nice for the rest of the function
    unset($wdiff_output);

    $possible_scannos = array_keys($possible_scannos_w_correction);

    // create a string of words to run through WordCheck
    $text_to_check = implode(" ", $possible_scannos);

    // run the list through WordCheck to see which it would flag
    list($possible_scannos_via_wordcheck, $languages, $messages) =
        get_bad_words_for_text($text_to_check, $projectid, 'all', '', array(), 'FREQS');

    // load site words
    $site_bad_words = load_site_bad_words_given_project($projectid);

    // load the project bad words; the loader hands back an error string
    // on failure, which must not be passed to array_diff()
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }

    // remove words that WordCheck would flag
    $possible_scannos = array_diff($possible_scannos, array_keys($possible_scannos_via_wordcheck));

    // remove any scannos already on the site and project bad word lists
    $possible_scannos = array_diff($possible_scannos, $site_bad_words, $project_bad_words);

    // $possible_scannos doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a key in $possible_scannos
    $possible_scannos_w_freq = array_intersect_key($all_words_w_freq, array_flip($possible_scannos));

    $percent_changed = array();
    foreach ($possible_scannos as $word) {
        $count = $possible_scannos_w_count[$word];

        // instances still in the latest text plus the ones that were changed
        $remaining = isset($possible_scannos_w_freq[$word]) ? $possible_scannos_w_freq[$word] : 0;
        $totalInstances = $remaining + $count;

        $percent_changed[$word] = sprintf("%0.2f", $count / $totalInstances * 100);

        // a word that appeared exactly once and was changed is dropped
        if ($percent_changed[$word] >= 100 && $totalInstances == 1) {
            unset($percent_changed[$word]);
        }
    }

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($percent_changed);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($percent_changed), SORT_DESC,
        array_map('strtolower', array_keys($percent_changed)), SORT_ASC,
        $percent_changed
    );

    return array(
        $percent_changed,
        $possible_scannos_w_freq,
        $messages,
        $possible_scannos_w_correction,
        $possible_scannos_w_count
    );
}
/**
 * Write this object's project settings to the `projects` table.
 *
 * If $this->projectid is set, the existing row is updated, the edit is
 * logged via log_project_event(), and the project's MARC record is updated.
 * Otherwise a new project is created: a new id is generated, a row is
 * inserted, the creation is logged, the project directory is made, the MARC
 * record is initialised (when one was supplied) and the good/bad word lists
 * are created (copied from $this->clone_projectid when cloning).
 *
 * In both cases the Dublin Core file is then (re)generated and, if
 * $this->posted is truthy, the project is transitioned to
 * PROJ_SUBMIT_PG_POSTED.
 *
 * Failures of the database calls, event logging, page-table creation and
 * mkdir terminate the script with die(); a failure to load the old project
 * data or to perform the posted transition echoes the error and exits.
 *
 * NOTE(review): several members are interpolated into the SQL without any
 * escaping, and the rest only go through addslashes() — confirm that all of
 * them are validated before this method is called.
 */
function save_to_db()
{
    global $projects_dir, $pguser;

    // An empty postednum is stored as SQL NULL, otherwise as a quoted value.
    $postednum_str = $this->postednum == "" ? "NULL" : "'{$this->postednum}'";

    // Call addslashes() on any members of $this that might contain
    // single-quotes/apostrophes (because they are unescaped, and
    // would otherwise break the query).
    // This SET-list fragment is shared by the UPDATE and the INSERT below.
    $common_project_settings = "\n t_last_edit = UNIX_TIMESTAMP(),\n up_projectid = '{$this->up_projectid}',\n nameofwork = '" . addslashes($this->nameofwork) . "',\n authorsname = '" . addslashes($this->authorsname) . "',\n language = '{$this->language}',\n genre = '{$this->genre}',\n difficulty = '{$this->difficulty_level}',\n special_code = '{$this->special_code}',\n clearance = '" . addslashes($this->clearance) . "',\n comments = '" . addslashes($this->comments) . "',\n image_source = '{$this->image_source}',\n scannercredit = '" . addslashes($this->scannercredit) . "',\n checkedoutby = '{$this->checkedoutby}',\n postednum = {$postednum_str},\n image_preparer = '{$this->image_preparer}',\n text_preparer = '{$this->text_preparer}',\n extra_credits = '" . addslashes($this->extra_credits) . "',\n deletion_reason= '" . addslashes($this->deletion_reason) . "'\n ";

    // Optional "username = ...," fragment; stays empty when the project
    // manager is not to be changed.
    // NOTE(review): every username literal in this method reads '******',
    // which looks like a redaction of the real value in this copy of the
    // file — confirm against the original source before relying on it.
    $pm_setter = '';
    if (user_is_a_sitemanager()) {
        // can change PM
        $pm_setter = " username = '******',";
    } else {
        if (isset($this->clone_projectid)) {
            // cloning a project. The PM should be the same as
            // that of the project being cloned, if the user
            // isn't an SA
            $res = mysql_query("\n SELECT username\n FROM projects\n WHERE projectid='{$this->clone_projectid}'\n ") or die(mysql_error());
            // NOTE(review): $projectmanager is fetched here but is not used
            // anywhere in the visible code (see the note on '******' above).
            list($projectmanager) = mysql_fetch_row($res);
            $pm_setter = " username = '******',";
        }
    }

    if (isset($this->projectid)) {
        // We are updating an already-existing project.

        // needn't change $pm_setter, as there is no change if the user
        // isn't an SA

        // find out what we are changing from
        $old_pih = new ProjectInfoHolder();
        $fatal_error = $old_pih->set_from_db(TRUE, $this->projectid);
        if ($fatal_error != '') {
            $fatal_error = _('site error') . ': ' . $fatal_error;
            echo "<br><center><font size='+1' color='#ff0000'><b>{$fatal_error}</b></font></center>";
            exit;
        }
        $changed_fields = get_changed_fields($this, $old_pih);

        // We're particularly interested in knowing
        // when the project comments change.
        if (!in_array('comments', $changed_fields)) {
            // no change
            $tlcc_setter = '';
        } else {
            // changed!
            $tlcc_setter = 't_last_change_comments = UNIX_TIMESTAMP(),';
        }

        // We also want to know if the edit is resulting in the project
        // effectively being checked out to a new PPer
        if ($old_pih->state == PROJ_POST_FIRST_CHECKED_OUT && in_array('checkedoutby', $changed_fields)) {
            $md_setter = 'modifieddate = UNIX_TIMESTAMP(),';
            $PPer_checkout = TRUE;
        } else {
            $md_setter = '';
            $PPer_checkout = FALSE;
        }

        // Update the projects database with the updated info
        mysql_query("\n UPDATE projects SET\n {$pm_setter}\n {$tlcc_setter}\n {$md_setter}\n {$common_project_settings}\n WHERE projectid='{$this->projectid}'\n ") or die(mysql_error());

        // The names of the changed fields, space-separated, are stored
        // with the 'edit' event.
        $details1 = implode(' ', $changed_fields);
        if ($details1 == '') {
            // There are no changed fields.
            // Don't just save '' for the details1 column,
            // because then do_history() won't be able to distinguish
            // this case (no changed fields) from old cases
            // (edit occurred before we started recording changed fields).
            // Instead, use a special value.
            $details1 = 'NONE';
        }

        // log_project_event() returns a non-empty value on failure.
        $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'edit', $details1);
        if (!empty($e)) {
            die($e);
        }

        if ($PPer_checkout) {
            // we fake the project transition...
            // (logged as a transition from PROJ_POST_FIRST_CHECKED_OUT to
            // itself, with the new checkedoutby value as the last argument)
            $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'transition', PROJ_POST_FIRST_CHECKED_OUT, PROJ_POST_FIRST_CHECKED_OUT, $this->checkedoutby);
            if (!empty($e)) {
                die($e);
            }
        }

        // Update the MARC record with any info we've received.
        $project = new Project($this->projectid);
        $marc_record = $project->load_marc_record();
        $this->update_marc_record_from_post($marc_record);
        $project->save_marc_record($marc_record);
    } else {
        // We are creating a new project
        $this->projectid = uniqid("projectID"); // The project ID

        // Neither branch above set a PM fragment, so supply one here.
        if ('' == $pm_setter) {
            $pm_setter = "username = '******',";
        }

        // Insert a new row into the projects table
        mysql_query("\n INSERT INTO projects\n SET\n projectid = '{$this->projectid}',\n {$pm_setter}\n state = '" . PROJ_NEW . "',\n modifieddate = UNIX_TIMESTAMP(),\n t_last_change_comments = UNIX_TIMESTAMP(),\n {$common_project_settings}\n ") or die(mysql_error());

        $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'creation');
        if (!empty($e)) {
            die($e);
        }

        $e = project_allow_pages($this->projectid);
        if (!empty($e)) {
            die($e);
        }

        // Make a directory in the projects_dir for this project
        mkdir("{$projects_dir}/{$this->projectid}", 0777) or die("System error: unable to mkdir '{$projects_dir}/{$this->projectid}'");
        // explicit chmod after mkdir — presumably because the mode passed
        // to mkdir() is subject to the process umask; TODO confirm
        chmod("{$projects_dir}/{$this->projectid}", 0777);

        // Do MARC record manipulations
        $project = new Project($this->projectid);
        $marc_record = new MARCRecord();

        // Save original MARC record, if provided
        // NOTE(review): unserialize() is applied to a base64 string held on
        // this object; if that value can originate from a request, this is
        // unsafe deserialisation of untrusted data — confirm its origin.
        $yaz_array = unserialize(base64_decode($this->original_marc_array_encd));
        if ($yaz_array !== FALSE) {
            $marc_record->load_yaz_array($yaz_array);
            $project->init_marc_record($marc_record);

            // Update the MARC record with data from POST
            $this->update_marc_record_from_post($marc_record);
            $project->save_marc_record($marc_record);
        }

        // Create the project's 'good word list' and 'bad word list'.
        if (isset($this->clone_projectid)) {
            // We're creating a project via cloning.
            // Copy the original project's word-lists.

            $good_words = load_project_good_words($this->clone_projectid);
            if (is_string($good_words)) {
                // It's an error message.
                echo "{$good_words}<br>\n";
                $good_words = array();
            }

            $bad_words = load_project_bad_words($this->clone_projectid);
            if (is_string($bad_words)) {
                // It's an error message.
                echo "{$bad_words}<br>\n";
                $bad_words = array();
            }
        } else {
            // We're creating a project by means other than cloning
            // (from_nothing, from_marc_record, from_uberproject).
            // Initialize its GWL and BWL to empty.

            $good_words = array();
            $bad_words = array();
        }

        save_project_good_words($this->projectid, $good_words);
        save_project_bad_words($this->projectid, $bad_words);
    }

    // Create/update the Dublin Core file for the project.
    // When we get here, the project's database entry has been fully
    // updated, so we can create a Project object and allow it
    // to pull the relevant fields from the database.
    // ($marc_record was assigned in whichever branch ran above.)
    $project = new Project($this->projectid);
    $project->create_dc_xml_oai($marc_record);

    // If the project has been posted to PG, make the appropriate transition.
    if ($this->posted) {
        $err = project_transition($this->projectid, PROJ_SUBMIT_PG_POSTED, $pguser);
        if ($err != '') {
            echo "{$err}<br>\n";
            exit;
        }
    }
}
// Page-rendering fragment: emits the header/instruction parts of a
// word-frequency page for $bad_words_w_freq and prepares the per-word
// context links, notes and checkboxes.
// NOTE(review): $freqCutoff, $bad_words_w_freq, $instances, $page_text,
// $format, $projectid and $messages are defined outside this fragment, and
// $context_array, $word_notes, $word_checkbox and $checkbox_form are
// presumably consumed after it — verify against the rest of the file.

// the frequency cutoffs offered to the user
$cutoffOptions = array(1, 2, 3, 4, 5, 10, 25, 50);

// what is the initial cutoff frequency?
$initialFreq = getInitialCutoff($freqCutoff, $cutoffOptions, $bad_words_w_freq);

// echo page support text, like JS and stylesheets
echo_cutoff_script($cutoffOptions, $instances);
echo_word_freq_style();

echo "<p>{$page_text}</p>";

// NOTE(review): "good" is passed here although the rest of this fragment
// deals with bad words — confirm this is the intended instruction text.
echo_page_instruction_text("good", $format);

echo_download_text($projectid, $format);

echo_any_warnings_errors($messages);

echo_cutoff_text($initialFreq, $cutoffOptions);

$context_array = build_context_array_links($bad_words_w_freq, $projectid);

// load site bad words
$site_bad_words = load_site_bad_words_given_project($projectid);

// load project bad words
$project_bad_words = load_project_bad_words($projectid);

// Build a per-word note. The project loop runs second, so a word on both
// lists ends up with the project note.
$word_notes = array();
foreach ($site_bad_words as $word) {
    $word_notes[$word] = _("On site BWL");
}
foreach ($project_bad_words as $word) {
    $word_notes[$word] = _("On project BWL");
}

// the "[[TITLE]]" key holds the column heading for each array
$context_array["[[TITLE]]"] = _("Show Context");
$word_notes["[[TITLE]]"] = _("Notes");

// report the outcome of an earlier update, unless it succeeded
if (isset($update_status) && $update_status !== "Success") {
    echo_any_warnings_errors(array($update_status));
}

$word_checkbox = build_checkbox_array($bad_words_w_freq);

echo_checkbox_selects(count($bad_words_w_freq));

$checkbox_form["projectid"] = $projectid;
/**
 * Carry out the requested action for the cross-project word list page.
 *
 * For the "show" action, the good or bad word lists of every project whose
 * language matches are loaded, the number of projects using each word is
 * counted, and a report is echoed listing the words whose share of projects
 * (relative to the projects that have a word list) is at least $cutoff
 * percent. For the "list" action nothing is output.
 *
 * The script is terminated with die() on an unknown action, language match
 * type or list type, or if the project query fails.
 *
 * @param string $action     "show" or "list"
 * @param string $list_type  "good" or "bad"
 * @param string $language   language to match projects against
 * @param mixed  $cutoff     minimum percentage for a word to be shown
 * @param string $lang_match "exact", "primary" or "any"
 *
 * @return bool TRUE if the caller should display the list, FALSE otherwise
 */
function _handle_action($action, $list_type, $language, $cutoff, $lang_match)
{
    $display_list = FALSE;

    switch ($action) {
        case "show":
            $word_freq = array();
            $total_projects = 0;
            $total_projects_with_words = 0;

            // the language ends up inside a quoted SQL string, so escape it
            $sql_language = mysql_real_escape_string($language);

            // figure out what kind of language matching we're going to use
            $where_clause = "";
            switch ($lang_match) {
                case "exact":
                    $where_clause = "language = '{$sql_language}'";
                    break;
                case "primary":
                    $where_clause = "language like '{$sql_language}%'";
                    break;
                case "any":
                    $where_clause = "language like '%{$sql_language}%'";
                    break;
                default:
                    die("Unknown language match used: {$lang_match}");
            }

            // loop through all projects that use $language
            $res = mysql_query("\n SELECT projectid\n FROM projects\n WHERE {$where_clause}\n ") or die(mysql_error());
            while (list($projectid) = mysql_fetch_row($res)) {
                if ($list_type == "good") {
                    $words = load_project_good_words($projectid);
                } elseif ($list_type == "bad") {
                    $words = load_project_bad_words($projectid);
                } else {
                    die("Unknown list type: {$list_type}");
                }

                // the loaders hand back an error string on failure;
                // treat that project as having no word list
                if (!is_array($words)) {
                    $words = array();
                }

                foreach ($words as $word) {
                    if (isset($word_freq[$word])) {
                        $word_freq[$word]++;
                    } else {
                        $word_freq[$word] = 1;
                    }
                }

                if (count($words)) {
                    $total_projects_with_words++;
                }
                $total_projects++;
            }
            mysql_free_result($res);

            // sort the results, most frequent first
            arsort($word_freq);

            // show the results
            // NOTE(review): $language and the words are echoed into HTML
            // without escaping — confirm they cannot contain markup.
            echo "<pre>";
            echo _("Language") . ": {$language}<br>";
            echo sprintf(_("Language match type: %s"), $lang_match) . "<br>";
            echo sprintf(_("Word list type: %s"), $list_type) . "<br>";
            echo sprintf(_("Cutoff percentage: %d%%"), $cutoff) . "<br>";
            echo sprintf(_("Total projects matching language: %d"), $total_projects) . "<br>";
            echo sprintf(_("Total projects with word lists: %d"), $total_projects_with_words) . "<br>";
            echo "<br>";
            echo _("Note: Percentages are calculated as frequency over the total number of projects with word lists.") . "<br>";
            echo "<br>";
            echo sprintf("%20s %5s %s<br>", _("Word"), _("Count"), _("Frequency"));
            foreach ($word_freq as $word => $freq) {
                // $total_projects_with_words is at least 1 here, because
                // $word_freq is only non-empty if some project had words
                $percentage = $freq / $total_projects_with_words * 100;

                // the list is sorted, so everything after this is below the cutoff too
                if ($percentage < $cutoff) {
                    break;
                }
                echo sprintf("%20s %5d (%-3.2f%%)<br>", $word, $freq, $percentage);
            }
            echo "</pre>";
            break;

        case "list":
            $display_list = TRUE;
            break;

        default:
            die("Invalid action encountered.");
    }

    return $display_list;
}
/**
 * Merge the WordCheck data of one project into another.
 *
 * The good words and bad words of $from_id are appended to those of $to_id
 * and saved, and the raw good-word-suggestions file of $from_id is appended
 * to that of $to_id.
 *
 * If a word list cannot be loaded (the loaders hand back an error string),
 * that list is left untouched in the destination and a warning is raised,
 * rather than saving a corrupted list.
 *
 * @param mixed $from_id project to copy the data from
 * @param mixed $to_id   project to merge the data into
 *
 * @return void
 */
function merge_wordcheck_files($from_id, $to_id)
{
    global $projects_dir;

    // good words
    $from_words = load_project_good_words($from_id);
    $to_words = load_project_good_words($to_id);
    if (is_string($from_words)) {
        trigger_error("merge_wordcheck_files: {$from_words}", E_USER_WARNING);
    } elseif (is_string($to_words)) {
        trigger_error("merge_wordcheck_files: {$to_words}", E_USER_WARNING);
    } else {
        $to_words = array_merge($to_words, $from_words);
        save_project_good_words($to_id, $to_words);
    }

    // crying out for some abstraction here?

    // bad words
    $from_words = load_project_bad_words($from_id);
    $to_words = load_project_bad_words($to_id);
    if (is_string($from_words)) {
        trigger_error("merge_wordcheck_files: {$from_words}", E_USER_WARNING);
    } elseif (is_string($to_words)) {
        trigger_error("merge_wordcheck_files: {$to_words}", E_USER_WARNING);
    } else {
        $to_words = array_merge($to_words, $from_words);
        save_project_bad_words($to_id, $to_words);
    }

    // suggestions
    // the file format is complicated and may change
    // so we take the sledgehammer approach, as suggested by cpeel...
    $from_path = "{$projects_dir}/{$from_id}/good_word_suggestions.txt";
    if (!is_file($from_path)) {
        // The file does not exist.
        // Treat that the same as if it existed and was empty.
        $from_suggs = "";
    } else {
        $from_suggs = file_get_contents($from_path);
    }

    $to_path = "{$projects_dir}/{$to_id}/good_word_suggestions.txt";
    if (!is_file($to_path)) {
        // The file does not exist.
        // Treat that the same as if it existed and was empty.
        $to_suggs = "";
    } else {
        $to_suggs = file_get_contents($to_path);
    }

    if (file_put_contents($to_path, $to_suggs . $from_suggs) === FALSE) {
        trigger_error("merge_wordcheck_files: unable to write '{$to_path}'", E_USER_WARNING);
    }

    // we're assuming the projects are in unavailable or waiting, so there
    // is going to be no need to put locks on the files or anything fancy
}
/**
 * Populate this object's word-list members from the project's word files.
 *
 * For each requested list, the file's modification time is stored in
 * gwl_timestamp / bwl_timestamp and the words are stored, one per line, in
 * good_words / bad_words. If a list cannot be loaded, the corresponding
 * member is set to '' and the error message is collected.
 *
 * @param bool $load_good_words whether to load the good word list
 * @param bool $load_bad_words  whether to load the bad word list
 *
 * @return array error messages from the loaders (empty when all went well)
 */
function set_from_files($load_good_words = true, $load_bad_words = true)
{
    $problems = array();

    if ($load_good_words) {
        $good_file = get_project_word_file($this->projectid, "good");
        $this->gwl_timestamp = $good_file->mod_time;

        // the loader hands back a string when something went wrong
        $loaded_good = load_project_good_words($this->projectid);
        if (!is_string($loaded_good)) {
            $this->good_words = implode("\n", $loaded_good);
        } else {
            $this->good_words = '';
            $problems[] = $loaded_good;
        }
    }

    if ($load_bad_words) {
        $bad_file = get_project_word_file($this->projectid, "bad");
        $this->bwl_timestamp = $bad_file->mod_time;

        // the loader hands back a string when something went wrong
        $loaded_bad = load_project_bad_words($this->projectid);
        if (!is_string($loaded_bad)) {
            $this->bad_words = implode("\n", $loaded_bad);
        } else {
            $this->bad_words = '';
            $problems[] = $loaded_bad;
        }
    }

    return $problems;
}