/**
 * Collect the good-word suggestions made for a project and pair them with
 * how often each suggested word occurs in the latest project text.
 *
 * Words that are already on the project's good or bad word lists are
 * excluded from every returned list.
 *
 * @param mixed $projectid  Project identifier, passed through to the loaders.
 * @param mixed $timeCutoff Passed through to load_project_good_word_suggestions().
 *
 * @return array Seven elements, in this order:
 *   0: all suggestions => frequency in the project text, sorted by
 *      frequency (descending) then lower-cased word (ascending)
 *   1: generate_frequencies() of all suggestions
 *   2: per-round version of element 0, keyed by round
 *   3: per-round version of element 1, keyed by round
 *   4: list of rounds (the keys of element 2)
 *   5: number of pages with suggestions, keyed by round
 *   6: list of messages for the caller
 *   When the suggestions cannot be loaded, or there are none, elements 0-5
 *   are empty arrays.
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();

    // load the suggestions; anything other than an array is treated as an
    // error value and reported back to the caller
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    // load project good words
    $project_good_words = load_project_good_words($projectid);

    // load project bad words
    $project_bad_words = load_project_bad_words($projectid);

    // Words already on either project list are never reported. This does not
    // change between rounds, so build the combined list once up front.
    $already_listed_words = array_merge($project_good_words, $project_bad_words);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));

    // array to hold all words
    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_occurrences = array();
    $round_suggestions_w_freq = array();

    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        $pages_in_round = 0;

        foreach ($pageArray as $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);
            $pages_in_round++;
        }

        // Every round gets an explicit page count (0 for a round without
        // pages), so callers iterating over the returned rounds never hit
        // a missing key.
        $round_page_count[$round] = $pages_in_round;

        // add this round's (still unfiltered) words to the combined array;
        // merging once per round instead of once per page avoids repeatedly
        // copying the growing combined array
        $all_suggestions = array_merge($all_suggestions, $round_suggestions);

        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $already_listed_words);

        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);

        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key(
            $all_words_w_freq,
            array_flip($round_suggestions)
        );

        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);

        // sort the list by frequency, then by word
        array_multisort(
            array_values($round_suggestions_w_freq[$round]),
            SORT_DESC,
            array_map('strtolower', array_keys($round_suggestions_w_freq[$round])),
            SORT_ASC,
            $round_suggestions_w_freq[$round]
        );
    }

    // now, remove any words that are already on the project's good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $already_listed_words);

    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);

    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key($all_words_w_freq, array_flip($all_suggestions));

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($all_suggestions_w_freq),
        SORT_DESC,
        array_map('strtolower', array_keys($all_suggestions_w_freq)),
        SORT_ASC,
        $all_suggestions_w_freq
    );

    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);

    return array(
        $all_suggestions_w_freq,
        $all_suggestions_w_occurrences,
        $round_suggestions_w_freq,
        $round_suggestions_w_occurrences,
        $rounds,
        $round_page_count,
        $messages,
    );
}
/**
 * Find possible scannos (stealth OCR errors) in a project by word-diffing the
 * OCR text against the latest text with the external `wdiff` tool.
 *
 * A word is a candidate when it was replaced one-for-one by a different word,
 * WordCheck would not already flag it, and it is not on the site or project
 * bad word lists.
 *
 * Terminates the request with die() when wdiff cannot be run (exit code 127)
 * or reports an error (exit code 2).
 *
 * @param mixed $projectid Project identifier; also used to name the temp files.
 *
 * @return array Five elements, in this order:
 *   0: candidate word => percentage of its instances that were changed
 *      (string formatted with two decimals), sorted by percentage
 *      (descending) then lower-cased word (ascending)
 *   1: candidate word => frequency in the latest project text
 *   2: messages returned by get_bad_words_for_text()
 *   3: OCR word => the word it was last seen corrected to (unfiltered)
 *   4: OCR word => number of times it was corrected (unfiltered)
 */
function _get_word_list($projectid)
{
    global $aspell_temp_dir;

    $ocr_filename = "{$aspell_temp_dir}/{$projectid}_ocr.txt";
    $latest_filename = "{$aspell_temp_dir}/{$projectid}_latest.txt";

    $messages = array();

    // get the OCR text
    // Note: If the code is changed to allow selecting of arbitrary round text
    //       instead of just the OCR round, edit_project_word_lists.php should
    //       be updated to allow this page to be accessed for those type-in
    //       projects with no OCR text.
    $pages_res = page_info_query($projectid, '[OCR]', 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($ocr_filename, $all_page_text);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($latest_filename, $all_page_text);

    $all_words_w_freq = get_distinct_words_in_text($all_page_text);

    // clean up unused variables
    unset($all_page_text);

    // make external call to wdiff; the file names are shell-escaped because
    // they are built from $projectid and a configured directory
    $wdiff_command = "wdiff -3 " . escapeshellarg($ocr_filename) . " " . escapeshellarg($latest_filename);
    exec($wdiff_command, $wdiff_output, $return_code);

    // clean up the temporary files right away, before looking at the return
    // code, so that a wdiff failure does not leave them behind
    foreach (array($ocr_filename, $latest_filename) as $temp_filename) {
        if (is_file($temp_filename)) {
            unlink($temp_filename);
        }
    }

    // check to see if wdiff wasn't found to execute
    if ($return_code == 127) {
        die("Error invoking wdiff to do the diff analysis. Perhaps it is not installed.");
    }
    if ($return_code == 2) {
        die("Error reported from wdiff while attempting to do the diff analysis.");
    }

    // specify the separator between the wdiff segments
    $separator = '======================================================================';

    $possible_scannos_w_correction = array();
    $possible_scannos_w_count = array();

    // parse the incoming data one segment at a time
    // from the original datastream to conserve memory
    $lineIndex = 0;
    $totalLines = count($wdiff_output);
    while ($lineIndex < $totalLines) {
        // pull the next segment; stop at the separator or at the end of the
        // output (strictly less-than, so we never read past the last line)
        $segment = "";
        while ($lineIndex < $totalLines) {
            $line = $wdiff_output[$lineIndex];
            $lineIndex++;
            if ($line == $separator) {
                break;
            }
            $segment .= "{$line}\n";
        }

        // note that we're handling the case where two adjacent
        // words are updated
        $ocr_words = $latest_words = array();

        // pull out the original word(s)
        if (preg_match("/\\[-(.*?)-\\]/", $segment, $matches)) {
            $ocr_words = get_all_words_in_text($matches[1]);
        }

        // if we don't have any ocr_words (probably because
        // the correction spanned lines) then don't bother
        // continuing with this segment
        if (!count($ocr_words)) {
            continue;
        }

        // pull out the replacement(s)
        if (preg_match("/{\\+(.*?)\\+}/", $segment, $matches)) {
            $latest_words = get_all_words_in_text($matches[1]);
        }

        // if the number of words isn't the same between the two
        // bail since we don't handle that case yet
        $word_count = count($ocr_words);
        if ($word_count != count($latest_words)) {
            continue;
        }

        // process the words, handles multi-words strings
        for ($index = 0; $index < $word_count; $index++) {
            $ocr_word = $ocr_words[$index];
            $latest_word = $latest_words[$index];

            // if the words are the same or one of them empty, skip it
            if ($ocr_word == $latest_word || empty($ocr_word) || empty($latest_word)) {
                continue;
            }

            // remember the most recent correction and count the occurrence
            $possible_scannos_w_correction[$ocr_word] = $latest_word;
            if (!isset($possible_scannos_w_count[$ocr_word])) {
                $possible_scannos_w_count[$ocr_word] = 0;
            }
            $possible_scannos_w_count[$ocr_word]++;
        }
    }

    // $wdiff_output can be very large
    // so unset it here to be nice for the rest of the function
    unset($wdiff_output);

    $possible_scannos = array_keys($possible_scannos_w_correction);

    // create a string of words to run through WordCheck
    $text_to_check = implode(" ", $possible_scannos);

    // run the list through WordCheck to see which it would flag
    list($possible_scannos_via_wordcheck, $languages, $messages) =
        get_bad_words_for_text($text_to_check, $projectid, 'all', '', array(), 'FREQS');

    // load site words
    $site_bad_words = load_site_bad_words_given_project($projectid);

    // load the project bad words
    $project_bad_words = load_project_bad_words($projectid);

    // remove words that WordCheck would flag
    $possible_scannos = array_diff($possible_scannos, array_keys($possible_scannos_via_wordcheck));

    // remove any scannos already on the site and project bad word lists
    $possible_scannos = array_diff($possible_scannos, $site_bad_words, $project_bad_words);

    // $possible_scannos doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in $possible_scannos
    $possible_scannos_w_freq = array_intersect_key($all_words_w_freq, array_flip($possible_scannos));

    $percent_changed = array();
    foreach ($possible_scannos as $word) {
        $count = $possible_scannos_w_count[$word];

        // a word that was corrected everywhere no longer appears in the
        // latest text, so it has no frequency entry; treat that as zero
        $remaining = isset($possible_scannos_w_freq[$word]) ? $possible_scannos_w_freq[$word] : 0;
        $totalInstances = $remaining + $count;

        $percent_changed[$word] = sprintf("%0.2f", $count / $totalInstances * 100);

        // a word that occurred exactly once and was changed is dropped
        if ($percent_changed[$word] >= 100 && $totalInstances == 1) {
            unset($percent_changed[$word]);
        }
    }

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($percent_changed);

    // sort the list by percentage changed, then by word
    array_multisort(
        array_values($percent_changed),
        SORT_DESC,
        array_map('strtolower', array_keys($percent_changed)),
        SORT_ASC,
        $percent_changed
    );

    return array(
        $percent_changed,
        $possible_scannos_w_freq,
        $messages,
        $possible_scannos_w_correction,
        $possible_scannos_w_count,
    );
}
/**
 * List the site's "possible bad words" that occur in the latest project text,
 * together with how often each one occurs.
 *
 * Words already on the project's bad word list are left out.
 *
 * @param mixed $projectid Project identifier, passed through to the loaders.
 *
 * @return array Two elements: word => frequency (sorted by frequency
 *               descending, then lower-cased word ascending), and a list of
 *               messages (always empty here).
 */
function _get_word_list($projectid)
{
    $messages = array();

    // Frequency of every distinct word in the latest text of all pages,
    // looking at rounds up to the last possible one.
    $final_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $page_result = page_info_query($projectid, $final_round->id, 'LE');
    $project_text = get_page_texts($page_result);
    $word_frequencies = get_distinct_words_in_text($project_text);

    // Site-level candidates for the project's languages, minus anything the
    // project has already put on its own bad word list.
    $candidates = load_site_possible_bad_words_given_project($projectid);
    $project_bad = load_project_bad_words($projectid);
    $candidates = array_diff($candidates, $project_bad);

    // The candidate list carries no counts; keep only those frequency
    // entries whose word is one of the candidates.
    $bad_words_w_freq = array_intersect_key($word_frequencies, array_flip($candidates));

    // multisort mishandles all-numeric words, so preprocess the keys first
    prep_numeric_keys_for_multisort($bad_words_w_freq);

    // Order by frequency (high to low), breaking ties by lower-cased word.
    $frequency_column = array_values($bad_words_w_freq);
    $word_column = array_map('strtolower', array_keys($bad_words_w_freq));
    array_multisort($frequency_column, SORT_DESC, $word_column, SORT_ASC, $bad_words_w_freq);

    return array($bad_words_w_freq, $messages);
}
/**
 * Run the latest project text through WordCheck and return the flagged words
 * with their frequencies.
 *
 * @param mixed $projectid Project identifier, passed through to the helpers.
 *
 * @return array Two elements: flagged word => frequency (sorted by frequency
 *               descending, then lower-cased word ascending), and the
 *               messages returned by get_bad_words_for_text().
 */
function _get_word_list($projectid)
{
    // Latest text of all pages, looking at rounds up to the last possible one.
    $final_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $page_result = page_info_query($projectid, $final_round->id, 'LE');
    $project_text = get_page_texts($page_result);

    // WordCheck hands back three values; the middle one (languages) is not
    // needed here and is skipped.
    list($bad_words_w_freq, , $messages) = get_bad_words_for_text(
        $project_text,
        $projectid,
        'all',
        '',
        array(),
        'FREQS'
    );

    // multisort mishandles all-numeric words, so preprocess the keys first
    prep_numeric_keys_for_multisort($bad_words_w_freq);

    // Order by frequency (high to low), breaking ties by lower-cased word.
    $frequency_column = array_values($bad_words_w_freq);
    $word_column = array_map('strtolower', array_keys($bad_words_w_freq));
    array_multisort($frequency_column, SORT_DESC, $word_column, SORT_ASC, $bad_words_w_freq);

    return array($bad_words_w_freq, $messages);
}
/**
 * Look up how often each of the given words occurs in the latest project text.
 *
 * Words that do not occur in the text are omitted from the result.
 *
 * @param mixed $projectid  Project identifier, passed through to the helpers.
 * @param array $queryWords Words to look up.
 *
 * @return array Two elements: word => frequency (sorted by frequency
 *               descending, then lower-cased word ascending), and a list of
 *               messages (always empty here).
 */
function _get_word_list($projectid, $queryWords)
{
    $messages = array();

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $page_texts = get_page_texts($pages_res);

    // count every distinct word in the text
    $all_words_w_freq = get_distinct_words_in_text($page_texts);

    // keep only the queried words that actually occur; !empty() covers both
    // "no entry" and "zero count" without needing error suppression
    $words_w_freq = array();
    foreach ($queryWords as $word) {
        if (!empty($all_words_w_freq[$word])) {
            $words_w_freq[$word] = $all_words_w_freq[$word];
        }
    }

    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($words_w_freq);

    // sort the list by frequency, then by word
    array_multisort(
        array_values($words_w_freq),
        SORT_DESC,
        array_map('strtolower', array_keys($words_w_freq)),
        SORT_ASC,
        $words_w_freq
    );

    return array($words_w_freq, $messages);
}
$project_name = get_project_name($projectid); // TRANSLATORS: %1$s is a word, %2$s is the project name. echo "<h2>", sprintf(_("Context for '%1\$s' in %2\$s"), $word, $project_name), "</h2>"; echo_word_freq_style(); echo "<p>"; echo "<a target='_PARENT' href='" . attr_safe($_SERVER['PHP_SELF']) . "?projectid={$projectid}&word={$encWord}&wordInstances={$wordInstances}&"; if ($layout == LAYOUT_HORIZ) { echo "layout=" . LAYOUT_VERT . "'>" . _("Change to vertical layout"); } else { echo "layout=" . LAYOUT_HORIZ . "'>" . _("Change to horizontal layout"); } echo "</a>"; echo "</p>"; // get the latest possible round $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS); $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE'); // iterate through all the pages until we find $wordInstances of the word // we're looking for $foundInstances = 0; while (list($page_text, $page, $proofer_names) = page_info_fetch($pages_res)) { // get a context string list($context_strings, $totalLines) = _get_word_context_from_text($page_text, $word); if (!count($context_strings)) { continue; } echo "<p>"; echo "<b>" . _("Page") . "</b>: <a href='displayimage.php?project={$projectid}&imagefile={$page}&showreturnlink=0' target='imageframe'>{$page}</a><br>"; foreach ($context_strings as $lineNum => $context_string) { $context_string = _highlight_word(html_safe($context_string, ENT_NOQUOTES), $word); echo "<b>", _("Line"), "</b>: ", sprintf(_('~%1$d of %2$d'), $lineNum, $totalLines), " | "; echo "<b>" . _("Context") . "</b>:<br><span class='mono'>{$context_string}</span><br>";