get_distinct_words_in_text PHP Code Examples

Example #1

0

Show file

File: show_good_word_suggestions.php Project: cpeel/dproofreaders-shadow

/**
 * Collect the good-word suggestions for a project, both combined and per round.
 *
 * Suggestions that are already on the project's good or bad word lists are
 * dropped. The remaining words are paired with their frequency in the latest
 * project text and sorted by frequency (descending), then by lowercased word.
 *
 * @param mixed $projectid  project identifier, passed through to the loaders
 * @param mixed $timeCutoff passed through to load_project_good_word_suggestions()
 *
 * @return array a 7-element list:
 *   0: combined suggestions, word => frequency in the project text
 *   1: result of generate_frequencies() on the combined suggestions
 *   2: per-round suggestions, round => (word => frequency in the project text)
 *   3: per-round result of generate_frequencies(), keyed by round
 *   4: the list of rounds that had suggestions
 *   5: round => number of pages with suggestions
 *   6: list of messages (only filled when loading the suggestions failed)
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();
    // load the suggestions; a non-array result is treated as an error description
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }
    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }
    // load project good words
    $project_good_words = load_project_good_words($projectid);
    // load project bad words
    $project_bad_words = load_project_bad_words($projectid);
    // words already on either list are never reported; the list does not
    // change between rounds, so build it once instead of once per round
    $already_listed_words = array_merge($project_good_words, $project_bad_words);
    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));
    // array to hold all words
    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_occurrences = array();
    $round_suggestions_w_freq = array();
    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        foreach ($pageArray as $page => $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);
            // count the pages that carried suggestions in this round
            if (!isset($round_page_count[$round])) {
                $round_page_count[$round] = 0;
            }
            $round_page_count[$round]++;
        }
        // add the round's (still unfiltered) words to the combined array;
        // merging once per round instead of once per page gives the same
        // result without re-copying the growing combined list for every page
        $all_suggestions = array_merge($all_suggestions, $round_suggestions);
        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $already_listed_words);
        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);
        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key($all_words_w_freq, array_flip($round_suggestions));
        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);
        // sort the list by frequency, then by word
        array_multisort(array_values($round_suggestions_w_freq[$round]), SORT_DESC, array_map('strtolower', array_keys($round_suggestions_w_freq[$round])), SORT_ASC, $round_suggestions_w_freq[$round]);
    }
    // now, remove any words that are already on the project's good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $already_listed_words);
    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);
    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key($all_words_w_freq, array_flip($all_suggestions));
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);
    // sort the list by frequency, then by word
    array_multisort(array_values($all_suggestions_w_freq), SORT_DESC, array_map('strtolower', array_keys($all_suggestions_w_freq)), SORT_ASC, $all_suggestions_w_freq);
    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);
    return array($all_suggestions_w_freq, $all_suggestions_w_occurrences, $round_suggestions_w_freq, $round_suggestions_w_occurrences, $rounds, $round_page_count, $messages);
}

Example #2

0

Show file

File: show_project_stealth_scannos.php Project: cpeel/dproofreaders-shadow

/**
 * Find possible "stealth scannos": OCR words that proofreaders changed into
 * another word, and that WordCheck would not flag on its own.
 *
 * The OCR text and the latest text are written to temporary files under
 * $aspell_temp_dir and compared with the external `wdiff` tool. Each diff
 * segment that replaces N words with N words yields OCR-word => correction
 * pairs. Words WordCheck would flag, and words already on the site or project
 * bad word lists, are removed from the candidate list.
 *
 * Terminates the script via die() when wdiff exits with code 127 or 2.
 *
 * @param mixed $projectid project identifier, passed through to the loaders
 *
 * @return array a 5-element list:
 *   0: word => percentage (formatted "%0.2f") of instances that were changed,
 *      sorted by percentage descending, then by lowercased word
 *   1: word => frequency in the latest text, for the remaining candidates
 *   2: messages returned by get_bad_words_for_text()
 *   3: OCR word => last seen correction (unfiltered)
 *   4: OCR word => number of times it was changed (unfiltered)
 */
function _get_word_list($projectid)
{
    global $aspell_temp_dir;
    $ocr_filename = "{$aspell_temp_dir}/{$projectid}_ocr.txt";
    $latest_filename = "{$aspell_temp_dir}/{$projectid}_latest.txt";
    $messages = array();
    // get the OCR text
    // Note: If the code is changed to allow selecting of arbitrary round text
    //       instead of just the OCR round, edit_project_word_lists.php should
    //       be updated to allow this page to be accessed for those type-in
    //       projects with no OCR text.
    $pages_res = page_info_query($projectid, '[OCR]', 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($ocr_filename, $all_page_text);
    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($latest_filename, $all_page_text);
    $all_words_w_freq = get_distinct_words_in_text($all_page_text);
    // clean up unused variables
    unset($all_page_text);
    // make external call to wdiff; the paths are escaped so that a temp dir
    // or project id containing shell metacharacters cannot alter the command
    $wdiff_output = array();
    $return_code = 0;
    exec("wdiff -3 " . escapeshellarg($ocr_filename) . " " . escapeshellarg($latest_filename), $wdiff_output, $return_code);
    // clean up the temporary files right away, before any die() below,
    // so a failed wdiff run doesn't leave them behind
    foreach (array($ocr_filename, $latest_filename) as $temp_filename) {
        if (is_file($temp_filename)) {
            unlink($temp_filename);
        }
    }
    // check to see if wdiff wasn't found to execute
    if ($return_code == 127) {
        die("Error invoking wdiff to do the diff analysis. Perhaps it is not installed.");
    }
    if ($return_code == 2) {
        die("Error reported from wdiff while attempting to do the diff analysis.");
    }
    // specify the separator between the wdiff segments
    $separator = '======================================================================';
    $possible_scannos_w_correction = array();
    $possible_scannos_w_count = array();
    // parse the incoming data one segment at a time
    // from the original datastream to conserve memory
    $lineIndex = 0;
    $totalLines = count($wdiff_output);
    while ($lineIndex < $totalLines) {
        // pull the next segment; stop at the separator or at the last line
        // (strictly less-than: index $totalLines is past the end of the array)
        $segment = "";
        while ($lineIndex < $totalLines) {
            $line = $wdiff_output[$lineIndex];
            $lineIndex++;
            if ($line == $separator) {
                break;
            }
            $segment .= "{$line}\n";
        }
        // note that we're handling the case where two adjacent
        // words are updated
        $ocr_words = $latest_words = array();
        // pull out the original word(s)
        if (preg_match("/\\[-(.*?)-\\]/", $segment, $matches)) {
            $ocr_words = get_all_words_in_text($matches[1]);
        }
        // if we don't have any ocr_words (probably because
        // the correction spanned lines) then don't bother
        // continuing with this segment
        if (!count($ocr_words)) {
            continue;
        }
        // pull out the replacement(s)
        if (preg_match("/{\\+(.*?)\\+}/", $segment, $matches)) {
            $latest_words = get_all_words_in_text($matches[1]);
        }
        // if the number of words isn't the same between the two
        // bail since we don't handle that case yet
        $word_count = count($ocr_words);
        if ($word_count != count($latest_words)) {
            continue;
        }
        // process the words, handles multi-words strings
        for ($index = 0; $index < $word_count; $index++) {
            $ocr_word = $ocr_words[$index];
            $latest_word = $latest_words[$index];
            // if the words are the same or one of them empty, skip it
            if ($ocr_word == $latest_word || empty($ocr_word) || empty($latest_word)) {
                continue;
            }
            // only the most recent correction for a word is remembered
            $possible_scannos_w_correction[$ocr_word] = $latest_word;
            if (!isset($possible_scannos_w_count[$ocr_word])) {
                $possible_scannos_w_count[$ocr_word] = 0;
            }
            $possible_scannos_w_count[$ocr_word]++;
        }
    }
    // $wdiff_output can be very large
    // so unset it here to be nice for the rest of the function
    unset($wdiff_output);
    $possible_scannos = array_keys($possible_scannos_w_correction);
    // create a string of words to run through WordCheck
    $text_to_check = implode(" ", $possible_scannos);
    // run the list through WordCheck to see which it would flag
    // (this replaces $messages; $languages is not used further)
    list($possible_scannos_via_wordcheck, $languages, $messages) = get_bad_words_for_text($text_to_check, $projectid, 'all', '', array(), 'FREQS');
    // load site words
    $site_bad_words = load_site_bad_words_given_project($projectid);
    // load the project bad words
    $project_bad_words = load_project_bad_words($projectid);
    // remove words that WordCheck would flag
    $possible_scannos = array_diff($possible_scannos, array_keys($possible_scannos_via_wordcheck));
    // remove any scannos already on the site and project bad word lists
    $possible_scannos = array_diff($possible_scannos, $site_bad_words, $project_bad_words);
    // $possible_scannos doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in $possible_scannos
    $possible_scannos_w_freq = array_intersect_key($all_words_w_freq, array_flip($possible_scannos));
    $percent_changed = array();
    foreach ($possible_scannos as $word) {
        $count = $possible_scannos_w_count[$word];
        // a word that was corrected everywhere no longer occurs in the latest text
        $remaining = isset($possible_scannos_w_freq[$word]) ? $possible_scannos_w_freq[$word] : 0;
        $totalInstances = $remaining + $count;
        $percent_changed[$word] = sprintf("%0.2f", $count / $totalInstances * 100);
        // a single instance that was changed once is not reported
        if ($percent_changed[$word] >= 100 && $totalInstances == 1) {
            unset($percent_changed[$word]);
        }
    }
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($percent_changed);
    // sort the list by percentage changed, then by word
    array_multisort(array_values($percent_changed), SORT_DESC, array_map('strtolower', array_keys($percent_changed)), SORT_ASC, $percent_changed);
    return array($percent_changed, $possible_scannos_w_freq, $messages, $possible_scannos_w_correction, $possible_scannos_w_count);
}

Example #3

0

Show file

File: show_adhoc_word_details.php Project: cpeel/dproofreaders-shadow

/**
 * Look up how often each requested word occurs in the latest project text.
 *
 * Words with no (or a zero) frequency are left out. The result is sorted by
 * frequency (descending), then by lowercased word.
 *
 * @param mixed $projectid  project identifier, passed through to page_info_query()
 * @param array $queryWords the words to look up
 *
 * @return array a 2-element list: (word => frequency), and a list of messages
 *               (always empty here)
 */
function _get_word_list($projectid, $queryWords)
{
    $messages = array();

    // fetch the text of every page as of the last possible round
    $final_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $page_result = page_info_query($projectid, $final_round->id, 'LE');
    $texts = get_page_texts($page_result);

    // frequency table for every distinct word in that text
    $frequency_of = get_distinct_words_in_text($texts);

    // keep only the requested words that actually occur
    $words_w_freq = array();
    foreach ($queryWords as $requested) {
        if (empty($frequency_of[$requested])) {
            continue;
        }
        $words_w_freq[$requested] = $frequency_of[$requested];
    }

    // all-numeric keys confuse multisort, so they are preprocessed first
    prep_numeric_keys_for_multisort($words_w_freq);

    // primary sort column: frequency, high to low;
    // secondary sort column: the word itself, case-insensitively
    $by_frequency = array_values($words_w_freq);
    $by_word = array_map('strtolower', array_keys($words_w_freq));
    array_multisort($by_frequency, SORT_DESC, $by_word, SORT_ASC, $words_w_freq);

    return array($words_w_freq, $messages);
}

Example #4

0

Show file

File: show_project_possible_bad_words.php Project: cpeel/dproofreaders-shadow

/**
 * List the site's "possible bad words" that occur in the latest project text.
 *
 * Words already on the project's own bad word list are excluded. The result
 * is sorted by frequency (descending), then by lowercased word.
 *
 * @param mixed $projectid project identifier, passed through to the loaders
 *
 * @return array a 2-element list: (word => frequency), and a list of messages
 *               (always empty here)
 */
function _get_word_list($projectid)
{
    $messages = array();

    // frequency table for the text of every page as of the last possible round
    $final_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $page_result = page_info_query($projectid, $final_round->id, 'LE');
    $texts = get_page_texts($page_result);
    $frequency_of = get_distinct_words_in_text($texts);

    // candidate words come from the site lists for the project's languages,
    // minus anything the project already treats as a bad word
    $candidates = load_site_possible_bad_words_given_project($projectid);
    $project_bad_words = load_project_bad_words($projectid);
    $candidates = array_diff($candidates, $project_bad_words);

    // the candidate list has no frequency info, so pick the matching
    // entries out of the frequency table instead
    $candidate_lookup = array_flip($candidates);
    $bad_words_w_freq = array_intersect_key($frequency_of, $candidate_lookup);

    // all-numeric keys confuse multisort, so they are preprocessed first
    prep_numeric_keys_for_multisort($bad_words_w_freq);

    // primary sort column: frequency, high to low;
    // secondary sort column: the word itself, case-insensitively
    $by_frequency = array_values($bad_words_w_freq);
    $by_word = array_map('strtolower', array_keys($bad_words_w_freq));
    array_multisort($by_frequency, SORT_DESC, $by_word, SORT_ASC, $bad_words_w_freq);

    return array($bad_words_w_freq, $messages);
}

Example #5

0

Show file

File: show_project_wordcheck_stats.php Project: cpeel/dproofreaders-shadow

    // is this word in the site's Bad list?
    if (in_array($word, $site_bad_words)) {
        $total["site_bad_words"] += $freq;
    }
    // add total flagged words
    $total["flagged"] += $freq;
}
// Second pass of the WordCheck statistics: count flagged words per page.
// Relies on $total, $project, $projectid, $last_possible_round and
// $bad_words_w_freq being set up earlier in the script (not visible here).
$total["num_pages"] = $project->n_pages;
// now run it again except we're going to count the words per page
// this time through
$pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
// $page_stats[$page]["flagged"] holds the number of flagged word occurrences on that page
$page_stats = array();
// iterate through all the pages gathering stats
// ($proofer_names is fetched but not used in this loop)
while (list($page_text, $page, $proofer_names) = page_info_fetch($pages_res)) {
    // get the distinct words on this page with their frequencies
    $page_words_w_freq = get_distinct_words_in_text($page_text);
    $page_stats[$page]["flagged"] = 0;
    // cycle through the words and count things
    foreach ($page_words_w_freq as $word => $freq) {
        // is this word flagged? (i.e. is it a key of $bad_words_w_freq)
        if (isset($bad_words_w_freq[$word])) {
            // every occurrence of a flagged word counts
            $page_stats[$page]["flagged"] += $freq;
        }
    }
}
// NOTE(review): mysql_free_result() belongs to the ext/mysql extension, which
// was removed in PHP 7 -- confirm the target PHP version or the DB wrapper in use.
mysql_free_result($pages_res);
// starting values for the min/max flagged counts; presumably narrowed by
// code that follows this fragment -- TODO confirm
$total["flagged_min"] = 1000000;
$total["flagged_max"] = 0;
// presumably accumulators for the mode calculation and graph data filled in
// later in the script -- TODO confirm
$mode = array();
$graph_x = array();
$graph_y = array();

PHP get_distinct_words_in_text Examples