/**
 * Collect the good-word suggestions made for a project, together with how
 * often each suggested word occurs in the project text.
 *
 * Words already on the project's good or bad word list are dropped from the
 * results. Frequencies come from the latest text of every page up to the
 * last possible editing round.
 *
 * @param mixed $projectid  Project identifier, passed to the loader helpers.
 * @param mixed $timeCutoff Passed unchanged to
 *                          load_project_good_word_suggestions().
 *
 * @return array Seven-element list, in this order:
 *               0: all suggestions => frequency in the project text (sorted
 *                  by frequency descending, then by lower-cased word),
 *               1: all suggestions => result of generate_frequencies(),
 *               2: per-round map of suggestions => frequency (same sort),
 *               3: per-round map of suggestions => generate_frequencies(),
 *               4: list of the rounds that had suggestions,
 *               5: per-round count of pages that had suggestions,
 *               6: list of messages (errors/warnings) for the caller to show.
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();
    // load the suggestions; anything that is not an array is an error message
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }
    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }
    // load project good words
    // (the loaders hand back an error string instead of an array on failure;
    // report it and carry on with an empty list rather than feeding a string
    // to array_merge()/array_diff())
    $project_good_words = load_project_good_words($projectid);
    if (is_string($project_good_words)) {
        $messages[] = $project_good_words;
        $project_good_words = array();
    }
    // load project bad words
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }
    // words that must never be reported as suggestions; the same list is
    // used for every round, so build it once
    $words_to_exclude = array_merge($project_good_words, $project_bad_words);
    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));
    // array to hold all words
    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_occurrences = array();
    $round_suggestions_w_freq = array();
    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        foreach ($pageArray as $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);
            // add the words to the combined array too
            $all_suggestions = array_merge($all_suggestions, $words);
            // count the page; the entry is only created once a round
            // actually has a page
            if (!isset($round_page_count[$round])) {
                $round_page_count[$round] = 0;
            }
            $round_page_count[$round]++;
        }
        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $words_to_exclude);
        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);
        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key($all_words_w_freq, array_flip($round_suggestions));
        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);
        // sort the list by frequency, then by word
        array_multisort(array_values($round_suggestions_w_freq[$round]), SORT_DESC, array_map('strtolower', array_keys($round_suggestions_w_freq[$round])), SORT_ASC, $round_suggestions_w_freq[$round]);
    }
    // now, remove any words that are already on the project's good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $words_to_exclude);
    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);
    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a key in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key($all_words_w_freq, array_flip($all_suggestions));
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);
    // sort the list by frequency, then by word
    array_multisort(array_values($all_suggestions_w_freq), SORT_DESC, array_map('strtolower', array_keys($all_suggestions_w_freq)), SORT_ASC, $all_suggestions_w_freq);
    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);
    return array($all_suggestions_w_freq, $all_suggestions_w_occurrences, $round_suggestions_w_freq, $round_suggestions_w_occurrences, $rounds, $round_page_count, $messages);
}
/**
 * Find the site's "possible bad words" that occur in a project's text.
 *
 * Words already on the project's own bad word list are left out. The result
 * is sorted by frequency (descending) and then by lower-cased word.
 *
 * @param mixed $projectid Project identifier, passed to the loader helpers.
 *
 * @return array Two-element list:
 *               0: map of word => frequency in the project text,
 *               1: list of messages (errors/warnings) for the caller to show.
 */
function _get_word_list($projectid)
{
    $messages = array();
    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));
    // load site word lists for project languages
    $site_possible_bad_words = load_site_possible_bad_words_given_project($projectid);
    // load the project's bad word list; the loader hands back an error
    // string instead of an array on failure, so report it and continue with
    // an empty list rather than passing a string to array_diff()
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }
    // now, remove any words that are already on the project's bad word list
    $site_possible_bad_words = array_diff($site_possible_bad_words, $project_bad_words);
    // $site_possible_bad_words doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a word in
    // $site_possible_bad_words.
    $bad_words_w_freq = array_intersect_key($all_words_w_freq, array_flip($site_possible_bad_words));
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($bad_words_w_freq);
    // sort the list by frequency, then by word
    array_multisort(array_values($bad_words_w_freq), SORT_DESC, array_map('strtolower', array_keys($bad_words_w_freq)), SORT_ASC, $bad_words_w_freq);
    return array($bad_words_w_freq, $messages);
}
/**
 * Find possible scannos by diffing a project's OCR text against its latest
 * text with the external `wdiff` program.
 *
 * Every word-for-word replacement reported by wdiff is treated as a
 * candidate. Candidates that WordCheck would flag anyway, or that are
 * already on the site or project bad word lists, are dropped. For the
 * remaining words the function computes what percentage of their instances
 * were changed.
 *
 * Terminates the script via die() if wdiff cannot be run or reports an error.
 *
 * @param mixed $projectid Project identifier; also used to name the two
 *                         temporary files in $aspell_temp_dir.
 *
 * @return array Five-element list:
 *               0: word => percent of instances changed (string formatted
 *                  with two decimals), sorted by percentage descending and
 *                  then by lower-cased word,
 *               1: word => frequency in the latest project text,
 *               2: list of messages for the caller to show,
 *               3: OCR word => the (last seen) word it was changed to,
 *               4: OCR word => number of times it was changed.
 */
function _get_word_list($projectid)
{
    global $aspell_temp_dir;
    $ocr_filename = "{$aspell_temp_dir}/{$projectid}_ocr.txt";
    $latest_filename = "{$aspell_temp_dir}/{$projectid}_latest.txt";
    $messages = array();
    // get the OCR text
    // Note: If the code is changed to allow selecting of arbitrary round text
    //       instead of just the OCR round, edit_project_word_lists.php should
    //       be updated to allow this page to be accessed for those type-in
    //       projects with no OCR text.
    $pages_res = page_info_query($projectid, '[OCR]', 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($ocr_filename, $all_page_text);
    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_page_text = get_page_texts($pages_res);
    // remove any formatting tags and add a final \r\n to each page-text
    // to ensure that there is whitespace between pages so they don't run together
    $all_page_text = preg_replace(array('#<[/]?\\w+>#', '#$#'), array('', "\r\n"), $all_page_text);
    file_put_contents($latest_filename, $all_page_text);
    $all_words_w_freq = get_distinct_words_in_text($all_page_text);
    // clean up unused variables
    unset($all_page_text);
    // make external call to wdiff
    // (the file names are built from the project id and a configured
    // directory, so quote them for the shell instead of trusting them)
    $wdiff_output = array();
    $wdiff_command = "wdiff -3 " . escapeshellarg($ocr_filename) . " " . escapeshellarg($latest_filename);
    exec($wdiff_command, $wdiff_output, $return_code);
    // clean up the temporary files right away, so they are also removed
    // when we bail out because of a wdiff failure below
    if (is_file($ocr_filename)) {
        unlink($ocr_filename);
    }
    if (is_file($latest_filename)) {
        unlink($latest_filename);
    }
    // check to see if wdiff wasn't found to execute
    if ($return_code == 127) {
        die("Error invoking wdiff to do the diff analysis. Perhaps it is not installed.");
    }
    if ($return_code == 2) {
        die("Error reported from wdiff while attempting to do the diff analysis.");
    }
    // specify the separator between the wdiff segments
    $separator = '======================================================================';
    $possible_scannos_w_correction = array();
    $possible_scannos_w_count = array();
    // parse the incoming data one segment at a time
    // from the original datastream to conserve memory
    $lineIndex = 0;
    $totalLines = count($wdiff_output);
    while ($lineIndex < $totalLines) {
        // pull the next segment
        // (strictly less-than: index $totalLines is one past the last line)
        $segment = "";
        while ($lineIndex < $totalLines) {
            $line = $wdiff_output[$lineIndex];
            $lineIndex++;
            if ($line === $separator) {
                break;
            }
            $segment .= "{$line}\n";
        }
        // note that we're handling the case where two adjacent
        // words are updated
        $ocr_words = $latest_words = array();
        // pull out the original word(s)
        if (preg_match("/\\[-(.*?)-\\]/", $segment, $matches)) {
            $ocr_words = get_all_words_in_text($matches[1]);
        }
        // if we don't have any ocr_words (probably because
        // the correction spanned lines) then don't bother
        // continuing with this segment
        if (!count($ocr_words)) {
            continue;
        }
        // pull out the replacement(s)
        if (preg_match("/{\\+(.*?)\\+}/", $segment, $matches)) {
            $latest_words = get_all_words_in_text($matches[1]);
        }
        // if the number of words isn't the same between the two
        // bail since we don't handle that case yet
        $word_count = count($ocr_words);
        if ($word_count != count($latest_words)) {
            continue;
        }
        // process the words, handles multi-words strings
        for ($index = 0; $index < $word_count; $index++) {
            $ocr_word = (string) $ocr_words[$index];
            $latest_word = (string) $latest_words[$index];
            // if the words are the same or one of them empty, skip it.
            // Compare as strings: empty() would also discard the word "0",
            // and == would treat numeric-looking words such as "10" and
            // "1e1" as equal.
            if ($ocr_word === $latest_word || $ocr_word === '' || $latest_word === '') {
                continue;
            }
            $possible_scannos_w_correction[$ocr_word] = $latest_word;
            if (!isset($possible_scannos_w_count[$ocr_word])) {
                $possible_scannos_w_count[$ocr_word] = 0;
            }
            $possible_scannos_w_count[$ocr_word]++;
        }
    }
    // $wdiff_output can be very large
    // so unset it here to be nice for the rest of the function
    unset($wdiff_output);
    $possible_scannos = array_keys($possible_scannos_w_correction);
    // create a string of words to run through WordCheck
    $text_to_check = implode(" ", $possible_scannos);
    // run the list through WordCheck to see which it would flag
    // ($languages is not used here, but list() is positional)
    list($possible_scannos_via_wordcheck, $languages, $messages) = get_bad_words_for_text($text_to_check, $projectid, 'all', '', array(), 'FREQS');
    // load site words
    $site_bad_words = load_site_bad_words_given_project($projectid);
    // load the project bad words; the loader hands back an error string
    // instead of an array on failure, so report it and use an empty list
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }
    // remove words that WordCheck would flag
    $possible_scannos = array_diff($possible_scannos, array_keys($possible_scannos_via_wordcheck));
    // remove any scannos already on the site and project bad word lists
    $possible_scannos = array_diff($possible_scannos, $site_bad_words, $project_bad_words);
    // $possible_scannos doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a key in $possible_scannos
    $possible_scannos_w_freq = array_intersect_key($all_words_w_freq, array_flip($possible_scannos));
    $percent_changed = array();
    foreach ($possible_scannos as $word) {
        // number of times the word was changed away from
        $count = $possible_scannos_w_count[$word];
        // instances still present in the latest text (none if every
        // instance was changed)
        $remaining = isset($possible_scannos_w_freq[$word]) ? $possible_scannos_w_freq[$word] : 0;
        $totalInstances = $remaining + $count;
        $percent_changed[$word] = sprintf("%0.2f", $count / $totalInstances * 100);
        // a word that occurred exactly once and was changed is not reported
        if ($percent_changed[$word] >= 100 && $totalInstances == 1) {
            unset($percent_changed[$word]);
        }
    }
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($percent_changed);
    // sort the list by frequency, then by word
    array_multisort(array_values($percent_changed), SORT_DESC, array_map('strtolower', array_keys($percent_changed)), SORT_ASC, $percent_changed);
    return array($percent_changed, $possible_scannos_w_freq, $messages, $possible_scannos_w_correction, $possible_scannos_w_count);
}
Esempio n. 4
0
 /**
  * Persist this object's project settings to the `projects` table.
  *
  * If $this->projectid is set, the existing row is updated, an 'edit' event
  * is logged with the list of changed fields, and the project's MARC record
  * is updated. Otherwise a new project is created: a fresh id is generated,
  * a row is inserted, the project directory is made, the MARC record is
  * initialised (when one was supplied) and the good/bad word lists are
  * created (copied from the cloned project, if any).
  *
  * In both cases the Dublin Core file is (re)generated afterwards and, if
  * $this->posted is set, the project is transitioned to PROJ_SUBMIT_PG_POSTED.
  *
  * Returns nothing. On most failures the script is terminated via
  * die()/exit after printing a message.
  *
  * NOTE(review): every value is interpolated straight into the SQL text;
  * only some fields go through addslashes(). If any of the others can carry
  * user input, this is open to SQL injection -- confirm and consider
  * escaping/parameterising all of them.
  */
 function save_to_db()
 {
     global $projects_dir, $pguser;
     // An empty posted number is stored as SQL NULL rather than ''.
     $postednum_str = $this->postednum == "" ? "NULL" : "'{$this->postednum}'";
     // Call addslashes() on any members of $this that might contain
     // single-quotes/apostrophes (because they are unescaped, and
     // would otherwise break the query).
     // This SET fragment is shared by the UPDATE and the INSERT below.
     $common_project_settings = "\n            t_last_edit    = UNIX_TIMESTAMP(),\n            up_projectid   = '{$this->up_projectid}',\n            nameofwork     = '" . addslashes($this->nameofwork) . "',\n            authorsname    = '" . addslashes($this->authorsname) . "',\n            language       = '{$this->language}',\n            genre          = '{$this->genre}',\n            difficulty     = '{$this->difficulty_level}',\n            special_code   = '{$this->special_code}',\n            clearance      = '" . addslashes($this->clearance) . "',\n            comments       = '" . addslashes($this->comments) . "',\n            image_source   = '{$this->image_source}',\n            scannercredit  = '" . addslashes($this->scannercredit) . "',\n            checkedoutby   = '{$this->checkedoutby}',\n            postednum      = {$postednum_str},\n            image_preparer = '{$this->image_preparer}',\n            text_preparer  = '{$this->text_preparer}',\n            extra_credits  = '" . addslashes($this->extra_credits) . "',\n            deletion_reason= '" . addslashes($this->deletion_reason) . "'\n        ";
     // Optional "username = ...," fragment that sets the project manager.
     // NOTE(review): the literal '******' in the three assignments below
     // looks like a redacted placeholder, not a real value -- as written,
     // the project manager would be stored as the string '******'. Confirm
     // against the original source before relying on this.
     $pm_setter = '';
     if (user_is_a_sitemanager()) {
         // can change PM
         $pm_setter = " username = '******',";
     } else {
         if (isset($this->clone_projectid)) {
             // cloning a project. The PM should be the same as
             // that of the project being cloned, if the user
             // isn't an SA
             $res = mysql_query("\n                SELECT username\n                FROM projects\n                WHERE projectid='{$this->clone_projectid}'\n            ") or die(mysql_error());
             // NOTE(review): $projectmanager is fetched here but is not
             // used by the assignment that follows.
             list($projectmanager) = mysql_fetch_row($res);
             $pm_setter = " username = '******',";
         }
     }
     if (isset($this->projectid)) {
         // We are updating an already-existing project.
         // needn't change $pm_setter, as there is no change if the user
         // isn't an SA
         // find out what we are changing from
         $old_pih = new ProjectInfoHolder();
         $fatal_error = $old_pih->set_from_db(TRUE, $this->projectid);
         if ($fatal_error != '') {
             $fatal_error = _('site error') . ': ' . $fatal_error;
             echo "<br><center><font size='+1' color='#ff0000'><b>{$fatal_error}</b></font></center>";
             exit;
         }
         $changed_fields = get_changed_fields($this, $old_pih);
         // We're particularly interested in knowing
         // when the project comments change.
         if (!in_array('comments', $changed_fields)) {
             // no change
             $tlcc_setter = '';
         } else {
             // changed!
             $tlcc_setter = 't_last_change_comments = UNIX_TIMESTAMP(),';
         }
         // We also want to know if the edit is resulting in the project
         // effectively being checked out to a new PPer
         if ($old_pih->state == PROJ_POST_FIRST_CHECKED_OUT && in_array('checkedoutby', $changed_fields)) {
             $md_setter = 'modifieddate = UNIX_TIMESTAMP(),';
             $PPer_checkout = TRUE;
         } else {
             $md_setter = '';
             $PPer_checkout = FALSE;
         }
         // Update the projects database with the updated info
         mysql_query("\n                UPDATE projects SET\n                    {$pm_setter}\n                    {$tlcc_setter}\n                    {$md_setter}\n                    {$common_project_settings}\n                WHERE projectid='{$this->projectid}'\n            ") or die(mysql_error());
         // Record which fields changed as a space-separated list.
         $details1 = implode(' ', $changed_fields);
         if ($details1 == '') {
             // There are no changed fields.
             // Don't just save '' for the details1 column,
             // because then do_history() won't be able to distinguish
             // this case (no changed fields) from old cases
             // (edit occurred before we started recording changed fields).
             // Instead, use a special value.
             $details1 = 'NONE';
         }
         // log_project_event() returns a non-empty value on failure.
         $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'edit', $details1);
         if (!empty($e)) {
             die($e);
         }
         if ($PPer_checkout) {
             // we fake the project transition...
             // (logged as a transition from the checked-out state to the
             // same state, with the new checkedoutby value as the detail)
             $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'transition', PROJ_POST_FIRST_CHECKED_OUT, PROJ_POST_FIRST_CHECKED_OUT, $this->checkedoutby);
             if (!empty($e)) {
                 die($e);
             }
         }
         // Update the MARC record with any info we've received.
         $project = new Project($this->projectid);
         $marc_record = $project->load_marc_record();
         $this->update_marc_record_from_post($marc_record);
         $project->save_marc_record($marc_record);
     } else {
         // We are creating a new project
         $this->projectid = uniqid("projectID");
         // The project ID
         if ('' == $pm_setter) {
             $pm_setter = "username = '******',";
         }
         // Insert a new row into the projects table
         mysql_query("\n                INSERT INTO projects\n                SET\n                    projectid    = '{$this->projectid}',\n                    {$pm_setter}\n                    state        = '" . PROJ_NEW . "',\n                    modifieddate = UNIX_TIMESTAMP(),\n                    t_last_change_comments = UNIX_TIMESTAMP(),\n                    {$common_project_settings}\n            ") or die(mysql_error());
         $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'creation');
         if (!empty($e)) {
             die($e);
         }
         $e = project_allow_pages($this->projectid);
         if (!empty($e)) {
             die($e);
         }
         // Make a directory in the projects_dir for this project
         // NOTE(review): the directory is created world-writable (0777);
         // the explicit chmod() presumably works around the umask.
         mkdir("{$projects_dir}/{$this->projectid}", 0777) or die("System error: unable to mkdir '{$projects_dir}/{$this->projectid}'");
         chmod("{$projects_dir}/{$this->projectid}", 0777);
         // Do MARC record manipulations
         $project = new Project($this->projectid);
         $marc_record = new MARCRecord();
         // Save original MARC record, if provided
         // NOTE(review): unserialize() on data that may originate from the
         // request is unsafe (object injection) -- confirm where
         // original_marc_array_encd comes from.
         $yaz_array = unserialize(base64_decode($this->original_marc_array_encd));
         // When no array could be decoded, nothing is saved here and the
         // empty MARCRecord created above is what reaches
         // create_dc_xml_oai() further down.
         if ($yaz_array !== FALSE) {
             $marc_record->load_yaz_array($yaz_array);
             $project->init_marc_record($marc_record);
             // Update the MARC record with data from POST
             $this->update_marc_record_from_post($marc_record);
             $project->save_marc_record($marc_record);
         }
         // Create the project's 'good word list' and 'bad word list'.
         if (isset($this->clone_projectid)) {
             // We're creating a project via cloning.
             // Copy the original project's word-lists.
             $good_words = load_project_good_words($this->clone_projectid);
             if (is_string($good_words)) {
                 // It's an error message.
                 echo "{$good_words}<br>\n";
                 $good_words = array();
             }
             $bad_words = load_project_bad_words($this->clone_projectid);
             if (is_string($bad_words)) {
                 // It's an error message.
                 echo "{$bad_words}<br>\n";
                 $bad_words = array();
             }
         } else {
             // We're creating a project by means other than cloning
             // (from_nothing, from_marc_record, from_uberproject).
             // Initialize its GWL and BWL to empty.
             $good_words = array();
             $bad_words = array();
         }
         save_project_good_words($this->projectid, $good_words);
         save_project_bad_words($this->projectid, $bad_words);
     }
     // Create/update the Dublin Core file for the project.
     // When we get here, the project's database entry has been fully
     // updated, so we can create a Project object and allow it
     // to pull the relevant fields from the database.
     $project = new Project($this->projectid);
     $project->create_dc_xml_oai($marc_record);
     // If the project has been posted to PG, make the appropriate transition.
     if ($this->posted) {
         // project_transition() returns a non-empty error string on failure.
         $err = project_transition($this->projectid, PROJ_SUBMIT_PG_POSTED, $pguser);
         if ($err != '') {
             echo "{$err}<br>\n";
             exit;
         }
     }
 }
// Page fragment: renders the supporting text and controls for a word
// frequency listing built from $bad_words_w_freq, and prepares the per-word
// columns (context links, notes, checkboxes).
// The variables $freqCutoff, $bad_words_w_freq, $instances, $page_text,
// $format, $projectid, $messages and (optionally) $update_status are
// expected to be set before this point; they are not defined in this
// fragment.
// The statements below emit output, so their order matters.

// frequency thresholds offered to the user
$cutoffOptions = array(1, 2, 3, 4, 5, 10, 25, 50);
// what is the initial cutoff frequency?
$initialFreq = getInitialCutoff($freqCutoff, $cutoffOptions, $bad_words_w_freq);
// echo page support text, like JS and stylesheets
echo_cutoff_script($cutoffOptions, $instances);
echo_word_freq_style();
echo "<p>{$page_text}</p>";
// NOTE(review): the list type passed here is "good" although the rest of
// this fragment works with bad-word data -- confirm this is intended.
echo_page_instruction_text("good", $format);
echo_download_text($projectid, $format);
echo_any_warnings_errors($messages);
echo_cutoff_text($initialFreq, $cutoffOptions);
// per-word column: links for showing each word in context
$context_array = build_context_array_links($bad_words_w_freq, $projectid);
// load site bad words
$site_bad_words = load_site_bad_words_given_project($projectid);
// load project bad words
$project_bad_words = load_project_bad_words($projectid);
// per-word column: a note saying which bad word list a word is already on.
// The project loop runs second, so a word on both lists ends up with the
// project note.
$word_notes = array();
foreach ($site_bad_words as $word) {
    $word_notes[$word] = _("On site BWL");
}
foreach ($project_bad_words as $word) {
    $word_notes[$word] = _("On project BWL");
}
// the "[[TITLE]]" entry presumably carries each column's heading --
// TODO confirm against the code that consumes these arrays
$context_array["[[TITLE]]"] = _("Show Context");
$word_notes["[[TITLE]]"] = _("Notes");
// report the outcome of an earlier update, unless it succeeded
if (isset($update_status) && $update_status !== "Success") {
    echo_any_warnings_errors(array($update_status));
}
// per-word column: selection checkboxes
$word_checkbox = build_checkbox_array($bad_words_w_freq);
echo_checkbox_selects(count($bad_words_w_freq));
$checkbox_form["projectid"] = $projectid;
/**
 * Carry out the requested action for the cross-project word list report.
 *
 * "show" prints (inside a <pre> block) how many of the projects matching
 * the given language have each word on their good or bad word list, for
 * every word whose share reaches the cutoff percentage. "list" prints
 * nothing and only tells the caller to display the list.
 *
 * Terminates the script via die() on an unknown action, language match
 * type or list type, or if the database query fails.
 *
 * @param string $action     "show" or "list".
 * @param string $list_type  "good" or "bad" (only used for "show").
 * @param string $language   Language to match projects against.
 * @param mixed  $cutoff     Minimum percentage a word needs to be printed.
 * @param string $lang_match "exact", "primary" (prefix match) or "any"
 *                           (substring match).
 *
 * @return bool TRUE if the caller should display the list, FALSE otherwise.
 */
function _handle_action($action, $list_type, $language, $cutoff, $lang_match)
{
    $display_list = FALSE;
    switch ($action) {
        case "show":
            $word_freq = array();
            $total_projects = 0;
            $total_projects_with_words = 0;
            // figure out what kind of language matching we're going to use
            // (sprintf templates, so '%%' is a literal SQL '%' wildcard)
            switch ($lang_match) {
                case "exact":
                    $where_template = "language = '%s'";
                    break;
                case "primary":
                    $where_template = "language like '%s%%'";
                    break;
                case "any":
                    $where_template = "language like '%%%s%%'";
                    break;
                default:
                    die("Unknown language match used: {$lang_match}");
            }
            // validate the list type up front instead of on the first
            // fetched row, so a bad value is rejected even when no project
            // matches
            if ($list_type == "good") {
                $load_words = 'load_project_good_words';
            } elseif ($list_type == "bad") {
                $load_words = 'load_project_bad_words';
            } else {
                die("Unknown list type: {$list_type}");
            }
            // $language ends up inside a quoted SQL string, so escape it
            $where_clause = sprintf($where_template, mysql_real_escape_string($language));
            // loop through all projects that use $language
            $res = mysql_query("\n                SELECT projectid\n                FROM projects\n                WHERE {$where_clause}\n            ") or die(mysql_error());
            while (list($projectid) = mysql_fetch_row($res)) {
                $words = $load_words($projectid);
                if (is_string($words)) {
                    // It's an error message, not a word list: report it
                    // and count the project as having no words.
                    echo "{$words}<br>\n";
                    $words = array();
                }
                foreach ($words as $word) {
                    if (!isset($word_freq[$word])) {
                        $word_freq[$word] = 0;
                    }
                    $word_freq[$word]++;
                }
                if (count($words)) {
                    $total_projects_with_words++;
                }
                $total_projects++;
            }
            mysql_free_result($res);
            // sort the results, most frequent first
            arsort($word_freq);
            // show the results
            // NOTE(review): $language and the words are echoed without HTML
            // escaping; if they can contain markup this should be escaped
            // with the page's character set in mind.
            echo "<pre>";
            echo _("Language") . ": {$language}<br>";
            echo sprintf(_("Language match type: %s"), $lang_match) . "<br>";
            echo sprintf(_("Word list type: %s"), $list_type) . "<br>";
            echo sprintf(_("Cutoff percentage: %d%%"), $cutoff) . "<br>";
            echo sprintf(_("Total projects matching language: %d"), $total_projects) . "<br>";
            echo sprintf(_("Total projects with word lists: %d"), $total_projects_with_words) . "<br>";
            echo "<br>";
            echo _("Note: Percentages are calculated as frequency over the total number of projects with word lists.") . "<br>";
            echo "<br>";
            echo sprintf("%20s  %5s  %s<br>", _("Word"), _("Count"), _("Frequency"));
            // $word_freq is only non-empty when at least one project had
            // words, so the divisor below is never zero
            foreach ($word_freq as $word => $freq) {
                $percentage = $freq / $total_projects_with_words * 100;
                // the list is sorted descending, so every later word is
                // below the cutoff as well
                if ($percentage < $cutoff) {
                    break;
                }
                echo sprintf("%20s  %5d  (%-3.2f%%)<br>", $word, $freq, $percentage);
            }
            echo "</pre>";
            break;
        case "list":
            $display_list = TRUE;
            break;
        default:
            die("Invalid action encountered.");
    }
    return $display_list;
}
Esempio n. 7
0
/**
 * Append one project's WordCheck data to another project's.
 *
 * The good word list, the bad word list and the raw good-word suggestions
 * file of $from_id are each appended to the corresponding data of $to_id.
 * The source project's data is left as it is.
 *
 * @param mixed $from_id Project whose word data is copied.
 * @param mixed $to_id   Project that receives the combined word data.
 */
function merge_wordcheck_files($from_id, $to_id)
{
    global $projects_dir;

    // good words: the target's existing words first, then the incoming ones
    $incoming_good = load_project_good_words($from_id);
    $existing_good = load_project_good_words($to_id);
    save_project_good_words($to_id, array_merge($existing_good, $incoming_good));

    // bad words: same treatment
    $incoming_bad = load_project_bad_words($from_id);
    $existing_bad = load_project_bad_words($to_id);
    save_project_bad_words($to_id, array_merge($existing_bad, $incoming_bad));

    // suggestions: the file format is complicated and may change, so the
    // two files are simply concatenated byte for byte.
    // A missing file is treated the same as an empty one.
    $source_path = "{$projects_dir}/{$from_id}/good_word_suggestions.txt";
    $incoming_suggestions = is_file($source_path) ? file_get_contents($source_path) : "";

    $target_path = "{$projects_dir}/{$to_id}/good_word_suggestions.txt";
    $existing_suggestions = is_file($target_path) ? file_get_contents($target_path) : "";

    // No file locking: the projects are assumed to be in an unavailable or
    // waiting state, so nothing else should be writing these files.
    file_put_contents($target_path, $existing_suggestions . $incoming_suggestions);
}
 /**
  * Fill this object's word-list fields from the project's word list files.
  *
  * For each requested list, the file's modification time is stored in
  * gwl_timestamp / bwl_timestamp and the words are stored, one per line,
  * in good_words / bad_words. If a list cannot be loaded, the field is set
  * to '' and the loader's error message is collected.
  *
  * @param bool $load_good_words Whether to load the good word list.
  * @param bool $load_bad_words  Whether to load the bad word list.
  *
  * @return array Error messages from the loaders; empty if all went well.
  */
 function set_from_files($load_good_words = true, $load_bad_words = true)
 {
     $problems = array();

     if ($load_good_words) {
         $good_file = get_project_word_file($this->projectid, "good");
         $this->gwl_timestamp = $good_file->mod_time;

         // a string result is an error message, anything else the word list
         $loaded_good = load_project_good_words($this->projectid);
         if (!is_string($loaded_good)) {
             $this->good_words = implode("\n", $loaded_good);
         } else {
             $problems[] = $loaded_good;
             $this->good_words = '';
         }
     }

     if ($load_bad_words) {
         $bad_file = get_project_word_file($this->projectid, "bad");
         $this->bwl_timestamp = $bad_file->mod_time;

         // a string result is an error message, anything else the word list
         $loaded_bad = load_project_bad_words($this->projectid);
         if (!is_string($loaded_bad)) {
             $this->bad_words = implode("\n", $loaded_bad);
         } else {
             $problems[] = $loaded_bad;
             $this->bad_words = '';
         }
     }

     return $problems;
 }