/**
 * Gather the good-word suggestions made for a project and pair each
 * suggested word with its frequency in the project text.
 *
 * Words that are already on the project's good or bad word list are dropped
 * from every returned list.
 *
 * @param $projectid  project whose suggestions and word lists are loaded
 * @param $timeCutoff passed through unchanged to
 *                    load_project_good_word_suggestions()
 *
 * @return array a 7-element list, in this order:
 *   0. all suggested words => frequency in the project text,
 *      sorted by frequency (descending), then by lower-cased word
 *   1. all suggested words => result of generate_frequencies() over the
 *      suggestions (number of suggestion occurrences)
 *   2. round => (word => frequency in the project text), sorted like 0
 *   3. round => generate_frequencies() result for that round
 *   4. the rounds that had suggestions (keys of 2)
 *   5. round => number of pages that carried suggestions
 *   6. warning/error messages collected along the way
 *   When the suggestions cannot be loaded, or there are none, elements 0-5
 *   are empty arrays.
 */
function _get_word_list($projectid, $timeCutoff)
{
    $messages = array();

    // load the suggestions
    $suggestions = load_project_good_word_suggestions($projectid, $timeCutoff);
    if (!is_array($suggestions)) {
        $messages[] = sprintf(_("Unable to load suggestions: %s"), $suggestions);
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }
    if (count($suggestions) == 0) {
        return array(array(), array(), array(), array(), array(), array(), $messages);
    }

    // Load the project's good and bad words. The loaders hand back a string
    // (an error message) instead of an array on failure; feeding that to
    // array_merge() would break the filtering below, so report it and carry
    // on with an empty list.
    $project_good_words = load_project_good_words($projectid);
    if (is_string($project_good_words)) {
        $messages[] = $project_good_words;
        $project_good_words = array();
    }
    $project_bad_words = load_project_bad_words($projectid);
    if (is_string($project_bad_words)) {
        $messages[] = $project_bad_words;
        $project_bad_words = array();
    }
    // words that must never be reported as suggestions (built once, used
    // for every round and for the combined list)
    $known_words = array_merge($project_good_words, $project_bad_words);

    // get the latest project text of all pages up to last possible round
    $last_possible_round = get_Round_for_round_number(MAX_NUM_PAGE_EDITING_ROUNDS);
    $pages_res = page_info_query($projectid, $last_possible_round->id, 'LE');
    $all_words_w_freq = get_distinct_words_in_text(get_page_texts($pages_res));

    $all_suggestions = array();
    $round_page_count = array();
    $round_suggestions_w_occurrences = array();
    $round_suggestions_w_freq = array();

    // parse the suggestions complex array
    // it is in the format: $suggestions[$round][$pagenum]=$wordsArray
    foreach ($suggestions as $round => $pageArray) {
        $round_suggestions = array();
        foreach ($pageArray as $page => $words) {
            // add the words to the per-round array
            $round_suggestions = array_merge($round_suggestions, $words);
            // count the page; the key is only created for rounds that
            // actually have at least one page
            if (!isset($round_page_count[$round])) {
                $round_page_count[$round] = 0;
            }
            $round_page_count[$round]++;
        }
        // add the (still unfiltered) round words to the combined array too
        $all_suggestions = array_merge($all_suggestions, $round_suggestions);

        // remove any words already on the project's good or bad words lists
        $round_suggestions = array_diff($round_suggestions, $known_words);
        // get the suggestion occurrences
        $round_suggestions_w_occurrences[$round] = generate_frequencies($round_suggestions);
        // get suggestion with project word frequency
        $round_suggestions_w_freq[$round] = array_intersect_key($all_words_w_freq, array_flip($round_suggestions));
        // multisort screws up all-numeric words so we need to preprocess first
        prep_numeric_keys_for_multisort($round_suggestions_w_freq[$round]);
        // sort the list by frequency, then by word
        array_multisort(
            array_values($round_suggestions_w_freq[$round]), SORT_DESC,
            array_map('strtolower', array_keys($round_suggestions_w_freq[$round])), SORT_ASC,
            $round_suggestions_w_freq[$round]
        );
    }

    // now, remove any words that are already on the project's good or bad words lists
    $all_suggestions = array_diff($all_suggestions, $known_words);
    // get the number of suggestion occurrences
    $all_suggestions_w_occurrences = generate_frequencies($all_suggestions);
    // $all_suggestions doesn't have frequency info,
    // so start with the info in $all_words_w_freq,
    // and extract the items where the key matches a key in $all_suggestions.
    $all_suggestions_w_freq = array_intersect_key($all_words_w_freq, array_flip($all_suggestions));
    // multisort screws up all-numeric words so we need to preprocess first
    prep_numeric_keys_for_multisort($all_suggestions_w_freq);
    // sort the list by frequency, then by word
    array_multisort(
        array_values($all_suggestions_w_freq), SORT_DESC,
        array_map('strtolower', array_keys($all_suggestions_w_freq)), SORT_ASC,
        $all_suggestions_w_freq
    );

    // get a list of all rounds
    $rounds = array_keys($round_suggestions_w_freq);

    return array(
        $all_suggestions_w_freq,
        $all_suggestions_w_occurrences,
        $round_suggestions_w_freq,
        $round_suggestions_w_occurrences,
        $rounds,
        $round_page_count,
        $messages,
    );
}
// Page-output fragment: prints the cutoff controls and introductory text,
// then prepares the per-word column arrays (checkbox, context link, note,
// total instances) and labels some of the columns.
// NOTE(review): $percent_changed, $page_text, $format, $messages, $projectid,
// $instances_left, $instances_changed and $instances_changed_to are read
// below but never assigned in this fragment -- they must be set up earlier.
// $instances is handed to echo_cutoff_script() below.
$instances = 1;
// what are the cutoff options?
$cutoffOptions = array(0, 10, 20, 30, 40, 50, 60, 70, 80, 90);
// what is the initial cutoff frequency?
// (presumably 50 is the preferred starting value and getInitialCutoff()
// picks one of $cutoffOptions based on $percent_changed -- TODO confirm)
$initialFreq = getInitialCutoff(50, $cutoffOptions, $percent_changed);
// echo page support text, like JS and stylesheets
echo_cutoff_script($cutoffOptions, $instances);
echo_word_freq_style();
// NOTE(review): $page_text is emitted as-is, without HTML escaping.
echo "<p>{$page_text}</p>";
echo_page_instruction_text("bad", $format);
echo_any_warnings_errors($messages);
echo_download_text($projectid, $format);
// output customized cutoff text
$cutoff_text = sprintf(_("Words with fewer than <b><span id='current_cutoff'>%d</span>%%</b> of the instances changed are not shown. Other cutoff options are available: %s"), $initialFreq, get_cutoff_string($cutoffOptions, "%"));
echo "<p>{$cutoff_text}</p>\n";
// NOTE(review): other callers of load_project_good_words() test the result
// with is_string() because it can be an error message; that is not checked
// here, and in_array() below expects an array -- confirm.
$project_good_words = load_project_good_words($projectid);
$word_checkbox = build_checkbox_array($percent_changed);
$context_array = build_context_array_links($instances_left, $projectid);
// build the word_note and the instances_total arrays
$word_notes = array();
$instances_total = array();
foreach ($instances_left as $word => $freq) {
    // flag words that are already on the project's good word list
    if (in_array($word, $project_good_words)) {
        $word_notes[$word] = _("On project GWL");
    }
    // total = changed + remaining instances of the word
    // NOTE(review): assumes $instances_changed has an entry for every key
    // of $instances_left -- a missing key would raise a notice here.
    $instances_total[$word] = $instances_changed[$word] + $instances_left[$word];
}
// The "[[TITLE]]" / "[[STYLE]]" keys are stored alongside the per-word
// entries; presumably the table printer treats them as the column heading
// and CSS style rather than as words -- TODO confirm against the consumer.
$word_notes["[[TITLE]]"] = _("Notes");
$percent_changed["[[TITLE]]"] = _("% Changed");
$percent_changed["[[STYLE]]"] = "text-align: right;";
$instances_changed_to["[[TITLE]]"] = _("Last changed to");
예제 #3
0
 /**
  * Persist this object's project settings to the `projects` table and
  * carry out the accompanying bookkeeping.
  *
  * If $this->projectid is already set, the existing row is UPDATEd, an
  * 'edit' event is logged and the stored MARC record is refreshed.
  * Otherwise a new project id is generated with uniqid(), a row is
  * INSERTed in state PROJ_NEW, a 'creation' event is logged, the project
  * directory is created under $projects_dir, the MARC record is
  * initialised (when an original one was supplied) and the good/bad word
  * lists are created (copied from $this->clone_projectid when cloning).
  * In both cases the Dublin Core file is (re)generated afterwards and, if
  * $this->posted is truthy, the project is transitioned to
  * PROJ_SUBMIT_PG_POSTED.
  *
  * Uses the globals $projects_dir and $pguser. Returns nothing; failures
  * terminate the script via die()/exit after printing a message.
  *
  * NOTE(review): only some members are passed through addslashes(); the
  * others (up_projectid, language, genre, difficulty_level, special_code,
  * image_source, checkedoutby, postednum, image_preparer, text_preparer,
  * projectid, clone_projectid) are interpolated into SQL as-is --
  * presumably they are validated before this method runs; confirm.
  */
 function save_to_db()
 {
     global $projects_dir, $pguser;
     // An empty postednum is stored as SQL NULL, anything else as a quoted value.
     $postednum_str = $this->postednum == "" ? "NULL" : "'{$this->postednum}'";
     // Call addslashes() on any members of $this that might contain
     // single-quotes/apostrophes (because they are unescaped, and
     // would otherwise break the query).
     // This SET-list fragment is shared by the UPDATE and the INSERT below.
     $common_project_settings = "\n            t_last_edit    = UNIX_TIMESTAMP(),\n            up_projectid   = '{$this->up_projectid}',\n            nameofwork     = '" . addslashes($this->nameofwork) . "',\n            authorsname    = '" . addslashes($this->authorsname) . "',\n            language       = '{$this->language}',\n            genre          = '{$this->genre}',\n            difficulty     = '{$this->difficulty_level}',\n            special_code   = '{$this->special_code}',\n            clearance      = '" . addslashes($this->clearance) . "',\n            comments       = '" . addslashes($this->comments) . "',\n            image_source   = '{$this->image_source}',\n            scannercredit  = '" . addslashes($this->scannercredit) . "',\n            checkedoutby   = '{$this->checkedoutby}',\n            postednum      = {$postednum_str},\n            image_preparer = '{$this->image_preparer}',\n            text_preparer  = '{$this->text_preparer}',\n            extra_credits  = '" . addslashes($this->extra_credits) . "',\n            deletion_reason= '" . addslashes($this->deletion_reason) . "'\n        ";
     // $pm_setter is the optional "username = ...," fragment that assigns
     // the project manager; it stays empty when the PM must not change.
     $pm_setter = '';
     if (user_is_a_sitemanager()) {
         // can change PM
         // NOTE(review): the value written is the literal '******', which
         // looks like a redacted placeholder rather than a real user name --
         // confirm what was meant to be interpolated here.
         $pm_setter = " username = '******',";
     } else {
         if (isset($this->clone_projectid)) {
             // cloning a project. The PM should be the same as
             // that of the project being cloned, if the user
             // isn't an SA
             $res = mysql_query("\n                SELECT username\n                FROM projects\n                WHERE projectid='{$this->clone_projectid}'\n            ") or die(mysql_error());
             list($projectmanager) = mysql_fetch_row($res);
             // NOTE(review): $projectmanager is fetched above but never used;
             // the literal '******' is written instead -- presumably the
             // fetched name was intended. Confirm.
             $pm_setter = " username = '******',";
         }
     }
     if (isset($this->projectid)) {
         // We are updating an already-existing project.
         // needn't change $pm_setter, as there is no change if the user
         // isn't an SA
         // find out what we are changing from
         $old_pih = new ProjectInfoHolder();
         $fatal_error = $old_pih->set_from_db(TRUE, $this->projectid);
         if ($fatal_error != '') {
             $fatal_error = _('site error') . ': ' . $fatal_error;
             echo "<br><center><font size='+1' color='#ff0000'><b>{$fatal_error}</b></font></center>";
             exit;
         }
         $changed_fields = get_changed_fields($this, $old_pih);
         // We're particularly interested in knowing
         // when the project comments change.
         if (!in_array('comments', $changed_fields)) {
             // no change
             $tlcc_setter = '';
         } else {
             // changed!
             $tlcc_setter = 't_last_change_comments = UNIX_TIMESTAMP(),';
         }
         // We also want to know if the edit is resulting in the project
         // effectively being checked out to a new PPer
         if ($old_pih->state == PROJ_POST_FIRST_CHECKED_OUT && in_array('checkedoutby', $changed_fields)) {
             $md_setter = 'modifieddate = UNIX_TIMESTAMP(),';
             $PPer_checkout = TRUE;
         } else {
             $md_setter = '';
             $PPer_checkout = FALSE;
         }
         // Update the projects database with the updated info
         mysql_query("\n                UPDATE projects SET\n                    {$pm_setter}\n                    {$tlcc_setter}\n                    {$md_setter}\n                    {$common_project_settings}\n                WHERE projectid='{$this->projectid}'\n            ") or die(mysql_error());
         // The space-separated list of changed field names is stored with
         // the 'edit' event.
         $details1 = implode(' ', $changed_fields);
         if ($details1 == '') {
             // There are no changed fields.
             // Don't just save '' for the details1 column,
             // because then do_history() won't be able to distinguish
             // this case (no changed fields) from old cases
             // (edit occurred before we started recording changed fields).
             // Instead, use a special value.
             $details1 = 'NONE';
         }
         // log_project_event() returns a non-empty value on failure.
         $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'edit', $details1);
         if (!empty($e)) {
             die($e);
         }
         if ($PPer_checkout) {
             // we fake the project transition...
             // (same state on both sides, with the new checkedoutby attached)
             $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'transition', PROJ_POST_FIRST_CHECKED_OUT, PROJ_POST_FIRST_CHECKED_OUT, $this->checkedoutby);
             if (!empty($e)) {
                 die($e);
             }
         }
         // Update the MARC record with any info we've received.
         $project = new Project($this->projectid);
         $marc_record = $project->load_marc_record();
         $this->update_marc_record_from_post($marc_record);
         $project->save_marc_record($marc_record);
     } else {
         // We are creating a new project
         $this->projectid = uniqid("projectID");
         // The project ID
         if ('' == $pm_setter) {
             // Neither a site manager nor a clone: a PM still has to be set.
             // NOTE(review): again the literal '******' is written --
             // presumably the current user ($pguser) was intended. Confirm.
             $pm_setter = "username = '******',";
         }
         // Insert a new row into the projects table
         mysql_query("\n                INSERT INTO projects\n                SET\n                    projectid    = '{$this->projectid}',\n                    {$pm_setter}\n                    state        = '" . PROJ_NEW . "',\n                    modifieddate = UNIX_TIMESTAMP(),\n                    t_last_change_comments = UNIX_TIMESTAMP(),\n                    {$common_project_settings}\n            ") or die(mysql_error());
         $e = log_project_event($this->projectid, $GLOBALS['pguser'], 'creation');
         if (!empty($e)) {
             die($e);
         }
         $e = project_allow_pages($this->projectid);
         if (!empty($e)) {
             die($e);
         }
         // Make a directory in the projects_dir for this project
         // NOTE(review): the directory is made world-writable (0777); the
         // explicit chmod() presumably defeats the process umask -- confirm
         // that this permission level is really required.
         mkdir("{$projects_dir}/{$this->projectid}", 0777) or die("System error: unable to mkdir '{$projects_dir}/{$this->projectid}'");
         chmod("{$projects_dir}/{$this->projectid}", 0777);
         // Do MARC record manipulations
         $project = new Project($this->projectid);
         $marc_record = new MARCRecord();
         // Save original MARC record, if provided
         // NOTE(review): unserialize() is applied to
         // $this->original_marc_array_encd; if that value can come from the
         // request, this is unsafe deserialisation of untrusted data.
         $yaz_array = unserialize(base64_decode($this->original_marc_array_encd));
         // unserialize() yields FALSE when the data cannot be decoded; in
         // that case the (empty) $marc_record is neither initialised nor saved.
         if ($yaz_array !== FALSE) {
             $marc_record->load_yaz_array($yaz_array);
             $project->init_marc_record($marc_record);
             // Update the MARC record with data from POST
             $this->update_marc_record_from_post($marc_record);
             $project->save_marc_record($marc_record);
         }
         // Create the project's 'good word list' and 'bad word list'.
         if (isset($this->clone_projectid)) {
             // We're creating a project via cloning.
             // Copy the original project's word-lists.
             $good_words = load_project_good_words($this->clone_projectid);
             if (is_string($good_words)) {
                 // It's an error message.
                 echo "{$good_words}<br>\n";
                 $good_words = array();
             }
             $bad_words = load_project_bad_words($this->clone_projectid);
             if (is_string($bad_words)) {
                 // It's an error message.
                 echo "{$bad_words}<br>\n";
                 $bad_words = array();
             }
         } else {
             // We're creating a project by means other than cloning
             // (from_nothing, from_marc_record, from_uberproject).
             // Initialize its GWL and BWL to empty.
             $good_words = array();
             $bad_words = array();
         }
         save_project_good_words($this->projectid, $good_words);
         save_project_bad_words($this->projectid, $bad_words);
     }
     // Create/update the Dublin Core file for the project.
     // When we get here, the project's database entry has been fully
     // updated, so we can create a Project object and allow it
     // to pull the relevant fields from the database.
     // ($marc_record was assigned in whichever branch ran above.)
     $project = new Project($this->projectid);
     $project->create_dc_xml_oai($marc_record);
     // If the project has been posted to PG, make the appropriate transition.
     if ($this->posted) {
         $err = project_transition($this->projectid, PROJ_SUBMIT_PG_POSTED, $pguser);
         if ($err != '') {
             echo "{$err}<br>\n";
             exit;
         }
     }
 }
/**
 * Carry out the requested page action.
 *
 * "show" prints, inside a <pre> block, how many of the projects matching
 * $language carry each word on their good or bad word list, most frequent
 * first, stopping at the first word whose share of the projects that have
 * a word list falls below $cutoff percent.
 * "list" prints nothing and only asks the caller to display the list.
 * Any other action terminates the script.
 *
 * @param $action     "show" or "list"
 * @param $list_type  "good" or "bad"; selects which word list is tallied
 * @param $language   language to match against projects.language
 * @param $cutoff     minimum percentage a word needs to be printed
 * @param $lang_match "exact" (language = X), "primary" (language like 'X%')
 *                    or "any" (language like '%X%')
 *
 * @return bool TRUE when the caller should display the list ("list"),
 *              FALSE otherwise
 */
function _handle_action($action, $list_type, $language, $cutoff, $lang_match)
{
    $display_list = FALSE;
    switch ($action) {
        case "show":
            $word_freq = array();
            $total_projects = 0;
            $total_projects_with_words = 0;

            // $language ends up inside a quoted SQL string, so it has to be
            // escaped. For the LIKE forms, % and _ are escaped as well so
            // that they match literally instead of acting as wildcards.
            $sql_language = mysql_real_escape_string($language);
            $like_language = addcslashes($sql_language, '%_');

            // figure out what kind of language matching we're going to use
            $where_clause = "";
            switch ($lang_match) {
                case "exact":
                    $where_clause = "language = '{$sql_language}'";
                    break;
                case "primary":
                    $where_clause = "language like '{$like_language}%'";
                    break;
                case "any":
                    $where_clause = "language like '%{$like_language}%'";
                    break;
                default:
                    die("Unknown language match used: {$lang_match}");
            }

            // The list type does not change from project to project, so
            // pick the loader (and reject bad input) once, before querying.
            if ($list_type == "good") {
                $load_words = 'load_project_good_words';
            } elseif ($list_type == "bad") {
                $load_words = 'load_project_bad_words';
            } else {
                die("Unknown list type: {$list_type}");
            }

            // loop through all projects that use $language
            $res = mysql_query("\n                SELECT projectid\n                FROM projects\n                WHERE {$where_clause}\n            ") or die(mysql_error());
            while (list($projectid) = mysql_fetch_row($res)) {
                $total_projects++;
                $words = $load_words($projectid);
                if (is_string($words)) {
                    // The loader returned an error message instead of a
                    // list: report it and count the project as having no
                    // word list rather than iterating over a string.
                    echo "{$words}<br>\n";
                    continue;
                }
                foreach ($words as $word) {
                    if (isset($word_freq[$word])) {
                        $word_freq[$word]++;
                    } else {
                        $word_freq[$word] = 1;
                    }
                }
                if (count($words)) {
                    $total_projects_with_words++;
                }
            }
            mysql_free_result($res);

            // sort the results, most frequent first
            arsort($word_freq);

            // show the results
            echo "<pre>";
            echo _("Language") . ": {$language}<br>";
            echo sprintf(_("Language match type: %s"), $lang_match) . "<br>";
            echo sprintf(_("Word list type: %s"), $list_type) . "<br>";
            echo sprintf(_("Cutoff percentage: %d%%"), $cutoff) . "<br>";
            echo sprintf(_("Total projects matching language: %d"), $total_projects) . "<br>";
            echo sprintf(_("Total projects with word lists: %d"), $total_projects_with_words) . "<br>";
            echo "<br>";
            echo _("Note: Percentages are calculated as frequency over the total number of projects with word lists.") . "<br>";
            echo "<br>";
            echo sprintf("%20s  %5s  %s<br>", _("Word"), _("Count"), _("Frequency"));
            // $word_freq is only non-empty when at least one project had
            // words, so the divisor below cannot be zero inside this loop.
            foreach ($word_freq as $word => $freq) {
                $percentage = $freq / $total_projects_with_words * 100;
                if ($percentage < $cutoff) {
                    // the list is sorted, so every later word is below the cutoff too
                    break;
                }
                echo sprintf("%20s  %5d  (%-3.2f%%)<br>", $word, $freq, $percentage);
            }
            echo "</pre>";
            break;
        case "list":
            $display_list = TRUE;
            break;
        default:
            die("Invalid action encountered.");
    }
    return $display_list;
}
예제 #5
0
/**
 * Merge the WordCheck data of project $from_id into project $to_id:
 * the good word list, the bad word list and the good-word suggestions file.
 *
 * A word list is only rewritten when both the source and the target list
 * could be loaded, so a load failure can no longer replace the target's
 * list with garbage. The suggestions of the source project are appended
 * to the target's suggestions file.
 *
 * @param $from_id project whose words/suggestions are copied
 * @param $to_id   project that receives them
 *
 * @return array error messages for the steps that had to be skipped
 *               (empty when everything was merged); callers that ignore
 *               the return value keep working as before
 */
function merge_wordcheck_files($from_id, $to_id)
{
    global $projects_dir;
    $errors = array();

    // good words, then bad words: same procedure, different loader/saver
    foreach (array('good', 'bad') as $list_type) {
        $load_words = "load_project_{$list_type}_words";
        $save_words = "save_project_{$list_type}_words";

        $from_words = $load_words($from_id);
        $to_words = $load_words($to_id);

        // The loaders return a string (an error message) on failure.
        $load_failed = FALSE;
        foreach (array($from_words, $to_words) as $loaded) {
            if (is_string($loaded)) {
                $errors[] = $loaded;
                $load_failed = TRUE;
            }
        }
        if ($load_failed) {
            // leave the target's list untouched rather than overwrite it
            continue;
        }

        $save_words($to_id, array_merge($to_words, $from_words));
    }

    // suggestions
    // the file format is complicated and may change
    // so we take the sledgehammer approach, as suggested by cpeel...
    $from_path = "{$projects_dir}/{$from_id}/good_word_suggestions.txt";
    $to_path = "{$projects_dir}/{$to_id}/good_word_suggestions.txt";

    if (!is_file($from_path)) {
        // The file does not exist.
        // Treat that the same as if it existed and was empty.
        $from_suggs = "";
    } else {
        $from_suggs = file_get_contents($from_path);
    }

    if ($from_suggs === FALSE) {
        $errors[] = sprintf(_("Unable to read %s"), $from_path);
    } else {
        // Append instead of read-concatenate-rewrite: the target's existing
        // suggestions are never held in memory, so a failed read of the
        // target can no longer truncate it. A missing target file is
        // created, exactly as before.
        if (file_put_contents($to_path, $from_suggs, FILE_APPEND) === FALSE) {
            $errors[] = sprintf(_("Unable to write %s"), $to_path);
        }
    }
    // we're assuming the projects are in unavailable or waiting, so there
    // is going to be no need to put locks on the files or anything fancy

    return $errors;
}
 /**
  * Fill this object's word-list members from the project's word files.
  *
  * For each requested list the file's modification time is stored in
  * gwl_timestamp / bwl_timestamp and the words, joined with newlines, in
  * good_words / bad_words. When a loader hands back a string (an error
  * message) that message is collected and the member is set to ''.
  *
  * @param $load_good_words whether to load the good word list
  * @param $load_bad_words  whether to load the bad word list
  *
  * @return array the collected error messages (empty on success)
  */
 function set_from_files($load_good_words = true, $load_bad_words = true)
 {
     // One row per word list:
     // requested?, file kind, timestamp member, text member, loader function
     $plan = array(
         array($load_good_words, "good", 'gwl_timestamp', 'good_words', 'load_project_good_words'),
         array($load_bad_words, "bad", 'bwl_timestamp', 'bad_words', 'load_project_bad_words'),
     );

     $errors = array();
     foreach ($plan as $entry) {
         list($wanted, $kind, $stamp_member, $text_member, $loader) = $entry;
         if (!$wanted) {
             continue;
         }

         // remember when the word file was last modified
         $word_file = get_project_word_file($this->projectid, $kind);
         $this->{$stamp_member} = $word_file->mod_time;

         $loaded = $loader($this->projectid);
         if (is_string($loaded)) {
             // a string result is an error message, not a word list
             $errors[] = $loaded;
             $this->{$text_member} = '';
         } else {
             $this->{$text_member} = implode("\n", $loaded);
         }
     }

     return $errors;
 }