Ejemplo n.º 1
0
}
if (preg_match('/^https?:\\/\\//', $name2)) {
    print '<li><a href="' . htmlspecialchars($name2) . '">' . htmlspecialchars($name2) . '</a></li>';
} else {
    print '<li>' . htmlspecialchars($name2) . '</li>';
}
print '</ul>';
ini_set('display_errors', 1);
error_reporting(E_ALL | E_STRICT);
print "<p>";
$terms1 = get_terms($name1, $source1, $filecontents1, $removenumbers, $removequotations);
$terms2 = get_terms($name2, $source2, $filecontents2, $removenumbers, $removequotations);
# print("terms1: " . join(',', $terms1) . "\n");
# print("terms2: " . join(',', $terms2) . "\n");
$terms1_posts = compute_posts($terms1, $minwords);
$matches1 = compute_matches($terms1, $terms2, $terms1_posts, $minwords);
print "Total match candidates found: " . count($matches1) . " (before eliminating redundant matches)</p>";
print "</p>";
usort($matches1, 'cmp_by_length_desc');
$already_matched_phrases = (array) null;
$num_matches = 0;
$context_words = 30;
$min_context_words = 6;
$max_context_words = 20;
print "<p>Matched phrases:</p>\n";
foreach ($matches1 as $value) {
    list($pos1, $pos2, $length, $phrase) = $value;
    $skip = 0;
    foreach ($already_matched_phrases as $already_phrase) {
        if (strpos($already_phrase, $phrase) !== false) {
            $skip = 1;
Ejemplo n.º 2
0
/**
 * from https://github.com/wikigit/Duplication-Detector
 *
 * Copyright (c) 2011, Derrick Coetzee (User:Dcoetzee)
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 *  * Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
/**
 * Wiki action: compare two documents and report the phrases they share.
 *
 * Document 1 is taken, in order of preference, from $params['source'] (pasted
 * text), $params['url0'] (http/https URL), the uploaded file 'file0', or the
 * current wiki page. Document 2 is taken from $params['url'], the uploaded
 * file 'file', or $params['target']. When no second document is supplied the
 * input form is rendered instead.
 *
 * If $params['btn_diff'] is set, a diff of the two documents is shown (as
 * text/plain when $params['raw'] is also set). Otherwise the matching phrases
 * are listed with a few words of surrounding context from each document.
 *
 * All output is written directly to the response; nothing is returned.
 *
 * NOTE(review): url0/url are fetched server-side via wget() after only a
 * scheme check; whether wget() restricts internal hosts is not visible here.
 *
 * @param object $formatter page formatter; used for header/title/footer
 *                          output, the current page and the diff processor
 * @param array  $params    request parameters (source, url0, url, target,
 *                          minwords, minchars, removequotations,
 *                          removenumbers, btn_diff, raw)
 * @return void
 */
function do_copycat($formatter, $params = array())
{
    global $DBInfo;
    $charset = $DBInfo->charset;

    // Get starting time to measure time elapsed later.
    $time_start = microtime_float();

    // --- Resolve document 1 -------------------------------------------------
    $source1 = 'Downloaded';
    if (!empty($params['source'])) {
        $source1 = 'Text';
        $name1 = 'source';
        $filecontents1 = $params['source'];
    } elseif (!empty($params['url0']) && preg_match("@^https?://@", $params['url0'])) {
        $name1 = $params['url0'];
        $filecontents1 = wget($params['url0']);
    } elseif (isset($_FILES['file0']) && $_FILES['file0']['tmp_name']) {
        $source1 = 'Uploaded';
        $name1 = $_FILES['file0']['name'];
        $filecontents1 = read_file_contents($_FILES['file0']['tmp_name']);
    } else {
        $source1 = 'Wiki';
        $name1 = $formatter->page->name;
        $filecontents1 = $formatter->page->_get_raw_body();
    }

    // --- Resolve document 2, or show the input form -------------------------
    $source2 = 'Downloaded';
    if (!empty($params['url']) && preg_match("@^https?://@", $params['url'])) {
        $name2 = $params['url'];
        $filecontents2 = wget($params['url']);
    } elseif (isset($_FILES['file']) && $_FILES['file']['tmp_name']) {
        $source2 = 'Uploaded';
        $name2 = $_FILES['file']['name'];
        $filecontents2 = read_file_contents($_FILES['file']['tmp_name']);
    } elseif (!empty($params['target'])) {
        $source2 = 'Text';
        $name2 = 'target';
        $filecontents2 = $params['target'];
    } else {
        $params['.title'] = _("Copycat detector");
        $formatter->send_header('', $params);
        $formatter->send_title('', '', $params);
        $comp_btn = _("Detect");
        $diff_btn = _("Diff");
        $raw_check = _("Raw");
        $source_lab = _("Source (URL or PageName)");
        $target_lab = _("Target (URL)");
        $min_num_words = _("Minimum number of words");
        $min_num_chars = _("Minimum number of characters");
        $remove_quotes = _("Remove quotations");
        $remove_number = _("Remove numbers");
        $or = _("or");
        // $name1 can come from the request (url0, upload name) or a page
        // name, so it must be escaped before landing in an HTML attribute.
        $name1_attr = htmlspecialchars($name1, ENT_QUOTES);
        echo <<<FORM
<form name="comparesimple" method="post">
<input type="hidden" name="action" value="copycat" />
{$source_lab}: <input style="width: 99%" type="text" name="url0" value="{$name1_attr}" size="120"><br/>
{$or} <br />
<textarea style="width: 100%; height: 408px; box-sizing: border-box" name="source" cols="80" rows="10">
</textarea>
<br/>
{$target_lab}: <input style="width: 99%" type="text" name="url" size="120"><br />
{$or} <br />
<textarea style="width: 100%; height: 408px; box-sizing: border-box" name="target" cols="80" rows="10">
</textarea>
<br/>
{$min_num_words}: <input type="text" name="minwords" value="2"><br/>
{$min_num_chars}: <input type="text" name="minchars" value="13"><br/>
<input type="checkbox" name="removequotations" checked="checked"> {$remove_quotes}<br/>
<input type="checkbox" name="removenumbers" checked="checked"> {$remove_number}<br/>
<br/>
<input type="submit" name="btn_compare" value="{$comp_btn}" /> {$or}
<input type="submit" name="btn_diff" value="{$diff_btn}" /> <input type="checkbox" name="raw" /> {$raw_check}
</form>
FORM;
        $formatter->send_footer('', $params);
        return;
    }

    $shorturl1 = htmlspecialchars(shorten_url($name1));
    $shorturl2 = htmlspecialchars(shorten_url($name2));
    $title = "Duplicate Detector: {$shorturl1} v {$shorturl2}";

    // --- Diff mode ----------------------------------------------------------
    if (!empty($params['btn_diff'])) {
        $diff = get_diff($filecontents1, $filecontents2, $params);
        if (!empty($params['raw'])) {
            header("Content-Type: text/plain");
            echo $diff;
            return;
        }
        $params['.title'] = "Diff: {$shorturl1} v {$shorturl2}";
        $formatter->send_header('', $params);
        $formatter->send_title('', '', $params);
        if (isset($diff[0])) {
            echo "<div id='wikiDiffPreview'>\n";
            echo $formatter->processor_repl('diff', $diff, $params);
            echo "</div>\n";
        }
        $formatter->send_footer('', $params);
        return;
    }

    // --- Detection options --------------------------------------------------
    // Every option may be absent (unchecked checkboxes are never submitted,
    // and the action can be invoked without the form), so guard each read.
    $minwords = isset($params['minwords']) ? (int) $params['minwords'] : 0;
    if ($minwords < 2) {
        $minwords = 2;
    }
    $minchars = isset($params['minchars']) ? (int) $params['minchars'] : 0;
    if ($minchars < 1) {
        $minchars = 13;
    }
    $removequotations = isset($params['removequotations']) ? $params['removequotations'] : null;
    $removenumbers = isset($params['removenumbers']) ? $params['removenumbers'] : null;

    // --- Report header ------------------------------------------------------
    $params['.title'] = $title;
    $formatter->send_header('', $params);
    $formatter->send_title('', '', $params);
    print '<p><b>Warning</b>: Duplication Detector may in some cases give no results or incomplete results. This does not necessarily indicate copying has not occurred. Manually examine the source document to verify.</p>';
    print "<h2>Comparing documents for duplicated text:</h2>";
    print '<ul>';
    if (preg_match('/^https?:\\/\\//', $name1)) {
        print '<li><a href="' . htmlspecialchars($name1) . '">' . htmlspecialchars($name1) . '</a></li>';
    } else {
        print '<li>' . htmlspecialchars($name1) . '</li>';
    }
    if (preg_match('/^https?:\\/\\//', $name2)) {
        print '<li><a href="' . htmlspecialchars($name2) . '">' . htmlspecialchars($name2) . '</a></li>';
    } else {
        print '<li>' . htmlspecialchars($name2) . '</li>';
    }
    print '</ul>';

    // --- Tokenise and find candidate matches --------------------------------
    // The paragraph is opened before the helpers run and closed right after;
    // presumably they may print status text of their own — TODO confirm.
    echo "<p>", "\n";
    $terms1 = get_terms($name1, $source1, $filecontents1, $removenumbers, $removequotations, $charset);
    $terms2 = get_terms($name2, $source2, $filecontents2, $removenumbers, $removequotations, $charset);
    $terms1_posts = compute_posts($terms1, $minwords);
    $matches1 = compute_matches($terms1, $terms2, $terms1_posts, $minwords);
    echo "</p>", "\n";
    echo "<h3>Total match candidates found: ", count($matches1), "</h3>", "\n";
    echo "<p>(before eliminating redundant matches)</p>", "\n";

    // Longest matches first, so shorter matches contained in an earlier
    // phrase can be recognised as redundant and skipped.
    usort($matches1, 'cmp_by_length_desc');
    $already_matched_phrases = array();
    $num_matches = 0;
    $context_words = 30;
    $min_context_words = 6;
    $max_context_words = 20;

    echo "<h2>Matched phrases:</h2>", "\n";
    foreach ($matches1 as $value) {
        list($pos1, $pos2, $length, $phrase) = $value;

        $skip = false;
        foreach ($already_matched_phrases as $already_phrase) {
            if (strpos($already_phrase, $phrase) !== false) {
                $skip = true;
                break;
            }
        }

        $characters = mb_strlen($phrase, $charset);
        if (!$skip && $characters >= $minchars) {
            // The phrase text originates from external documents; escape it
            // with the same default-charset call used for the names above.
            // NOTE(review): if get_terms() already returns HTML-escaped
            // terms this would double-encode entities — confirm.
            $phrase_html = htmlspecialchars($phrase, ENT_QUOTES);

            if ($length > $context_words - $min_context_words) {
                // Phrase is long enough on its own; show it without context.
                print "<p><ins class='diff-added'>{$phrase_html}</ins>" . " ({$length} words, {$characters} characters)</p>";
            } else {
                // Whole number of context words per side. A fractional value
                // would be truncated separately for the slice offset and the
                // slice length, dropping the word right before the phrase.
                $context_len = (int) floor(($context_words - $length) / 2);
                if ($context_len * 2 >= $max_context_words) {
                    $context_len = (int) ($max_context_words / 2);
                }

                $start1 = max($pos1 - $context_len, 0);
                $start2 = max($pos2 - $context_len, 0);
                // array_slice() clamps at the end of the array, so the suffix
                // needs no explicit bound.
                $phraseprefix1 = htmlspecialchars(join(' ', array_slice($terms1, $start1, $pos1 - $start1)), ENT_QUOTES);
                $phrasesuffix1 = htmlspecialchars(join(' ', array_slice($terms1, $pos1 + $length, $context_len)), ENT_QUOTES);
                $phraseprefix2 = htmlspecialchars(join(' ', array_slice($terms2, $start2, $pos2 - $start2)), ENT_QUOTES);
                $phrasesuffix2 = htmlspecialchars(join(' ', array_slice($terms2, $pos2 + $length, $context_len)), ENT_QUOTES);

                print "<p>{$phraseprefix1} <ins class='diff-added'>{$phrase_html}</ins> {$phrasesuffix1}<br/>";
                print "{$phraseprefix2} <ins class='diff-added'>{$phrase_html}</ins> {$phrasesuffix2}<br/>";
                print "({$length} words, {$characters} characters)</p>";
            }
            $num_matches++;
        }
        // Recorded even when skipped or too short, matching the original
        // redundancy filter.
        $already_matched_phrases[] = $phrase;
    }

    echo "<h3>Matching phrases found: ", $num_matches, "</h3>\n";
    $time_delta = microtime_float() - $time_start;
    echo '<p>', 'Elapsed time:', $time_delta, '</p>';
    $formatter->send_footer('', $params);
    return;
}