} if (preg_match('/^https?:\\/\\//', $name2)) { print '<li><a href="' . htmlspecialchars($name2) . '">' . htmlspecialchars($name2) . '</a></li>'; } else { print '<li>' . htmlspecialchars($name2) . '</li>'; } print '</ul>'; ini_set('display_errors', 1); error_reporting(E_ALL | E_STRICT); print "<p>"; $terms1 = get_terms($name1, $source1, $filecontents1, $removenumbers, $removequotations); $terms2 = get_terms($name2, $source2, $filecontents2, $removenumbers, $removequotations); # print("terms1: " . join(',', $terms1) . "\n"); # print("terms2: " . join(',', $terms2) . "\n"); $terms1_posts = compute_posts($terms1, $minwords); $matches1 = compute_matches($terms1, $terms2, $terms1_posts, $minwords); print "Total match candidates found: " . count($matches1) . " (before eliminating redundant matches)</p>"; print "</p>"; usort($matches1, 'cmp_by_length_desc'); $already_matched_phrases = (array) null; $num_matches = 0; $context_words = 30; $min_context_words = 6; $max_context_words = 20; print "<p>Matched phrases:</p>\n"; foreach ($matches1 as $value) { list($pos1, $pos2, $length, $phrase) = $value; $skip = 0; foreach ($already_matched_phrases as $already_phrase) { if (strpos($already_phrase, $phrase) !== false) { $skip = 1;
/**
 * from https://github.com/wikigit/Duplication-Detector
 *
 * Copyright (c) 2011, Derrick Coetzee (User:Dcoetzee)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/**
 * "copycat" action: compare two documents and report duplicated phrases.
 *
 * The first document is taken, in order of preference, from
 * $params['source'] (literal text), $params['url0'] (http/https URL),
 * the uploaded file 'file0', or the raw body of the current wiki page.
 * The second document is taken from $params['url'] (http/https URL),
 * the uploaded file 'file', or $params['target'] (literal text).
 * When no second document is given, the input form is rendered instead.
 *
 * Other $params keys read here: 'btn_diff' (show a diff instead of the
 * phrase report), 'raw' (emit the diff as text/plain), 'minwords',
 * 'minchars', 'removequotations' and 'removenumbers'. The key '.title'
 * is written before the page header is sent.
 *
 * All output is written directly to the response; nothing is returned.
 *
 * @param object $formatter formatter exposing page->name, page->_get_raw_body(),
 *                          send_header(), send_title(), send_footer() and
 *                          processor_repl()
 * @param array  $params    request parameters (see above)
 * @return void
 */
function do_copycat($formatter, $params = array())
{
    global $DBInfo;

    $charset = $DBInfo->charset;

    // Get starting time to measure time elapsed later.
    $time_start = microtime_float();

    // --- pick the first document ---
    $source1 = 'Downloaded';
    if (!empty($params['source'])) {
        $source1 = 'Text';
        $name1 = 'source';
        $filecontents1 = $params['source'];
    } elseif (!empty($params['url0']) && preg_match("@^https?://@", $params['url0'])) {
        $name1 = $params['url0'];
        $filecontents1 = wget($params['url0']);
    } elseif (isset($_FILES['file0']) && $_FILES['file0']['tmp_name']) {
        $source1 = 'Uploaded';
        $name1 = $_FILES['file0']['name'];
        $filecontents1 = read_file_contents($_FILES['file0']['tmp_name']);
    } else {
        $source1 = 'Wiki';
        $name1 = $formatter->page->name;
        $filecontents1 = $formatter->page->_get_raw_body();
    }

    // --- pick the second document, or fall back to showing the form ---
    $source2 = 'Downloaded';
    if (!empty($params['url']) && preg_match("@^https?://@", $params['url'])) {
        $name2 = $params['url'];
        $filecontents2 = wget($params['url']);
    } elseif (isset($_FILES['file']) && $_FILES['file']['tmp_name']) {
        $source2 = 'Uploaded';
        $name2 = $_FILES['file']['name'];
        $filecontents2 = read_file_contents($_FILES['file']['tmp_name']);
    } elseif (!empty($params['target'])) {
        $source2 = 'Text';
        $name2 = 'target';
        $filecontents2 = $params['target'];
    } else {
        $params['.title'] = _("Copycat detector");
        $formatter->send_header('', $params);
        $formatter->send_title('', '', $params);

        $comp_btn = _("Detect");
        $diff_btn = _("Diff");
        $raw_check = _("Raw");
        $source_lab = _("Source (URL or PageName)");
        $target_lab = _("Target (URL)");
        $min_num_words = _("Minimum number of words");
        $min_num_chars = _("Minimum number of characters");
        $remove_quotes = _("Remove quotations");
        $remove_number = _("Remove numbers");
        $or = _("or");

        // $name1 can come straight from the request (url0 / upload name), so it
        // must be escaped before being placed inside the value="..." attribute.
        $name1_attr = htmlspecialchars($name1);

        // The textareas are emitted with no inner whitespace so that an
        // untouched field is submitted as an empty string.
        echo <<<FORM
<form name="comparesimple" method="post">
<input type="hidden" name="action" value="copycat" />
{$source_lab}: <input style="width: 99%" type="text" name="url0" value="{$name1_attr}" size="120"><br/>
{$or} <br />
<textarea style="width: 100%; height: 408px; box-sizing: border-box" name="source" cols="80" rows="10"></textarea>
<br/>
{$target_lab}: <input style="width: 99%" type="text" name="url" size="120"><br />
{$or} <br />
<textarea style="width: 100%; height: 408px; box-sizing: border-box" name="target" cols="80" rows="10"></textarea>
<br/>
{$min_num_words}: <input type="text" name="minwords" value="2"><br/>
{$min_num_chars}: <input type="text" name="minchars" value="13"><br/>
<input type="checkbox" name="removequotations" checked="checked"> {$remove_quotes}<br/>
<input type="checkbox" name="removenumbers" checked="checked"> {$remove_number}<br/>
<br/>
<input type="submit" name="btn_compare" value="{$comp_btn}" />
{$or}
<input type="submit" name="btn_diff" value="{$diff_btn}" />
<input type="checkbox" name="raw" /> {$raw_check}
</form>
FORM;
        $formatter->send_footer('', $params);
        return;
    }

    $shorturl1 = htmlspecialchars(shorten_url($name1));
    $shorturl2 = htmlspecialchars(shorten_url($name2));
    $title = "Duplicate Detector: {$shorturl1} v {$shorturl2}";

    // --- diff mode ---
    if (!empty($params['btn_diff'])) {
        $diff = get_diff($filecontents1, $filecontents2, $params);
        if (!empty($params['raw'])) {
            header("Content-Type: text/plain");
            echo $diff;
            return;
        }

        $params['.title'] = "Diff: {$shorturl1} v {$shorturl2}";
        $formatter->send_header('', $params);
        $formatter->send_title('', '', $params);
        if (isset($diff[0])) {
            echo "<div id='wikiDiffPreview'>\n";
            echo $formatter->processor_repl('diff', $diff, $params);
            echo "</div>\n";
        }
        $formatter->send_footer('', $params);
        return;
    }

    // --- options; every key is optional (unchecked checkboxes are not submitted) ---
    $minwords = isset($params['minwords']) ? (int) $params['minwords'] : 0;
    if ($minwords < 2) {
        $minwords = 2;
    }
    $minchars = isset($params['minchars']) ? (int) $params['minchars'] : 0;
    if (!$minchars) {
        $minchars = 13;
    }
    $removequotations = isset($params['removequotations']) ? $params['removequotations'] : null;
    $removenumbers = isset($params['removenumbers']) ? $params['removenumbers'] : null;

    // --- report header ---
    $params['.title'] = $title;
    $formatter->send_header('', $params);
    $formatter->send_title('', '', $params);

    print '<p><b>Warning</b>: Duplication Detector may in some cases give no results or incomplete results. This does not necessarily indicate copying has not occurred. Manually examine the source document to verify.</p>';
    print "<h2>Comparing documents for duplicated text:</h2>";
    print '<ul>';
    foreach (array($name1, $name2) as $doc_name) {
        $doc_label = htmlspecialchars($doc_name);
        if (preg_match('/^https?:\\/\\//', $doc_name)) {
            print '<li><a href="' . $doc_label . '">' . $doc_label . '</a></li>';
        } else {
            print '<li>' . $doc_label . '</li>';
        }
    }
    print '</ul>';

    // --- tokenize and match ---
    echo "<p>", "\n";
    $terms1 = get_terms($name1, $source1, $filecontents1, $removenumbers, $removequotations, $charset);
    $terms2 = get_terms($name2, $source2, $filecontents2, $removenumbers, $removequotations, $charset);

    $terms1_posts = compute_posts($terms1, $minwords);
    $matches1 = compute_matches($terms1, $terms2, $terms1_posts, $minwords);
    // Close the paragraph before the heading: a heading may not sit inside <p>.
    echo "</p>", "\n";

    echo "<h3>Total match candidates found: ", count($matches1), "</h3>", "\n";
    echo "<p>(before eliminating redundant matches)</p>", "\n";

    usort($matches1, 'cmp_by_length_desc');

    $already_matched_phrases = array();
    $num_matches = 0;
    $context_words = 30;
    $min_context_words = 6;
    $max_context_words = 20;

    echo "<h2>Matched phrases:</h2>", "\n";

    // NOTE(review): $phrase and the context words below are emitted without
    // HTML escaping. Confirm that get_terms() returns HTML-safe terms; if it
    // does not, escape them here before printing.
    foreach ($matches1 as $value) {
        list($pos1, $pos2, $length, $phrase) = $value;

        // A phrase contained in an already seen phrase is redundant.
        $skip = false;
        foreach ($already_matched_phrases as $already_phrase) {
            if (strpos($already_phrase, $phrase) !== false) {
                $skip = true;
                break;
            }
        }

        $characters = mb_strlen($phrase, $charset);
        if (!$skip && $characters >= $minchars) {
            if ($length > $context_words - $min_context_words) {
                // Long match: too little room left for useful context.
                print "<p><ins class='diff-added'>{$phrase}</ins>"
                    . " ({$length} words, {$characters} characters)</p>";
            } else {
                // Whole number of context words per side. A fractional value
                // would be truncated separately for the slice offset and the
                // slice length, dropping the word right before the phrase.
                $context_len = min(
                    (int) floor(($context_words - $length) / 2),
                    (int) floor($max_context_words / 2)
                );

                $start1 = max($pos1 - $context_len, 0);
                $end1 = $pos1 + $length;
                $phraseprefix1 = join(' ', array_slice($terms1, $start1, $pos1 - $start1));
                $phrasesuffix1 = join(' ', array_slice($terms1, $end1, min(count($terms1) - $end1, $context_len)));

                $start2 = max($pos2 - $context_len, 0);
                $end2 = $pos2 + $length;
                $phraseprefix2 = join(' ', array_slice($terms2, $start2, $pos2 - $start2));
                $phrasesuffix2 = join(' ', array_slice($terms2, $end2, min(count($terms2) - $end2, $context_len)));

                print "<p>{$phraseprefix1} <ins class='diff-added'>{$phrase}</ins> {$phrasesuffix1}<br/>";
                print "{$phraseprefix2} <ins class='diff-added'>{$phrase}</ins> {$phrasesuffix2}<br/>";
                print "({$length} words, {$characters} characters)</p>";
            }
            $num_matches++;
        }

        // Remembered even when skipped or too short, so later sub-phrases are suppressed too.
        $already_matched_phrases[] = $phrase;
    }

    echo "<h3>Matching phrases found: ", $num_matches, "</h3>\n";

    $time_delta = microtime_float() - $time_start;
    echo '<p>', 'Elapsed time:', $time_delta, '</p>';

    $formatter->send_footer('', $params);
    return;
}