The calculation is optimized to identify the largest
common substring.
The return value is an array of the following format:
array(
array( diff-type => substring ),
array(...)
)
whereby diff-type can be one of:
-1 = deletion
0 = common substring
1 = addition
/**
 * Check PKPString::diff() against a table of original/edited string
 * pairs and the diff each pair is expected to produce.
 * @covers PKPString::diff
 */
public function testDiff() {
	// Each entry: original string, edited string, expected diff.
	$diffCases = array(
		// Two strings that have common substrings.
		array(
			'The original string.',
			'The edited original.',
			array(
				array(0 => 'The'),
				array(1 => ' edited'),
				array(0 => ' original'),
				array(-1 => ' string'),
				array(0 => '.')
			)
		),
		// Two completely different strings.
		array(
			'abc',
			'def',
			array(
				array(-1 => 'abc'),
				array(1 => 'def')
			)
		),
		// A more realistic example from the citation editor use case.
		array(
			'Willinsky, B. (2006). The access principle: The case for open acces to research and scholarship. Cambridge, MA: MIT Press.',
			'Willinsky, J. (2006). The access principle: The case for open access to research and scholarship. Cambridge, MA: MIT Press.',
			array(
				array(0 => 'Willinsky, '),
				array(-1 => 'B'),
				array(1 => 'J'),
				array(0 => '. (2006). The access principle: The case for open acce'),
				array(1 => 's'),
				array(0 => 's to research and scholarship. Cambridge, MA: MIT Press.')
			)
		)
	);

	// Run the cases in declaration order; the first mismatch fails the test.
	foreach ($diffCases as $diffCase) {
		list($original, $edited, $expected) = $diffCase;
		$actual = PKPString::diff($original, $edited);
		self::assertEquals($expected, $actual);
	}
}
/**
 * Derive a confidence score calculated as the similarity of the
 * original raw citation and the citation text generated from the
 * citation description.
 *
 * The score is the percentage of characters of the original citation
 * that were not reported as deleted by PKPString::diff(). Additions
 * do not influence the score. If the original citation contains no
 * characters, the score is 0.
 * @param $metadataDescription MetadataDescription
 * @return integer filter confidence score (0 to 100)
 */
function _filterConfidenceScore(&$metadataDescription) {
	// Retrieve the original plain text citation.
	$originalCitation = $this->getOriginalRawCitation();

	// Generate the formatted citation output from the description.
	$citationOutputFilter =& $this->getCitationOutputFilter();
	$generatedCitation = $citationOutputFilter->execute($metadataDescription);

	// Strip formatting and the Google Scholar tag so that we get a plain
	// text string that is comparable with the raw citation.
	$generatedCitation = trim(str_replace(GOOGLE_SCHOLAR_TAG, '', strip_tags($generatedCitation)));

	// Compare the original to the generated citation.
	$citationDiff = PKPString::diff($originalCitation, $generatedCitation);

	// Calculate similarity as the number of deleted characters in relation to the
	// number of characters in the original citation. This intentionally excludes
	// additions as these can represent useful data like a DOI or an external link.
	$deletedCharacters = 0;
	foreach ($citationDiff as $diffPart) {
		// Identify deletions (diff-type -1). Array keys are always
		// normalized to integers by PHP so a strict comparison is safe.
		if (key($diffPart) === -1) {
			$deletedCharacters += PKPString::strlen(current($diffPart));
		}
	}

	// An empty original citation gives us nothing to compare against and
	// would make the ratio below a division by zero, so report the
	// lowest possible confidence instead.
	$originalCharacters = PKPString::strlen($originalCitation);
	if ($originalCharacters <= 0) {
		return 0;
	}

	$partOfCommonCharacters = ($originalCharacters - $deletedCharacters) / $originalCharacters;

	// Express the ratio as a whole percentage, capped at 100.
	$filterConfidenceScore = (int) round(min($partOfCommonCharacters * 100, 100));
	return $filterConfidenceScore;
}