Normalizes a string in an unknown (non-UTF-8) encoding into a valid UTF-8 sequence.
static public utf8_normalize ( $str ) : string
| $str | string | input string |
| return | string | |
/**
 * Sanitize a single variable.
 *
 * When charset normalization is switched on, the client charset is UTF-8
 * and the value is not valid UTF-8, the value is normalized to UTF-8, its
 * HTML entities are decoded, invalid UTF-8 sequences are stripped and the
 * result is re-escaped with htmlspecialchars(). In all cases ASCII control
 * characters are stripped and surrounding whitespace is trimmed.
 *
 * @param $var string
 * @return string
 */
static function cleanVar($var) {
	// The checks are kept in their original order so that the
	// validity test only runs once both configuration checks passed.
	$normalizationEnabled = Config::getVar('i18n', 'charset_normalization') == 'On';
	$needsNormalization = $normalizationEnabled
		&& strtolower_codesafe(Config::getVar('i18n', 'client_charset')) == 'utf-8'
		&& !PKPString::utf8_is_valid($var);

	if ($needsNormalization) {
		// Normalize, then decode HTML entities into UTF-8 characters
		// (no transcoding takes place here).
		$decoded = html_entity_decode(PKPString::utf8_normalize($var), ENT_COMPAT, 'UTF-8');

		// Drop invalid UTF-8 sequences and escape special characters
		// again without double-encoding existing entities.
		$var = htmlspecialchars(PKPString::utf8_bad_strip($decoded), ENT_NOQUOTES, 'UTF-8', false);
	}

	// Remove invalid ASCII control characters, then trim whitespace.
	return trim(PKPString::utf8_strip_ascii_ctrl($var));
}
/**
 * Parse a raw citation string with the ParaCite perl wrapper and return
 * the result as an NLM meta-data description.
 *
 * NB: The input is taken by reference and is overwritten with its ASCII
 * transliteration before it is handed to the parser.
 *
 * @copydoc Filter::process()
 * @param $input string
 * @return MetadataDescription null if perl is not available, the parser
 *  returned nothing or its output could not be loaded as XML.
 */
function &process(&$input) {
	$citationString =& $input;
	$nullVar = null;

	// Check the availability of perl
	$perlCommand = Config::getVar('cli', 'perl');
	if (empty($perlCommand) || !file_exists($perlCommand)) {
		return $nullVar;
	}

	// Convert to ASCII - Paracite doesn't handle UTF-8 well
	$citationString = PKPString::utf8_to_ascii($citationString);

	// Call the paracite parser. Every argument is escaped individually
	// so that no argument can be interpreted by the shell.
	$wrapperScript = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'paracite.pl';
	$paraciteCommand = $perlCommand . ' '
		. escapeshellarg($wrapperScript) . ' '
		. escapeshellarg((string) $this->getCitationModule()) . ' '
		. escapeshellarg($citationString);
	$xmlResult = shell_exec($paraciteCommand);
	if (empty($xmlResult)) {
		return $nullVar;
	}

	if (Config::getVar('i18n', 'charset_normalization') == 'On' && !PKPString::utf8_compliant($xmlResult)) {
		$xmlResult = PKPString::utf8_normalize($xmlResult);
	}

	// Create a temporary DOM document
	$resultDOM = new DOMDocument();
	$resultDOM->recover = true;
	$resultDOM->loadXML($xmlResult);

	// Without a root element there is nothing to extract.
	if (is_null($resultDOM->documentElement)) {
		return $nullVar;
	}

	// Extract the parser results as an array
	$xmlHelper = new XMLHelper();
	$metadata = $xmlHelper->xmlToArray($resultDOM->documentElement);

	// We have to merge subtitle and title as neither OpenURL
	// nor NLM can handle subtitles.
	if (isset($metadata['subtitle'])) {
		if (isset($metadata['title'])) {
			$metadata['title'] .= '. ' . $metadata['subtitle'];
		} else {
			// No title to append to: promote the subtitle.
			$metadata['title'] = $metadata['subtitle'];
		}
		unset($metadata['subtitle']);
	}

	// Break up the authors field
	if (isset($metadata['authors'])) {
		$metadata['authors'] = PKPString::trimPunctuation($metadata['authors']);
		$metadata['authors'] = PKPString::iterativeExplode(array(':', ';'), $metadata['authors']);
	}

	// Convert pages to integers
	foreach (array('spage', 'epage') as $pageProperty) {
		if (isset($metadata[$pageProperty])) {
			$metadata[$pageProperty] = (int) $metadata[$pageProperty];
		}
	}

	// Convert titles to title case
	foreach (array('title', 'chapter', 'publication') as $titleProperty) {
		if (isset($metadata[$titleProperty])) {
			$metadata[$titleProperty] = PKPString::titleCase($metadata[$titleProperty]);
		}
	}

	// Map ParaCite results to OpenURL - null means
	// throw the value away.
	$metadataMapping = array(
		'genre' => 'genre',
		'_class' => null,
		'any' => null,
		'authors' => 'au',
		'aufirst' => 'aufirst',
		'aufull' => null,
		'auinit' => 'auinit',
		'aulast' => 'aulast',
		'atitle' => 'atitle',
		'cappublication' => null,
		'captitle' => null,
		'date' => 'date',
		'epage' => 'epage',
		'featureID' => null,
		'id' => null,
		'issue' => 'issue',
		'jnl_epos' => null,
		'jnl_spos' => null,
		'match' => null,
		'marked' => null,
		'num_of_fig' => null,
		'pages' => 'pages',
		'publisher' => 'pub',
		'publoc' => 'place',
		'ref' => null,
		'rest_text' => null,
		'spage' => 'spage',
		'targetURL' => 'url',
		'text' => null,
		'ucpublication' => null,
		'uctitle' => null,
		'volume' => 'volume',
		'year' => 'date'
	);

	// Ignore 'year' if 'date' is set
	if (isset($metadata['date'])) {
		$metadataMapping['year'] = null;
	}

	// Set default genre
	if (empty($metadata['genre'])) {
		$metadata['genre'] = OPENURL10_GENRE_ARTICLE;
	}

	// Handle title, chapter and publication depending on
	// the (inferred) genre. Also instantiate the target schema.
	switch ($metadata['genre']) {
		case OPENURL10_GENRE_BOOK:
		case OPENURL10_GENRE_BOOKITEM:
		case OPENURL10_GENRE_REPORT:
		case OPENURL10_GENRE_DOCUMENT:
			$metadataMapping += array('publication' => 'btitle', 'chapter' => 'atitle');
			if (isset($metadata['title'])) {
				if (!isset($metadata['publication'])) {
					$metadata['publication'] = $metadata['title'];
				} elseif (!isset($metadata['chapter'])) {
					$metadata['chapter'] = $metadata['title'];
				}
				unset($metadata['title']);
			}
			$openurl10SchemaName = 'lib.pkp.plugins.metadata.openurl10.schema.Openurl10BookSchema';
			$openurl10SchemaClass = 'Openurl10BookSchema';
			break;

		case OPENURL10_GENRE_ARTICLE:
		case OPENURL10_GENRE_JOURNAL:
		case OPENURL10_GENRE_ISSUE:
		case OPENURL10_GENRE_CONFERENCE:
		case OPENURL10_GENRE_PROCEEDING:
		case OPENURL10_GENRE_PREPRINT:
		default:
			$metadataMapping += array('publication' => 'jtitle');
			if (isset($metadata['title'])) {
				if (!isset($metadata['publication'])) {
					$metadata['publication'] = $metadata['title'];
				} elseif (!isset($metadata['atitle'])) {
					$metadata['atitle'] = $metadata['title'];
				}
				unset($metadata['title']);
			}
			$openurl10SchemaName = 'lib.pkp.plugins.metadata.openurl10.schema.Openurl10JournalSchema';
			$openurl10SchemaClass = 'Openurl10JournalSchema';
			break;
	}

	// Instantiate an OpenURL description
	$openurl10Description = new MetadataDescription($openurl10SchemaName, ASSOC_TYPE_CITATION);
	$openurl10Schema = new $openurl10SchemaClass();

	// Map the ParaCite result to OpenURL
	foreach ($metadata as $paraciteElementName => $paraciteValue) {
		if (!empty($paraciteValue)) {
			// Trim punctuation
			if (is_string($paraciteValue)) {
				$paraciteValue = PKPString::trimPunctuation($paraciteValue);
			}

			// Transfer the value to the OpenURL result array.
			// Unknown elements are treated like elements mapped to
			// null (i.e. dropped) when assertions are disabled.
			assert(array_key_exists($paraciteElementName, $metadataMapping));
			$openurl10PropertyName = isset($metadataMapping[$paraciteElementName])
				? $metadataMapping[$paraciteElementName]
				: null;
			if (!is_null($openurl10PropertyName) && $openurl10Schema->hasProperty($openurl10PropertyName)) {
				if (is_array($paraciteValue)) {
					foreach ($paraciteValue as $singleValue) {
						$success = $openurl10Description->addStatement($openurl10PropertyName, $singleValue);
						assert($success);
					}
				} else {
					$success = $openurl10Description->addStatement($openurl10PropertyName, $paraciteValue);
					assert($success);
				}
			}
		}
	}

	// Crosswalk to NLM
	$crosswalkFilter = new Openurl10Nlm30CitationSchemaCrosswalkFilter();
	$nlm30Description =& $crosswalkFilter->execute($openurl10Description);
	assert(is_a($nlm30Description, 'MetadataDescription'));

	// Add 'rest_text' as NLM comment (if given)
	if (isset($metadata['rest_text'])) {
		$nlm30Description->addStatement('comment', PKPString::trimPunctuation($metadata['rest_text']));
	}

	// Set display name and sequence id in the meta-data description
	// to the corresponding values from the filter. This is important
	// so that we later know which result came from which filter.
	$nlm30Description->setDisplayName($this->getDisplayName());
	$nlm30Description->setSequence($this->getSequence());

	return $nlm30Description;
}
/**
 * Take a citation string and clean/normalize it:
 * convert it to UTF-8 if required, strip slashes, trim it and
 * collapse every run of whitespace into a single space.
 * @param $citationString string
 * @return string
 */
function _cleanCitationString($citationString) {
	// 1) If the string contains non-UTF8 characters, convert it to UTF-8.
	//    The setting is compared against 'On' (rather than tested for
	//    truthiness) so that a literal 'Off' value does not enable
	//    normalization.
	if (Config::getVar('i18n', 'charset_normalization') == 'On' && !PKPString::utf8_compliant($citationString)) {
		$citationString = PKPString::utf8_normalize($citationString);
	}

	// 2) Strip slashes and whitespace
	$citationString = trim(stripslashes($citationString));

	// 3) Normalize whitespace
	$citationString = PKPString::regexp_replace('/[\\s]+/', ' ', $citationString);

	return $citationString;
}
/**
 * Call a web service.
 *
 * Dispatches on the request method (PUT, POST, GET), treats an empty
 * result or a 4xx/5xx response status as an error and otherwise returns
 * the response with slashes stripped and (if configured) normalized
 * to UTF-8.
 *
 * @param $webServiceRequest WebServiceRequest
 * @return string the result of the web service or null in case of an error.
 */
function &call(&$webServiceRequest) {
	assert(is_a($webServiceRequest, 'WebServiceRequest'));

	// Start out with "no result" so that an unsupported method is
	// handled by the error check below instead of reading an
	// undefined variable when assertions are disabled.
	$result = null;
	$usePut = false;
	switch ($webServiceRequest->getMethod()) {
		case 'PUT':
			$usePut = true;
			// Intentional fall-through: PUT shares the POST code path.

		case 'POST':
			if ($webServiceRequest->getAsync()) {
				$result = $this->_callPostWebServiceAsync($webServiceRequest, $usePut);
			} else {
				$result = $this->_callPostWebService($webServiceRequest, $usePut);
			}
			break;

		case 'GET':
			$result = $this->_callGetWebService($webServiceRequest);
			break;

		default:
			// TODO: implement DELETE
			assert(false);
	}

	// Catch web service errors
	$nullVar = null;
	if (!$result) {
		return $nullVar;
	}

	if ($this->_lastResponseStatus >= 400 && $this->_lastResponseStatus <= 599) {
		return $nullVar;
	}

	// Clean the result
	$result = stripslashes($result);
	if (Config::getVar('i18n', 'charset_normalization') == 'On' && !PKPString::utf8_compliant($result)) {
		$result = PKPString::utf8_normalize($result);
	}

	return $result;
}
/**
 * Parse an XML file using the specified handler.
 * If no handler has been specified, XMLParserDOMHandler is used by default, returning a tree structure representing the document.
 * The data is read chunk by chunk from a file wrapper; redirects
 * returned by the wrapper are followed before reading starts.
 * @param $file string full path to the XML file
 * @param $dataCallback mixed Optional callback for data handling: function dataCallback($operation, $wrapper, $data = null)
 * @return object actual return type depends on the handler; false if the resource could not be opened
 */
function &parse($file, $dataCallback = null) {
	$parser =& $this->createParser();

	if (!isset($this->handler)) {
		// Use default handler for parsing
		$handler = new XMLParserDOMHandler();
		$this->setHandler($handler);
	}

	xml_set_object($parser, $this->handler);
	xml_set_element_handler($parser, "startElement", "endElement");
	xml_set_character_data_handler($parser, "characterData");

	import('lib.pkp.classes.file.FileWrapper');
	$wrapper =& FileWrapper::wrapper($file);

	// Handle responses of various types
	while (true) {
		$newWrapper = $wrapper->open();
		if (is_object($newWrapper)) {
			// Follow a redirect
			unset($wrapper);
			$wrapper =& $newWrapper;
			unset($newWrapper);
		} elseif (!$newWrapper) {
			// Could not open resource -- error.
			// Release the parser before bailing out so that it
			// does not leak on the error path.
			$this->destroyParser($parser);
			$returner = false;
			return $returner;
		} else {
			// OK, we've found the end result
			break;
		}
	}

	if (!$wrapper) {
		// Same clean-up as above for the second error exit.
		$this->destroyParser($parser);
		$result = false;
		return $result;
	}

	if ($dataCallback) {
		call_user_func($dataCallback, 'open', $wrapper);
	}

	while (!$wrapper->eof() && ($data = $wrapper->read()) !== false) {
		// if the string contains non-UTF8 characters, convert it to UTF-8 for parsing
		if (Config::getVar('i18n', 'charset_normalization') == 'On' && !PKPString::utf8_compliant($data)) {
			$utf8_last = PKPString::substr($data, PKPString::strlen($data) - 1);

			// if the string ends in a "bad" UTF-8 character, maybe it's truncated
			while (!$wrapper->eof() && PKPString::utf8_bad_find($utf8_last) === 0) {
				// read another chunk of data
				$data .= $wrapper->read();
				$utf8_last = PKPString::substr($data, PKPString::strlen($data) - 1);
			}

			$data = PKPString::utf8_normalize($data);

			// strip any invalid UTF-8 sequences
			$data = PKPString::utf8_bad_strip($data);

			// convert named entities to numeric entities
			$data = strtr($data, PKPString::getHTMLEntities());
		}

		// strip any invalid ASCII control characters
		$data = PKPString::utf8_strip_ascii_ctrl($data);

		if ($dataCallback) {
			call_user_func($dataCallback, 'parse', $wrapper, $data);
		}

		// Parse errors are recorded and parsing continues with the next chunk.
		if (!xml_parse($parser, $data, $wrapper->eof())) {
			$this->addError(xml_error_string(xml_get_error_code($parser)));
		}
	}

	if ($dataCallback) {
		call_user_func($dataCallback, 'close', $wrapper);
	}

	$wrapper->close();
	$result = $this->handler->getResult();
	$this->destroyParser($parser);

	// Only a default handler created by this call is destroyed here.
	if (isset($handler)) {
		$handler->destroy();
	}

	return $result;
}