// Converts a string to the UTF-8 output encoding.
//
// Strategy (in order of preference):
//   1. mbstring:  mb_convert_encoding() from the detected source encoding
//   2. iconv:     transliterate, and drop characters with no UTF-8 mapping
//   3. fallback:  if the string does not already look like valid UTF-8,
//                 treat it as ISO-8859-1 and map it to UTF-8 via utf8_encode()
//
// NOTE(review): 'detectCharacterEncoding()' is a project helper defined
//               elsewhere; assumed to return an encoding name, or null when
//               detection failed -- confirm against 'include.inc.php'.
function toOutputEncoding($str) {
	global $config; // NOTE(review): declared but unused within this function

	$enc = detectCharacterEncoding($str);

	if ($enc !== null && function_exists('mb_convert_encoding')) {
		// Preferred path: mbstring converts directly from the detected encoding:
		$str = mb_convert_encoding($str, 'UTF-8', $enc);
	} else if ($enc !== null && function_exists('iconv')) {
		// Second choice: iconv; '//TRANSLIT//IGNORE' transliterates where
		// possible and silently drops characters it cannot represent:
		$str = iconv($enc, 'UTF-8//TRANSLIT//IGNORE', $str);
	} else {
		// Last resort: test whether the string already is well-formed UTF-8.
		// Pattern from <http://w3.org/International/questions/qa-forms-utf-8.html>.
		// IMPORTANT: the 'x' (extended) modifier is in effect, so each '#'
		// comment runs to the end of its line -- the pattern MUST stay spread
		// over multiple lines. (Collapsed onto a single line, the first '#'
		// comment swallows the rest of the pattern, leaving an unmatched '(?:'
		// so preg_match() fails to compile and every string -- including valid
		// UTF-8 -- would get passed through utf8_encode() below.)
		$isUtf8 = preg_match('%^(?:
		      [\\x09\\x0A\\x0D\\x20-\\x7E]            # ASCII
		    | [\\xC2-\\xDF][\\x80-\\xBF]              # non-overlong 2-byte
		    |  \\xE0[\\xA0-\\xBF][\\x80-\\xBF]        # excluding overlongs
		    | [\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2} # straight 3-byte
		    |  \\xED[\\x80-\\x9F][\\x80-\\xBF]        # excluding surrogates
		    |  \\xF0[\\x90-\\xBF][\\x80-\\xBF]{2}     # planes 1-3
		    | [\\xF1-\\xF3][\\x80-\\xBF]{3}           # planes 4-15
		    |  \\xF4[\\x80-\\x8F][\\x80-\\xBF]{2}     # plane 16
		)*$%xs', $str);

		if (!$isUtf8) {
			// Not valid UTF-8: assume ISO-8859-1 (Latin-1) input.
			// NOTE(review): utf8_encode() is deprecated as of PHP 8.2; when the
			// codebase modernizes, switch to
			// mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1').
			$str = utf8_encode($str);
		}
	}

	return $str;
}
// NOTE(review): This is an interior fragment of a larger function/script whose
// opening (and some enclosing scopes) lie outside this chunk -- the stray
// closing braces below close scopes opened earlier, and the trailing 'if's are
// closed later in the file.

// Strip recognized DOI prefixes ("doi:" or the dx.doi.org resolver URL) from
// each whitespace-delimited identifier, leaving the bare DOIs:
$sourceIDs = preg_replace("#(?<=^|\\s)(doi:|http://dx\\.doi\\.org/)#", "", $sourceIDs); // Strip OpenURL prefixes ("openurl:" or an OpenURL base URL up to its '?') likewise:
$sourceIDs = preg_replace("#(?<=^|\\s)(openurl:|http://.+?(?=\\?))#", "", $sourceIDs); // Split on any whitespace between DOIs/OpenURLs:
$idArray = preg_split("/\\s+/", $sourceIDs, -1, PREG_SPLIT_NO_EMPTY); // Try to retrieve information from PubMed.gov before querying CrossRef.org:
// TODO: Test with $sourceIDs containing a mixture of DOIs and OpenURLs, as well as with $sourceIDs containing DOIs for articles listed in PubMed AND NOT listed in PubMed!
// If at least one identifier looks like a DOI (directory indicator "10." plus
// a 4-digit registrant code), query PubMed first:
if (preg_match("#10\\.\\d{4}/\\S+?(?=\$|\\s)#i", $sourceIDs)) {
	// NOTE(review): presumably this returns Medline source text for the DOIs it
	// found and removes them from $idArray -- verify in 'import.inc.php':
	list($errors, $sourceText, $idArray) = fetchDOIsFromPubMed($idArray); // function 'fetchDOIsFromPubMed()' is defined in 'import.inc.php'
}
if (!empty($idArray)) {
	// Fetch record metadata from CrossRef.org for all given DOIs/OpenURLs:
	list($errors, $sourceText) = fetchDataFromCrossRef($idArray, $sourceFormat); // function 'fetchDataFromCrossRef()' is defined in 'import.inc.php'
	// In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
	if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($sourceText) == "UTF-8") {
		$sourceText = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $sourceText, "UTF-8");
	}
} else {
	// No IDs left for CrossRef, i.e. PubMed already supplied everything; the
	// source text is therefore in Medline format:
	$sourceFormat = "Pubmed Medline";
}
} // NOTE(review): closes a scope opened before this fragment
} // NOTE(review): closes a scope opened before this fragment
// --------------------------------------------------------------------
// PARSE SOURCE TEXT:
if (!empty($sourceText) and !empty($sourceFormat)) {
	// fetch the path/name of the import format file that's associated with the import format given in '$sourceFormat':
	$importFormatFile = getFormatFile($sourceFormat, "import"); // function 'getFormatFile()' is defined in 'include.inc.php'
	if (!empty($importFormatFile)) {
		// Get all cite keys specified by the current user and build an array of uniquified cite keys ('$citeKeysArray')
// Converts an arXiv Atom XML feed (a SimplePie feed object) into the refbase
// import array format.
//
// Parameters:
//   &$feed                     - SimplePie feed object holding the arXiv Atom response
//   $importRecordsRadio        - (unused here; kept for interface compatibility)
//   $importRecordNumbersArray  - (unused here; kept for interface compatibility)
//
// Returns: array($importDataArray, $recordsCount,
//                $importRecordNumbersRecognizedFormatArray,
//                $importRecordNumbersNotRecognizedFormatArray, $errors)
function arxivToRefbase(&$feed, $importRecordsRadio, $importRecordNumbersArray) {
	global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
	global $contentTypeCharset; // defined in 'ini.inc.php'
	global $errors;
	global $showSource;

	// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
	// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
	//         - the split pattern must be specified as perl-style regular expression (including the leading & trailing
	//           slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
	$personDelimiter = "/ *; */";

	// Pattern by which a person's family name is separated from the given name (or initials):
	// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
	//  slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
	$familyNameGivenNameDelimiter = "/ (?=([{$upper}]+[-{$alpha}]+)( *;|\$))/{$patternModifiers}";

	// Specifies whether the person's family name comes first within a person's name
	// ('true' means that the family name is followed by the given name (or initials),
	//  'false' means that the person's family name comes *after* the given name (or initials)):
	$familyNameFirst = false;

	// Specifies whether a person's full given name(s) shall be shortened to initial(s):
	// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized
	//           (meaning removal of extra whitespace, adding of dots between initials, etc)
	//         - if set to 'false', given names (and any initials) are taken as is
	//         - in your database, you should stick to either fully written given names OR initials;
	//           if you mix these, records won't get sorted correctly on citation output)
	$shortenGivenNames = true;

	// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
	$transformCase = true;

	// Postprocessor actions:
	// Defines search & replace 'actions' that will be applied to all those refbase fields
	// that are listed in the corresponding 'fields' element:
	// (If you don't want to perform any search and replace actions, specify an empty array,
	//  like: '$postprocessorActionsArray = array();'. Note that, in this case, the search
	//  patterns MUST include the leading & trailing slashes -- which is done to allow for
	//  mode modifiers such as 'imsxU'.)
	//                                               "/Search Pattern/"  =>  "Replace Pattern"
	$postprocessorActionsArray = array(array('fields'  => array("title", "abstract", "notes"),
	                                         'actions' => array("/ *[\n\r]+ */" => " ")),
	                                   array('fields'  => array("title"),
	                                         'actions' => array("/[,.;:!] *\$/" => "")));

	// -----------------------------------------

	// PROCESS SOURCE DATA:

	// Initialize array variables:
	$parsedRecordsArray = array(); // will hold parsed data of all records that shall be imported

	// NOTE: We do NOT validate records yet, i.e. we assume that they are perfect and attempt to import all of them:
	$importRecordNumbersRecognizedFormatArray = array(); // record numbers of records that shall be imported AND which were of a recognized format
	$importRecordNumbersNotRecognizedFormatArray = array(); // same for all records that shall be imported BUT which had an UNrecognized format

	// Use these namespaces to retrieve tags:
	$atomNamespace = 'http://www.w3.org/2005/Atom';
	$opensearchNamespace = 'http://a9.com/-/spec/opensearch/1.1/';
	$arxivNamespace = 'http://arxiv.org/schemas/atom';

	// Get feed data:
	$recordArray = $feed->get_items(); // fetch all feed items into an array
	$recordsCount = count($recordArray); // count how many records are available

	// -----------------------------------------

	// LOOP OVER EACH RECORD:
	for ($i = 0; $i < $recordsCount; $i++) {
		$fieldParametersArray = array(); // will hold all fields that were extracted for a given record
		$record = $recordArray[$i]; // this will make things a bit more readable

		// Check for any errors:
		if ($record->get_title() == "Error") {
			$importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized
			$arXivError = $record->get_description(); // e.g. "incorrect id format for 1234.12345"

			// Prepare an appropriate error message:
			$errorMessage = "Record " . ($i + 1) . ": " . $arXivError . "!";

			if (!isset($errors["sourceText"]))
				$errors["sourceText"] = $errorMessage;
			else
				$errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
		} elseif (!$record->get_permalink()) {
			$importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized

			// Prepare an appropriate error message:
			$errorMessage = "Record " . ($i + 1) . ": nothing found!";

			if (!isset($errors["sourceText"]))
				$errors["sourceText"] = $errorMessage;
			else
				$errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
		} else {
			// NOTE: We do NOT yet validate any found records, i.e. for now, we'll just assume that they are ok:
			$importRecordNumbersRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format IS recognized
			                                                     // ('$i' starts with 0 so we have to add 1 to point to the correct record number)

			// Extract elements of the current Atom XML entry:

			// - type:
			$fieldParametersArray['type'] = 'Journal Article'; // NOTE: Are all arXiv records journal articles? TODO: find what isn't & fix the type

			// - id:
			//   extract the arXiv ID from the abstract URL in the 'id' element & prefix it with "arXiv:":
			$fieldParametersArray['notes'] = str_replace("http://arxiv.org/abs/", "arXiv:", $record->get_permalink());

			// - title:
			$fieldParametersArray['title'] = $record->get_title();

			// - summary:
			if ($abstract = $record->get_description())
				$fieldParametersArray['abstract'] = $abstract;

			// - author:
			//   NOTE: If we didn't want to extract author affiliation info, we could just use
			//         standard SimplePie functions ('get_authors()' and 'get_name()')
			$authorsArray = array();
			$addressArray = array();
			$authors = $record->get_item_tags($atomNamespace, 'author');
			if (!is_array($authors)) // ROBUSTNESS FIX: SimplePie's 'get_item_tags()' returns null when no such tags exist
				$authors = array();
			foreach ($authors as $author) {
				$authorName = "";
				$authorLastName = "";
				$authorAddressArray = array(); // BUGFIX: was initialized as "" (a string) but used as an array below

				if (isset($author['child'][$atomNamespace]['name']) and $authorName = $author['child'][$atomNamespace]['name'][0]['data']) {
					// -- name:
					// In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
					// NOTE: For authors, we need to perform charset conversion up here (and not further down below,
					//       as is done for all the other fields), since otherwise the below '$upper' and '$alpha'
					//       character class elements would fail to match!
					if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($authorName) == "UTF-8") // function 'detectCharacterEncoding()' is defined in 'include.inc.php'
						$authorName = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorName, "UTF-8"); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'

					// Change the formatting of author names to the one used by refbase, i.e. the family name comes
					// first, and a comma separates family name & initials:
					// (further standardisation of person names is done in function 'standardizeFieldData()';
					//  see also note for '$familyNameGivenNameDelimiter' above)
					// NOTE: With the above settings for '$familyNameGivenNameDelimiter' and '$familyNameFirst' this isn't necessary anymore
					// $authorName = preg_replace("/^(.+?) +([$upper]+[-$alpha]+)$/$patternModifiers", "\\2, \\1", $authorName);

					$authorsArray[] = $authorName;

					// -- arxiv:affiliation:
					if (isset($author['child'][$arxivNamespace]) and $authorAffiliations = $author['child'][$arxivNamespace]['affiliation']) {
						foreach ($authorAffiliations as $authorAffiliation)
							$authorAddressArray[] = $authorAffiliation['data'];

						$authorAddresses = implode(", ", $authorAddressArray);

						// In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
						if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($authorAddresses) == "UTF-8")
							$authorAddresses = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorAddresses, "UTF-8");

						$authorLastName = preg_replace("/^([{$upper}]+[-{$alpha}]+).+\$/{$patternModifiers}", "\\1", $authorName); // extract authors last name
						$addressArray[] = $authorLastName . ": " . $authorAddresses;
					}
				}
			}

			if (!empty($authorsArray))
				$fieldParametersArray['author'] = implode("; ", $authorsArray); // merge multiple authors

			if (!empty($addressArray))
				$fieldParametersArray['address'] = implode("; ", $addressArray); // merge multiple author affiliations

			// - links:
			//   TODO: Currently, we just copy a link to the PDF to the 'file' field. It might be desirable
			//         to fetch the actual PDF and store it on the refbase server.
			//   NOTE: - In order to extract any links, we access the raw SimplePie object here; This is done since,
			//           in SimplePie v1.1.1, the standard SimplePie functions 'get_link()' and 'get_links()' only
			//           support checking for the 'rel' attribute, but don't allow to filter on the 'type' or 'title'
			//           attribute. However, we need to check the 'type' & 'title' attributes in order to assign PDF
			//           & DOI links to the 'file' & 'doi' fields, respectively. Alternatively, we could also get this
			//           information from the URL itself, but that may fail if arXiv changes its URL pattern.
			//         - More info on how to grab custom tags or attributes:
			//           <http://simplepie.org/wiki/tutorial/grab_custom_tags_or_attributes>
			$links = $record->get_item_tags($atomNamespace, 'link');
			if (!is_array($links)) // ROBUSTNESS FIX: guard against a null return (no 'link' tags)
				$links = array();
			foreach ($links as $link) {
				if (isset($link['attribs']['']['href'])) {
					// -- file:
					if (!isset($fieldParametersArray['file']) and isset($link['attribs']['']['title']) and $link['attribs']['']['title'] == "pdf") // we could also check for 'type="application/pdf"'
						$fieldParametersArray['file'] = $link['attribs']['']['href'];
					elseif (!isset($fieldParametersArray['url']) and isset($link['attribs']['']['type']) and $link['attribs']['']['type'] == "text/html") // we could also check for 'title' being unset
						$fieldParametersArray['url'] = $link['attribs']['']['href'];
					elseif (!isset($fieldParametersArray['doi']) and isset($link['attribs']['']['title']) and $link['attribs']['']['title'] == "doi")
						$fieldParametersArray['doi'] = str_replace("http://dx.doi.org/", "", $link['attribs']['']['href']);
				}
			}

			// - arxiv:comment:
			if ($comment = $record->get_item_tags($arxivNamespace, 'comment'))
				$fieldParametersArray['notes'] .= "; " . $comment[0]['data']; // TODO: if arXiv records can include multiple comments, we'd need to loop over all of them

			// - arxiv:primary_category:
			//   TODO: Should we copy the term given in the 'arxiv:primary_category' element to the 'area' field?

			// - arxiv:category:
			$categoriesArray = array();
			$categories = $record->get_categories();
			if (is_array($categories)) { // ROBUSTNESS FIX: 'get_categories()' may return null
				foreach ($categories as $category)
					$categoriesArray[] = $category->get_label();
			}
			if (!empty($categoriesArray))
				$fieldParametersArray['keywords'] = implode("; ", $categoriesArray); // merge multiple categories

			// - arxiv:journal_ref:
			if ($journalRef = $record->get_item_tags($arxivNamespace, 'journal_ref')) {
				// We extract the full 'journal_ref' string into its own variable since we're going to mess with it;
				// transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space:
				$journalRefData = preg_replace("/ *[\n\r]+ */", " ", $journalRef[0]['data']);

				// NOTE: The formatting of the 'journal_ref' string can vary heavily, so
				//       the below parsing efforts may fail. Therefore, we'll also copy the
				//       original 'journal_ref' string to the 'notes' field, and display it
				//       in the header message when importing single records.
				$fieldParametersArray['source'] = $journalRefData;
				$fieldParametersArray['notes'] .= "; Journal Ref: " . $journalRefData;

				// Extract source info from the 'journal_ref' string into the different fields:
				// NOTE: We try to use reasonably liberal (and thus rather ugly!) regex patterns
				//       which should catch most of the commonly used formatting styles. However,
				//       as noted above, due to the varying formatting of the 'journal_ref' string,
				//       this may not be always entirely successful.
				// TODO: Extract ISSN from the 'journal_ref' string (see e.g. 'arXiv:cond-mat/0506611v1')

				// -- journal:
				$journalName = preg_replace("/^(.+?)(?= *(\\(?\\d+|[,;]|(v(ol)?\\.?|volume) *\\d+|\$)).*/i", "\\1", $journalRefData); // extract journal name
				$journalRefData = preg_replace("/^(.+?)(?= *(\\(?\\d+|[,;]|(v(ol)?\\.?|volume) *\\d+|\$))[,; ]*/i", "", $journalRefData); // remove journal name from 'journal_ref' string

				if (preg_match("/\\./", $journalName)) // a dotted name is taken as an abbreviated journal name
					$fieldParametersArray['abbrev_journal'] = preg_replace("/(?<=\\.)(?![ )]|\$)/", " ", $journalName);
				else
					$fieldParametersArray['publication'] = $journalName;

				// -- volume:
				// NOTE: The volume is assumed to be the first number that follows the journal name, and
				//       which is followed by another four-digit number (which is asssumed to be the year).
				if (preg_match("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4})/i", $journalRefData)) {
					$fieldParametersArray['volume'] = preg_replace("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4}).*/i", "\\1", $journalRefData); // extract volume
					$journalRefData = preg_replace("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4})[,; ]*/i", "", $journalRefData); // remove volume from 'journal_ref' string
				}

				// -- year (take 1):
				// NOTE: For the first take, we assume the year to be the first occurrence of a four-digit number
				//       that's wrapped in parentheses.
				if (preg_match("/\\(\\d{4}\\)/i", $journalRefData)) {
					$fieldParametersArray['year'] = preg_replace("/^.*?\\((\\d{4})\\).*?\$/i", "\\1", $journalRefData); // extract year
					$journalRefData = preg_replace("/[,; ]*\\(\\d{4}\\)[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
				}

				// -- issue:
				// NOTE: The issue is only recognized if it is preceded with a "n/no/number" prefix, or if it is a
				//       number with less than four digits that is enclosed in parentheses (we can check for the latter
				//       case since four-digit years that are wrapped in parens have already been removed). The regex
				//       patterns below also try to account for some non-digit characters in issue numbers.
				// TODO: Support issue extraction from "Journal Vol:No ..." format (see e.g. 'arXiv:cond-mat/0703452v2')
				if (preg_match("/(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\)/i", $journalRefData)) {
					$fieldParametersArray['issue'] = preg_replace("/^.*?(?:(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\)).*?\$/i", "\\1\\2", $journalRefData); // extract issue
					$journalRefData = preg_replace("/[,; ]*(?:(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\))[,; ]*/i", "", $journalRefData); // remove issue from 'journal_ref' string
				}

				// -- pages (take 1):
				// NOTE: For the first take, we assume the pages to be either preceded with a "p/pp" prefix, or to
				//       be a page range.
				if (preg_match("/(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)/i", $journalRefData)) {
					$fieldParametersArray['startPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)).*?\$/i", "\\1\\3", $journalRefData); // extract starting page
					$fieldParametersArray['endPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)).*?\$/i", "\\2\\4", $journalRefData); // extract ending page
					$journalRefData = preg_replace("/[,; ]*(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*))[,; ]*/i", "", $journalRefData); // remove page info from 'journal_ref' string
				}

				// -- year (take 2):
				// NOTE: For the second take, we assume the year to be the first occurrence of any four-digit number
				//       in the remaining 'journal_ref' string.
				if (!isset($fieldParametersArray['year']) and preg_match("/\\b\\d{4}\\b/i", $journalRefData)) {
					$fieldParametersArray['year'] = preg_replace("/^.*?\\b(\\d{4})\\b.*?\$/i", "\\1", $journalRefData); // extract year
					$journalRefData = preg_replace("/[,; ]*\\b\\d{4}\\b[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
				}

				// -- pages (take 2):
				// NOTE: For the second take, we assume the page info to be any number that is at the beginning of
				//       the remaining 'journal_ref' string.
				if (!isset($fieldParametersArray['startPage']) and preg_match("/^[,; ]*\\w*\\d+\\w*/i", $journalRefData))
					$fieldParametersArray['startPage'] = preg_replace("/^[,; ]*(\\w*\\d+\\w*).*?\$/i", "\\1", $journalRefData); // extract page info
			}

			// Standardize field data contained in '$fieldParametersArray':
			foreach ($fieldParametersArray as $fieldKey => $fieldData) {
				// In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
				// (we exclude the 'author' and 'address' fields here since they have already been dealt with above)
				if (!preg_match("/^(author|address)\$/", $fieldKey) and $contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($fieldData) == "UTF-8") {
					$fieldData = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $fieldData, "UTF-8");
					// BUGFIX: persist the converted data; previously the conversion was
					// silently discarded unless an entity-decoding branch below also fired:
					$fieldParametersArray[$fieldKey] = $fieldData;
				}

				// Decode HTML special chars:
				if ($fieldKey != "url" and preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData))
					$fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData);
				elseif ($fieldKey == "url" and preg_match('/&/', $fieldData))
					// in case of the 'url' field, we just decode any encoded ampersand characters
					// BUGFIX: the replacement was a no-op ('&' -> '&'); decode '&amp;' entities instead:
					$fieldParametersArray[$fieldKey] = str_replace('&amp;', '&', $fieldData);
			}

			// Function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names,
			// normalizes page ranges, and reformats person names according to preference:
			$fieldParametersArray = standardizeFieldData($fieldParametersArray, "arXiv XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);

			// Append the array of extracted field data to the main data array which holds all records to import:
			$parsedRecordsArray[] = $fieldParametersArray;
		}
	}

	// -----------------------------------------

	// Build refbase import array:
	$importDataArray = buildImportArray("refbase", // 'type' - the type of the import data (e.g. 'refbase' or 'mods')
	                                    "1.0", // 'version' - the version of the import data
	                                    "http://refbase.net/import/arxiv/", // 'creator'
	                                    "Matthias Steffens", // 'author'
	                                    "*****@*****.**", // 'contact'
	                                    array('prefix_call_number' => "true"), // 'options'
	                                    $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

	return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}