/**
 * Exports the given records in the requested export format and sends the
 * result to the client (as inline text/HTML/XML, as a file download, or --
 * for the "email" export type -- additionally by email).
 *
 * The function echoes its output and sets HTTP headers; it returns nothing.
 *
 * @param mixed  $result           passed unchanged to 'exportRecords()'
 * @param mixed  $rowOffset        passed unchanged to 'exportRecords()'
 * @param mixed  $showRows         passed unchanged to 'exportRecords()'
 * @param string $exportFormat     name of the export format; replaced by '$defaultExportFormat'
 *                                 if no format file is found for it
 * @param string $exportType       key that selects mime type & delivery; matched against the
 *                                 patterns "text", "html"/"email", "xml", "rss" and "file"
 * @param mixed  $exportStylesheet passed to 'exportRecords()'; if empty, Atom XML gets the
 *                                 'application/atom+xml' content type
 * @param mixed  $displayType      passed unchanged to 'exportRecords()'
 * @param mixed  $viewType         passed unchanged to 'displayHTMLhead()'
 * @param mixed  $userID           user whose options are loaded into the global '$userOptionsArray'
 */
function generateExport($result, $rowOffset, $showRows, $exportFormat, $exportType, $exportStylesheet, $displayType, $viewType, $userID)
{
    global $officialDatabaseName; // these variables are defined in 'ini.inc.php'
    global $contentTypeCharset;
    global $convertExportDataToUTF8;
    global $defaultExportFormat;
    global $userOptionsArray;
    global $loc; // '$loc' is made globally available in 'core.php'

    // get all user options for the current user:
    // (note that '$userOptionsArray' is made globally available)
    $userOptionsArray = getUserOptions($userID); // function 'getUserOptions()' is defined in 'include.inc.php'

    // fetch the path/name of the export format file that's associated with the export format given in '$exportFormat':
    $exportFormatFile = getFormatFile($exportFormat, "export"); // function 'getFormatFile()' is defined in 'include.inc.php'

    if (empty($exportFormatFile)) {
        // the given export format could not be found, so fall back to the default export format from 'ini.inc.php':
        $exportFormat = $defaultExportFormat;
        $exportFormatFile = getFormatFile($exportFormat, "export");
    }

    // include the found export format file *once*:
    include_once "export/" . $exportFormatFile;

    // export found records using the specified export format:
    // (function 'exportRecords()' is defined in the export format file given in '$exportFormatFile')
    $exportText = exportRecords($result, $rowOffset, $showRows, $exportStylesheet, $displayType);

    // adjust the mime type (and, for file downloads, the file name) based on the key given in '$exportType':
    if (preg_match("/text/i", $exportType)) {
        $exportContentType = "text/plain";
    } elseif (preg_match("/^(html|email)\$/i", $exportType)) {
        $exportContentType = "text/html";
    } elseif (preg_match("/xml/i", $exportType)) {
        // NOTE: Firefox >=2.x, Safari >=2.x and IE >=7.x break client-side XSL for RSS and Atom feeds!
        //       See e.g.: <http://decafbad.com/blog/2006/11/02/firefox-20-breaks-client-side-xsl-for-rss-and-atom-feeds>
        // TODO: Re-evaluate: As a consequence, a VERY dirty hack is applied in 'atomxml.inc.php' that prevents the
        //       feed sniffing (a comment larger than 512 bytes is inserted before the feed element).
        //       See: <http://feedme.mind-it.info/pivot/entry.php?id=9>
        //
        // For some browsers (such as Camino <http://caminobrowser.org/>) the content type 'application/xml'
        // (while incorrect for Atom/RSS) triggers the XML+XSLT renderer if the feed was requested with a stylesheet.
        // With 'application/atom+xml', Firefox 2 and Safari 2 always apply their own default XSLT stylesheet
        // and ignore any client-side XSL transformation -- hence it is only used when no stylesheet was given.
        if (preg_match("/Atom/i", $exportFormat) and empty($exportStylesheet)) {
            $exportContentType = "application/atom+xml";
        } else {
            $exportContentType = "application/xml";
        }
    } elseif (preg_match("/rss/i", $exportType)) {
        $exportContentType = "application/rss+xml";
    } elseif (preg_match("/file/i", $exportType)) {
        $exportContentType = "text/plain"; // set the default mime type

        // Note that we do some "quick'n dirty" guessing for some export formats here (e.g., we assume/require
        // that an XML export format name contains 'XML' within its name!). This is in NO way fool proof.
        if (preg_match("/XML/i", $exportFormat)) {
            $exportContentType = "application/xml"; // default for XML formats; overridden for Atom & ODF below

            if (preg_match("/Atom/i", $exportFormat)) { // the export format name contains 'Atom'
                $exportContentType = "application/atom+xml";
                $exportFileName = "atom_export.xml";
            } elseif (preg_match("/SRW_DC/i", $exportFormat)) { // the export format name contains 'SRW_DC'
                $exportFileName = "srw_dc_export.xml";
            } elseif (preg_match("/SRW_MODS/i", $exportFormat)) { // the export format name contains 'SRW_MODS'
                $exportFileName = "srw_mods_export.xml";
            } elseif (preg_match("/SRW/i", $exportFormat)) { // the export format name contains 'SRW' (fallback)
                $exportFileName = "srw_export.xml";
            } elseif (preg_match("/^MODS/i", $exportFormat)) { // starts with 'MODS' (the pattern must not match "SRW_MODS XML")
                $exportFileName = "mods_export.xml";
            } elseif (preg_match("/^(OAI_)?DC/i", $exportFormat)) { // starts with 'OAI_DC' or 'DC' (the pattern must not match "SRW_DC XML")
                $exportFileName = "oaidc_export.xml";
            } elseif (preg_match("/ODF|OpenDocument/i", $exportFormat)) {
                // We are already inside the "file" export type branch, so the ODF spreadsheet is always
                // delivered as a download (the former non-file fallback here was unreachable).
                $exportContentType = "application/vnd.oasis.opendocument.spreadsheet";
                $exportFileName = "odf_export.ods";
            } elseif (preg_match("/Word/i", $exportFormat)) { // the export format name contains 'Word'
                $exportFileName = "msword_export.xml";
            } else {
                $exportFileName = "export.xml";
            }
        } elseif (preg_match("/ADS|BibTeX|Endnote|ISI|RIS/i", $exportFormat)) {
            if (preg_match("/ADS/i", $exportFormat)) {
                $exportFileName = "ads_export.txt";
            } elseif (preg_match("/BibTeX/i", $exportFormat)) {
                $exportFileName = "bibtex_export.bib";
            } elseif (preg_match("/Endnote/i", $exportFormat)) {
                $exportFileName = "endnote_export.enw";
            } elseif (preg_match("/ISI/i", $exportFormat)) {
                $exportFileName = "isi_export.txt";
            } elseif (preg_match("/RIS/i", $exportFormat)) {
                $exportFileName = "ris_export.ris";
            }
        } else {
            $exportFileName = "exported_records.txt"; // set the default download file name
        }
    }

    // If variable '$convertExportDataToUTF8' is set to "yes" in 'ini.inc.php', latin1 data get converted to UTF-8
    // on export; therefore, the global '$contentTypeCharset' is temporarily set to UTF-8 for the output below.
    // '$charsetOverridden' remembers that this happened so that the globals can be restored afterwards.
    $charsetOverridden = false;

    if ($convertExportDataToUTF8 == "yes" and $contentTypeCharset != "UTF-8") {
        $oldContentTypeCharset = $contentTypeCharset; // remember the actual database charset
        $oldOfficialDatabaseName = $officialDatabaseName; // remember the database name as originally encoded

        // also convert any higher ASCII chars in variables which get included within the HTML output:
        $officialDatabaseName = convertToCharacterEncoding("UTF-8", "IGNORE", $officialDatabaseName); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'
        $contentTypeCharset = "UTF-8";
        $charsetOverridden = true;
    }

    // set the appropriate mimetype & set the character encoding to the one given in '$contentTypeCharset':
    setHeaderContentType($exportContentType, $contentTypeCharset); // function 'setHeaderContentType()' is defined in 'include.inc.php'

    if (preg_match("/file/i", $exportType)) {
        // instruct the browser to download the resulting file:
        header('Content-Disposition: attachment; filename="' . $exportFileName . '"');
    } elseif (preg_match("/^(html|email)\$/i", $exportType)) {
        if (preg_match("/email/i", $exportType)) {
            // send the raw (not yet HTML-encoded) export text to the logged-in user:
            $emailRecipient = $_SESSION['loginEmail'];
            $emailSubject = "Your records from the " . $officialDatabaseName . " (exported to " . $exportFormat . " format)";
            $emailBody = $exportText;

            sendEmail($emailRecipient, $emailSubject, $emailBody); // function 'sendEmail()' is defined in 'include.inc.php'
        }

        // call the 'displayHTMLhead()' function (defined in 'header.inc.php'):
        displayHTMLhead(encodeHTML($officialDatabaseName) . " -- Exported Data", "index,follow", "Data exported from the " . encodeHTML($officialDatabaseName), "", false, "", $viewType, array());

        $exportText = "\n\t<pre>\n" . encodeHTML($exportText) . "\n\t</pre>\n</body>\n</html>\n";

        if ($exportType == "email") {
            // NOTE(review): the login email is written into the page without HTML encoding -- confirm it is trusted.
            $exportText = "\n\t<p>"
                . "\n\t\t<a href=\"javascript:history.back()\" title=\"" . $loc["LinkTitle_GoBackToResults"] . "\">" . $loc["Go Back"] . "</a>"
                . "\n\t</p>"
                . "\n\t<p>"
                . "\n\t\t<b>The data below have been sent to <a href=\"mailto:" . $_SESSION['loginEmail'] . "\">" . $_SESSION['loginEmail'] . "</a>:</b>"
                . "\n\t</p>"
                . $exportText;
        }
    }

    // Restore the globals that were temporarily overridden above.
    // (Re-testing '$contentTypeCharset != "UTF-8"' here would never succeed, since the override itself
    // has set '$contentTypeCharset' to "UTF-8" -- which is why a dedicated flag is used.)
    if ($charsetOverridden) {
        $contentTypeCharset = $oldContentTypeCharset; // restore the actual database charset
        $officialDatabaseName = $oldOfficialDatabaseName; // restore the database name as originally encoded
    }

    if (preg_match("/ODF|OpenDocument/i", $exportFormat) && preg_match("/file/i", $exportType)) {
        // This is a dirty hack to zip and return an ODF file.
        // It may be desired to return other non-textual formats (or email attachments) in the future;
        // if this becomes needed, the output handling should be refactored.
        $zipfile = zipODF($exportText); // function 'zipODF()' is defined in 'odfxml.inc.php'
        echo $zipfile->file();
    } else {
        // present the output within the _same_ browser window:
        // (no popup window is used here, since this may be blocked by particular browsers)
        echo $exportText;
    }
}
$sourceIDs = preg_replace("#(?<=^|\\s)(openurl:|http://.+?(?=\\?))#", "", $sourceIDs); // Split on any whitespace between DOIs/OpenURLs: $idArray = preg_split("/\\s+/", $sourceIDs, -1, PREG_SPLIT_NO_EMPTY); // Try to retrieve information from PubMed.gov before querying CrossRef.org: // TODO: Test with $sourceIDs containing a mixture of DOIs and OpenURLs, as well as with $sourceIDs containing DOIs for articles listed in PubMed AND NOT listed in PubMed! if (preg_match("#10\\.\\d{4}/\\S+?(?=\$|\\s)#i", $sourceIDs)) { list($errors, $sourceText, $idArray) = fetchDOIsFromPubMed($idArray); // function 'fetchDOIsFromPubMed()' is defined in 'import.inc.php' } if (!empty($idArray)) { // Fetch record metadata from CrossRef.org for all given DOIs/OpenURLs: list($errors, $sourceText) = fetchDataFromCrossRef($idArray, $sourceFormat); // function 'fetchDataFromCrossRef()' is defined in 'import.inc.php' // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1: if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($sourceText) == "UTF-8") { $sourceText = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $sourceText, "UTF-8"); } } else { $sourceFormat = "Pubmed Medline"; } } } // -------------------------------------------------------------------- // PARSE SOURCE TEXT: if (!empty($sourceText) and !empty($sourceFormat)) { // fetch the path/name of the import format file that's associated with the import format given in '$sourceFormat': $importFormatFile = getFormatFile($sourceFormat, "import"); // function 'getFormatFile()' is defined in 'include.inc.php()' if (!empty($importFormatFile)) { // Get all cite keys specified by the current user and build an array of uniquified cite keys ('$citeKeysArray') // which is used to ensure uniqueness of generated cite keys among all imported records as well as the user's existing records:
/**
 * Builds an RTF document containing the formatted citations of the records in '$result'
 * and returns it as a string.
 *
 * Only '$result', '$rowsFound', '$showRows', '$citeStyle', '$citeOrder', '$citeType' and
 * '$headerMsg' are read by this function body; the remaining parameters are accepted but not used here.
 *
 * @param mixed  $result     result set that rows are fetched from via 'mysql_fetch_array()'
 * @param mixed  $rowsFound  upper limit of rows to output for non-CLI clients
 * @param mixed  $showRows   upper limit of rows to output for CLI clients
 * @param mixed  $citeStyle  passed to 'citeRecord()'
 * @param string $citeOrder  if it matches "year" or "type", section headings are generated
 * @param mixed  $citeType   passed to 'citeRecord()'
 * @param string $headerMsg  optional message written to the RTF page header
 *
 * @return string the assembled RTF data
 */
function citeRecords($result, $rowsFound, $query, $queryURL, $showQuery, $showLinks, $rowOffset, $showRows, $previousOffset, $nextOffset, $wrapResults, $citeStyle, $citeOrder, $citeType, $orderBy, $headerMsg, $userID, $viewType)
{
    global $contentTypeCharset; // defined in 'ini.inc.php'
    global $client;

    // search & replace patterns for conversion from refbase markup to RTF markup & entities:
    global $transtab_refbase_rtf; // defined in 'transtab_refbase_rtf.inc.php'

    // Bookkeeping arrays that are handed to (and returned by) 'generateSectionHeading()':
    $yearsArray = array();
    $typeTitlesArray = array();

    // Inline text markup to be used by the 'citeRecord()' function:
    $markupPatternsArray = array(
        "bold-prefix"        => "{\\b ",
        "bold-suffix"        => "}",
        "italic-prefix"      => "{\\i ",
        "italic-suffix"      => "}",
        "underline-prefix"   => "{\\ul ",
        "underline-suffix"   => "}",
        "endash"             => "\\endash ",
        "emdash"             => "\\emdash ",
        "ampersand"          => "&",
        "double-quote"       => '"',
        "double-quote-left"  => "\\ldblquote ",
        "double-quote-right" => "\\rdblquote ",
        "single-quote"       => "'",
        "single-quote-left"  => "\\lquote ",
        "single-quote-right" => "\\rquote ",
        "less-than"          => "<",
        "greater-than"       => ">",
        "newline"            => "\n{\\f1\\fs24 \\par}\n"
    );

    // Fields whose contents get the refbase -> RTF search & replace treatment:
    $rtfConvertedFields = array("title", "publication", "abbrev_journal", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes");
    $rtfSearchReplaceActionsArray = array(
        array('fields' => $rtfConvertedFields, 'actions' => $transtab_refbase_rtf)
    );

    // CLI clients (client name starting with "cli") may page thru the result set, so for them
    // the '$showRows' limit is honoured; everybody else gets all rows that were found:
    $showMaxRows = preg_match("/^cli/i", $client) ? $showRows : $rowsFound;

    // Setup the basic RTF document structure (RTF functions defined in 'MINIMALRTF.php'):
    $rtf = new MINIMALRTF();
    $rtfData = $rtf->openRtf(); // RTF opening tag
    $rtf->createFontBlock(0, "Arial");
    $rtf->createFontBlock(1, "Times New Roman");
    $rtfData .= $rtf->setFontBlock();

    // Header:
    if (!empty($headerMsg)) {
        // strip any colon (":") from the end of the header message:
        $headerMsg = trimTextPattern($headerMsg, ":", false, true); // function 'trimTextPattern()' is defined in 'include.inc.php'

        // decode any HTML entities in the header message:
        $headerMsg = decodeHTML($contentTypeCharset, $headerMsg); // function 'decodeHTML()' is defined in 'include.inc.php'

        // convert refbase markup in the header message into RTF markup & entities:
        $headerMsg = searchReplaceText($transtab_refbase_rtf, $headerMsg, true); // function 'searchReplaceText()' is defined in 'include.inc.php'

        $rtfData .= "{\\header\\pard\\qc {$headerMsg}\\par}\n";
    }

    // default paragraph layout: justified text with a hanging indent
    // (left indent: 0.5, right indent: 0, first-line indent: -0.5)
    $rtfData .= $rtf->justify("full", 0.5, 0, -0.5);

    // LOOP OVER EACH RECORD (at most '$showMaxRows' of them):
    for ($rowCounter = 0; $rowCounter < $showMaxRows && ($recordRow = @mysql_fetch_array($result)); $rowCounter++) {
        // Run the search & replace 'actions' on every field that is listed in an action set:
        foreach (array_keys($recordRow) as $fieldKey) {
            foreach ($rtfSearchReplaceActionsArray as $actionSet) {
                if (!in_array($fieldKey, $actionSet['fields'])) {
                    continue;
                }

                $recordRow[$fieldKey] = searchReplaceText($actionSet['actions'], $recordRow[$fieldKey], true);
            }
        }

        // Order attributes according to the chosen output style & record type:
        // (function 'citeRecord()' is defined in the citation style file)
        $record = citeRecord($recordRow, $citeStyle, $citeType, $markupPatternsArray, false);

        // nothing to print for this row:
        if (empty($record)) {
            continue;
        }

        // Print any section heading(s):
        if (preg_match("/year|type/i", $citeOrder)) {
            // empty paragraph in front of the heading, using font block 0 at 12pt:
            $headingPrefix = $rtf->justify("left", 0, 0, 0) . $rtf->paragraph(0, 12);

            // re-establish justified text with hanging indent for whatever follows the heading:
            $headingSuffix = $rtf->justify("full", 0.5, 0, -0.5);

            if ($citeOrder == "type") {
                // for 'citeOrder=type' an empty paragraph always follows the heading:
                $headingSuffix .= $rtf->paragraph(0, 12);
            }

            // Raw RTF commands are passed here (instead of using 'textBlock()' from 'MINIMALRTF.php')
            // because of a current limitation of 'generateSectionHeading()' (defined in 'cite.inc.php'):
            list($yearsArray, $typeTitlesArray, $sectionHeading) = generateSectionHeading(
                $yearsArray,
                $typeTitlesArray,
                $recordRow,
                $citeOrder,
                $headingPrefix,
                $headingSuffix,
                "{\\f0\\fs28 {\\b ",
                "}\\par}\n",
                "{\\f0\\fs24 {\\b ",
                "}\\par}\n"
            );

            $rtfData .= $sectionHeading;
        }

        // The RTF unicode encoder below is fed UTF-8, so convert first if necessary:
        if ($contentTypeCharset != "UTF-8") {
            $record = convertToCharacterEncoding("UTF-8", "IGNORE", $record); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'
        }

        // Write the record as an RTF paragraph (font block 1 at 12pt), with the UTF-8 chars
        // converted to unicode character codes by 'utf8_2_unicode()':
        $rtfData .= $rtf->textBlock(1, 12, $rtf->utf8_2_unicode($record));
    }

    $rtfData .= $rtf->closeRtf(); // RTF closing tag

    return $rtfData;
}
/**
 * Encodes a single field value for the given target format and applies
 * search & replace 'actions' to it.
 *
 * @param string $fieldName                      name of the field; checked against the 'fields' lists
 *                                               and against '$encodingExceptionsArray'
 * @param string $fieldValue                     the value to process
 * @param array  $localSearchReplaceActionsArray list of arrays with a 'fields' and an 'actions' element;
 *                                               applied for every target format
 * @param array  $encodingExceptionsArray        field names that must not be encoded
 * @param bool   $encode                         whether encoding shall be attempted at all
 * @param string $targetFormat                   "HTML" or "XML"; other values skip the encoding step
 *
 * @return mixed the processed field value
 */
function encodeField($fieldName, $fieldValue, $localSearchReplaceActionsArray = array(), $encodingExceptionsArray = array(), $encode = true, $targetFormat = "HTML")
{
    global $contentTypeCharset; // these variables are defined in 'ini.inc.php'
    global $convertExportDataToUTF8;
    global $searchReplaceActionsArray;

    $isHTMLTarget = ($targetFormat == "HTML");
    $skipEncoding = (!$encode || in_array($fieldName, $encodingExceptionsArray));

    if (!$skipEncoding) {
        if ($isHTMLTarget) {
            // HTML: encode non-ASCII chars as HTML entities
            $fieldValue = encodeHTML($fieldValue);
        } elseif ($targetFormat == "XML") {
            // XML: only convert those special chars to entities which are supported by XML ...
            $fieldValue = encodeHTMLspecialchars($fieldValue);

            // ... and convert the field data to UTF-8 if so configured:
            $needsUTF8Conversion = ($convertExportDataToUTF8 == "yes" && $contentTypeCharset != "UTF-8");

            if ($needsUTF8Conversion) {
                $fieldValue = convertToCharacterEncoding("UTF-8", "IGNORE", $fieldValue);
            }
        }
    }

    // *Locally* defined search & replace 'actions' -- applied if the field is listed
    // in the 'fields' element of an action set:
    foreach ($localSearchReplaceActionsArray as $localActionSet) {
        if (!in_array($fieldName, $localActionSet['fields'])) {
            continue;
        }

        $fieldValue = searchReplaceText($localActionSet['actions'], $fieldValue, true);
    }

    // The *globally* defined 'actions' from '$searchReplaceActionsArray' only apply to HTML output:
    if (!$isHTMLTarget) {
        return $fieldValue;
    }

    foreach ($searchReplaceActionsArray as $globalActionSet) {
        if (!in_array($fieldName, $globalActionSet['fields'])) {
            continue;
        }

        $fieldValue = searchReplaceText($globalActionSet['actions'], $fieldValue, true);
    }

    return $fieldValue;
}
function arxivToRefbase(&$feed, $importRecordsRadio, $importRecordNumbersArray) { global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php' global $contentTypeCharset; // defined in 'ini.inc.php' global $errors; global $showSource; // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data: // (Notes: - name standardization occurs after multiple author fields have been merged by '; ' // - the split pattern must be specified as perl-style regular expression (including the leading & trailing // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match)) $personDelimiter = "/ *; */"; // Pattern by which a person's family name is separated from the given name (or initials): // (the split pattern must be specified as perl-style regular expression (including the leading & trailing // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match)) $familyNameGivenNameDelimiter = "/ (?=([{$upper}]+[-{$alpha}]+)( *;|\$))/{$patternModifiers}"; // Specifies whether the person's family name comes first within a person's name // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials)) $familyNameFirst = false; // Specifies whether a person's full given name(s) shall be shortened to initial(s): // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc) // - if set to 'false', given names (and any initials) are taken as is // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output) 
$shortenGivenNames = true; // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'): $transformCase = true; // Postprocessor actions: // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element: // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.) // "/Search Pattern/" => "Replace Pattern" $postprocessorActionsArray = array(array('fields' => array("title", "abstract", "notes"), 'actions' => array("/ *[\n\r]+ */" => " ")), array('fields' => array("title"), 'actions' => array("/[,.;:!] *\$/" => ""))); // ----------------------------------------- // PROCESS SOURCE DATA: // Initialize array variables: $parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported // NOTE: We do NOT validate records yet, i.e. 
we assume that they are perfect and attempt to import all of them: $importRecordNumbersRecognizedFormatArray = array(); // initialize array variable which will hold all record numbers of those records that shall be imported AND which were of a recognized format $importRecordNumbersNotRecognizedFormatArray = array(); // same for all records that shall be imported BUT which had an UNrecognized format // Use these namespaces to retrieve tags: $atomNamespace = 'http://www.w3.org/2005/Atom'; $opensearchNamespace = 'http://a9.com/-/spec/opensearch/1.1/'; $arxivNamespace = 'http://arxiv.org/schemas/atom'; // Get feed data: $recordArray = $feed->get_items(); // fetch all feed items into an array $recordsCount = count($recordArray); // count how many records are available // ----------------------------------------- // LOOP OVER EACH RECORD: for ($i = 0; $i < $recordsCount; $i++) { $fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record) $record = $recordArray[$i]; // this will make things a bit more readable // Check for any errors: if ($record->get_title() == "Error") { $importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized $arXivError = $record->get_description(); // e.g. "incorrect id format for 1234.12345" // Prepare an appropriate error message: $errorMessage = "Record " . ($i + 1) . ": " . $arXivError . "!"; if (!isset($errors["sourceText"])) { $errors["sourceText"] = $errorMessage; } else { $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage; } } elseif (!$record->get_permalink()) { $importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized // Prepare an appropriate error message: $errorMessage = "Record " . ($i + 1) . 
": nothing found!"; if (!isset($errors["sourceText"])) { $errors["sourceText"] = $errorMessage; } else { $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage; } } else { // NOTE: We do NOT yet validate any found records, i.e. for now, we'll just assume that they are ok: $importRecordNumbersRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format IS recognized ('$i' starts with 0 so we have to add 1 to point to the correct record number) // Extract elements of the current Atom XML entry: // - type: $fieldParametersArray['type'] = 'Journal Article'; // NOTE: Are all arXiv records journal articles? TODO: find what isn't & fix the type // - id: $fieldParametersArray['notes'] = str_replace("http://arxiv.org/abs/", "arXiv:", $record->get_permalink()); // extract the arXiv ID from the abstract URL in the 'id' element & prefix it with "arXiv:" // - title: $fieldParametersArray['title'] = $record->get_title(); // - summary: if ($abstract = $record->get_description()) { $fieldParametersArray['abstract'] = $abstract; } // - author: // NOTE: If we didn't want to extract author affiliation info, we could just use standard SimplePie functions ('get_authors()' and 'get_name()') $authorsArray = array(); $addressArray = array(); $authors = $record->get_item_tags($atomNamespace, 'author'); foreach ($authors as $author) { $authorName = ""; $authorLastName = ""; $authorAddressArray = ""; if (isset($author['child'][$atomNamespace]['name']) and $authorName = $author['child'][$atomNamespace]['name'][0]['data']) { // -- name: // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1: // NOTE: For authors, we need to perform charset conversion up here (and not further down below, as is done for all the other fields), // since otherwise the below '$upper' and '$alpha' character class elements would fail to match! 
if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($authorName) == "UTF-8") { // function 'detectCharacterEncoding()' is defined in 'include.inc.php' $authorName = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorName, "UTF-8"); } // function 'convertToCharacterEncoding()' is defined in 'include.inc.php' // Change the formatting of author names to the one used by refbase, i.e. the family name comes first, and a comma separates family name & initials: // (further standardisation of person names is done in function 'standardizeFieldData()'; see also note for '$familyNameGivenNameDelimiter' above) // NOTE: With the above settings for '$familyNameGivenNameDelimiter' and '$familyNameFirst' this isn't necessary anymore // $authorName = preg_replace("/^(.+?) +([$upper]+[-$alpha]+)$/$patternModifiers", "\\2, \\1", $authorName); $authorsArray[] = $authorName; // -- arxiv:affiliation: if (isset($author['child'][$arxivNamespace]) and $authorAffiliations = $author['child'][$arxivNamespace]['affiliation']) { foreach ($authorAffiliations as $authorAffiliation) { $authorAddressArray[] = $authorAffiliation['data']; } $authorAddresses = implode(", ", $authorAddressArray); // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1: if ($contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($authorAddresses) == "UTF-8") { $authorAddresses = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorAddresses, "UTF-8"); } $authorLastName = preg_replace("/^([{$upper}]+[-{$alpha}]+).+\$/{$patternModifiers}", "\\1", $authorName); // extract authors last name $addressArray[] = $authorLastName . ": " . 
$authorAddresses; } } } if (!empty($authorsArray)) { $fieldParametersArray['author'] = implode("; ", $authorsArray); } // merge multiple authors if (!empty($addressArray)) { $fieldParametersArray['address'] = implode("; ", $addressArray); } // merge multiple author affiliations // - links: // // TODO: Currently, we just copy a link to the PDF to the 'file' field. It might be desirable to fetch the actual PDF and store it on the refbase server. // // NOTE: - In order to extract any links, we access the raw SimplePie object here; This is done since, in SimplePie v1.1.1, the standard SimplePie functions // 'get_link()' and 'get_links()' only support checking for the 'rel' attribute, but don't allow to filter on the 'type' or 'title' attribute. However, // we need to check the 'type' & 'title' attributes in order to assign PDF & DOI links to the 'file' & 'doi' fields, respectively. Alternatively, we // could also get this information from the URL itself, but that may fail if arXiv changes its URL pattern. 
// - More info on how to grab custom tags or attributes: <http://simplepie.org/wiki/tutorial/grab_custom_tags_or_attributes> $links = $record->get_item_tags($atomNamespace, 'link'); foreach ($links as $link) { if (isset($link['attribs']['']['href'])) { // -- file: if (!isset($fieldParametersArray['file']) and isset($link['attribs']['']['title']) and $link['attribs']['']['title'] == "pdf") { // we could also check for 'type="application/pdf"' $fieldParametersArray['file'] = $link['attribs']['']['href']; } elseif (!isset($fieldParametersArray['url']) and isset($link['attribs']['']['type']) and $link['attribs']['']['type'] == "text/html") { // we could also check for 'title' being unset $fieldParametersArray['url'] = $link['attribs']['']['href']; } elseif (!isset($fieldParametersArray['doi']) and isset($link['attribs']['']['title']) and $link['attribs']['']['title'] == "doi") { $fieldParametersArray['doi'] = str_replace("http://dx.doi.org/", "", $link['attribs']['']['href']); } } } // - arxiv:comment: if ($comment = $record->get_item_tags($arxivNamespace, 'comment')) { $fieldParametersArray['notes'] .= "; " . $comment[0]['data']; } // TODO: if arXiv records can include multiple comments, we'd need to loop over all of them // - arxiv:primary_category: // TODO: Should we copy the term given in the 'arxiv:primary_category' element to the 'area' field? 
// - arxiv:category: $categoriesArray = array(); foreach ($record->get_categories() as $category) { $categoriesArray[] = $category->get_label(); } if (!empty($categoriesArray)) { $fieldParametersArray['keywords'] = implode("; ", $categoriesArray); } // merge multiple categories // - arxiv:journal_ref: if ($journalRef = $record->get_item_tags($arxivNamespace, 'journal_ref')) { // We extract the full 'journal_ref' string into its own variable since we're going to mess with it: $journalRefData = preg_replace("/ *[\n\r]+ */", " ", $journalRef[0]['data']); // transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space // NOTE: The formatting of the 'journal_ref' string can vary heavily, so // the below parsing efforts may fail. Therefore, we'll also copy the // original 'journal_ref' string to the 'notes' field, and display it // in the header message when importing single records. $fieldParametersArray['source'] = $journalRefData; $fieldParametersArray['notes'] .= "; Journal Ref: " . $journalRefData; // Extract source info from the 'journal_ref' string into the different fields: // NOTE: We try to use reasonably liberal (and thus rather ugly!) regex patterns // which should catch most of the commonly used formatting styles. However, // as noted above, due to the varying formatting of the 'journal_ref' string, // this may not be always entirely successful. // TODO: Extract ISSN from the 'journal_ref' string (see e.g. 
'arXiv:cond-mat/0506611v1') // -- journal: $journalName = preg_replace("/^(.+?)(?= *(\\(?\\d+|[,;]|(v(ol)?\\.?|volume) *\\d+|\$)).*/i", "\\1", $journalRefData); // extract journal name $journalRefData = preg_replace("/^(.+?)(?= *(\\(?\\d+|[,;]|(v(ol)?\\.?|volume) *\\d+|\$))[,; ]*/i", "", $journalRefData); // remove journal name from 'journal_ref' string if (preg_match("/\\./", $journalName)) { $fieldParametersArray['abbrev_journal'] = preg_replace("/(?<=\\.)(?![ )]|\$)/", " ", $journalName); } else { $fieldParametersArray['publication'] = $journalName; } // -- volume: // NOTE: The volume is assumed to be the first number that follows the journal name, and // which is followed by another four-digit number (which is asssumed to be the year). if (preg_match("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4})/i", $journalRefData)) { $fieldParametersArray['volume'] = preg_replace("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4}).*/i", "\\1", $journalRefData); // extract volume $journalRefData = preg_replace("/^(?:(?:v(?:ol)?\\.?|volume) *)?(\\w*\\d+\\w*)(?= *.*?\\d{4})[,; ]*/i", "", $journalRefData); // remove volume from 'journal_ref' string } // -- year (take 1): // NOTE: For the first take, we assume the year to be the first occurrence of a four-digit number // that's wrapped in parentheses. if (preg_match("/\\(\\d{4}\\)/i", $journalRefData)) { $fieldParametersArray['year'] = preg_replace("/^.*?\\((\\d{4})\\).*?\$/i", "\\1", $journalRefData); // extract year $journalRefData = preg_replace("/[,; ]*\\(\\d{4}\\)[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string } // -- issue: // NOTE: The issue is only recognized if it is preceded with a "n/no/number" prefix, or if it is a // number with less than four digits that is enclosed in parentheses (we can check for the latter // case since four-digit years that are wrapped in parens have already been removed). 
The regex // patterns below also try to account for some non-digit characters in issue numbers. // TODO: Support issue extraction from "Journal Vol:No ..." format (see e.g. 'arXiv:cond-mat/0703452v2') if (preg_match("/(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\)/i", $journalRefData)) { $fieldParametersArray['issue'] = preg_replace("/^.*?(?:(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\)).*?\$/i", "\\1\\2", $journalRefData); // extract issue $journalRefData = preg_replace("/[,; ]*(?:(?:(?:n\\.|no\\.?|number) *)(\\w*[\\d\\/-]+\\w*)|\\((\\w*(?:\\d{1,3}|\\d{1,2}[\\/-]+\\d{1,2})\\w*)\\))[,; ]*/i", "", $journalRefData); // remove issue from 'journal_ref' string } // -- pages (take 1): // NOTE: For the first take, we assume the pages to be either preceded with a "p/pp" prefix, or to // be a page range. if (preg_match("/(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)/i", $journalRefData)) { $fieldParametersArray['startPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)).*?\$/i", "\\1\\3", $journalRefData); // extract starting page $fieldParametersArray['endPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*)).*?\$/i", "\\2\\4", $journalRefData); // extract ending page $journalRefData = preg_replace("/[,; ]*(?:(?:p(?:p)?\\.? *)(\\w*\\d+\\w*)(?: *-+ *(\\w*\\d+\\w*))?|(?:p(?:p)?\\.? *)?(\\w*\\d+\\w*) *-+ *(\\w*\\d+\\w*))[,; ]*/i", "", $journalRefData); // remove page info from 'journal_ref' string } // -- year (take 2): // NOTE: For the second take, we assume the year to be the first occurrence of any four-digit number // in the remaining 'journal_ref' string. 
if (!isset($fieldParametersArray['year']) and preg_match("/\\b\\d{4}\\b/i", $journalRefData)) { $fieldParametersArray['year'] = preg_replace("/^.*?\\b(\\d{4})\\b.*?\$/i", "\\1", $journalRefData); // extract year $journalRefData = preg_replace("/[,; ]*\\b\\d{4}\\b[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string } // -- pages (take 2): // NOTE: For the second take, we assume the page info to be any number that is at the beginning of // the remaining 'journal_ref' string. if (!isset($fieldParametersArray['startPage']) and preg_match("/^[,; ]*\\w*\\d+\\w*/i", $journalRefData)) { $fieldParametersArray['startPage'] = preg_replace("/^[,; ]*(\\w*\\d+\\w*).*?\$/i", "\\1", $journalRefData); // extract page info } } // Standardize field data contained in '$fieldParametersArray': foreach ($fieldParametersArray as $fieldKey => $fieldData) { // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1: // (we exclude the 'author' and 'address' fields here since they have already been dealt with above) if (!preg_match("/^(author|address)\$/", $fieldKey) and $contentTypeCharset == "ISO-8859-1" and detectCharacterEncoding($fieldData) == "UTF-8") { $fieldData = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $fieldData, "UTF-8"); } // Decode HTML special chars: if ($fieldKey != "url" and preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData)) { $fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData); } elseif ($fieldKey == "url" and preg_match('/&/', $fieldData)) { // in case of the 'url' field, we just decode any ampersand characters $fieldParametersArray[$fieldKey] = str_replace('&', '&', $fieldData); } } // Function 'standardizeFieldData()' e.g. 
performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference: $fieldParametersArray = standardizeFieldData($fieldParametersArray, "arXiv XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray); // Append the array of extracted field data to the main data array which holds all records to import: $parsedRecordsArray[] = $fieldParametersArray; } } // ----------------------------------------- // Build refbase import array: $importDataArray = buildImportArray("refbase", "1.0", "http://refbase.net/import/arxiv/", "Matthias Steffens", "*****@*****.**", array('prefix_call_number' => "true"), $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields) return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors); }
// Returns the first non-empty value from '$row' among the given candidate field name(s).
//   $row             - record data, keyed by refbase field name
//   $candidateFields - either a single refbase field name, or an array of field names
//                      that are tried in the given order
// Returns NULL if none of the candidate fields holds a non-empty value.
function odfFirstFilledRowValue($row, $candidateFields)
{
	if (!is_array($candidateFields))
		$candidateFields = array($candidateFields);

	foreach ($candidateFields as $candidateField)
	{
		if (!empty($candidateField) and !empty($row[$candidateField]))
			return $row[$candidateField];
	}

	return null;
}

// Parses a single refbase record into an array of field data that is keyed by ODF index name.
//   $row                                    - record data, keyed by refbase field name
//   $odfIndexesToRefbaseFieldsArray         - maps ODF index names to refbase field names; an element can be:
//                                               - empty (unsupported ODF index -> empty string)
//                                               - a refbase field name
//                                               - an array keyed by reference type whose values are a field
//                                                 name or a list of field names; the special keys 'Other'
//                                                 and 'Any' provide defaults (see below)
//   $referenceTypesToRefbaseTypesArray      - maps refbase type names to ODF type numbers
//   $universalSearchReplaceActionsArray     - search & replace actions that are applied to every non-empty field
//   $fieldSpecificSearchReplaceActionsArray - list of arrays with a 'fields' element (ODF index names)
//                                             and an 'actions' element (search & replace actions)
// Returns the array of field data. Every ODF index given in '$odfIndexesToRefbaseFieldsArray' is present
// in the returned array (with an empty string if no data were found).
function parseRecord($row, $odfIndexesToRefbaseFieldsArray, $referenceTypesToRefbaseTypesArray, $universalSearchReplaceActionsArray, $fieldSpecificSearchReplaceActionsArray)
{
	global $officialDatabaseName; // these variables are defined in 'ini.inc.php'
	global $databaseBaseURL;
	global $contentTypeCharset;
	global $convertExportDataToUTF8;

	$fieldParametersArray = array();

	// map the names of the '$row' array keys to those used by the '$formVars' array
	// (which is required by function 'generateCiteKey()'):
	$formVars = buildFormVarsArray($row); // function 'buildFormVarsArray()' is defined in 'include.inc.php'

	// generate or extract the cite key for this record:
	// (this must happen before any charset conversion is applied to the field data below)
	$citeKey = generateCiteKey($formVars); // function 'generateCiteKey()' is defined in 'include.inc.php'

	// PARSE RECORD:

	// walk the ODF index -> refbase field mapping and fetch the matching row data for each ODF index:
	foreach ($odfIndexesToRefbaseFieldsArray as $odfIndex => $fieldMapping)
	{
		$fieldData = null; // NULL means "nothing found (yet)"

		if (!empty($fieldMapping)) // an empty mapping denotes an unsupported ODF index
		{
			if (!is_array($fieldMapping))
			{
				// plain mapping: copy data from the given refbase field
				$fieldData = odfFirstFilledRowValue($row, $fieldMapping);
			}
			else
			{
				// type-specific mapping: the refbase field(s) to use depend on the record's reference type
				$typeWasSpecified = false;

				foreach ($fieldMapping as $referenceType => $candidateFields)
				{
					if ($row['type'] == $referenceType)
					{
						$typeWasSpecified = true;
						$fieldData = odfFirstFilledRowValue($row, $candidateFields);
						break;
					}
				}

				// 'Other' is used as default for all refbase types that were NOT explicitly specified:
				if (!$typeWasSpecified and isset($fieldMapping['Other']))
					$fieldData = odfFirstFilledRowValue($row, $fieldMapping['Other']);

				// if this ODF field is still not set, 'Any' is used as default, no matter whether
				// any refbase types were specified explicitly or not:
				if ($fieldData === null and isset($fieldMapping['Any']))
					$fieldData = odfFirstFilledRowValue($row, $fieldMapping['Any']);
			}
		}

		// if no data were found for this ODF index, provide an empty string:
		$fieldParametersArray[$odfIndex] = ($fieldData === null) ? "" : $fieldData;
	}

	// POST-PROCESS FIELD DATA:

	// currently, we'll always overwrite the record serial in the 'Identifier' field with the generated cite key:
	// (this means that NO identifier will be exported if you've unchecked the export option "Include cite keys on export")
	$fieldParametersArray['Identifier'] = $citeKey;

	// convert refbase type names into ODF type numbers:
	// (a missing 'BibliographyType' index or an unknown refbase type yields an empty string
	//  instead of triggering an "undefined index" error)
	$refbaseType = isset($fieldParametersArray['BibliographyType']) ? $fieldParametersArray['BibliographyType'] : "";

	if (isset($referenceTypesToRefbaseTypesArray[$refbaseType]))
		$fieldParametersArray['BibliographyType'] = $referenceTypesToRefbaseTypesArray[$refbaseType];
	else
		$fieldParametersArray['BibliographyType'] = "";

	// for theses, set the correct ODF type:
	if (!empty($row['thesis']))
	{
		if ($row['thesis'] == "Ph.D. thesis" or $row['thesis'] == "Doctoral thesis")
			$fieldParametersArray['BibliographyType'] = "11"; // presumably the ODF dissertation type -- TODO confirm
		else
			$fieldParametersArray['BibliographyType'] = "9"; // Thesis

		// append the thesis type to any existing annotation:
		// NOTE: every mapped ODF index has been set to (at least) an empty string above, so we must test
		//       for actual content here; a plain 'isset()' check would produce a leading "; " separator
		if (isset($fieldParametersArray['Annote']) and $fieldParametersArray['Annote'] !== "")
			$fieldParametersArray['Annote'] .= "; " . $row['thesis'];
		else
			$fieldParametersArray['Annote'] = $row['thesis'];
	}

	// if a DOI was copied to the URL field, we'll need to add the DOI resolver:
	if (!empty($row['doi']) and isset($fieldParametersArray['URL']) and preg_match("/^\\d{2}\\.\\d{4}\\//", $fieldParametersArray['URL']))
		$fieldParametersArray['URL'] = "http://dx.doi.org/" . $fieldParametersArray['URL'];

	// use the series volume as volume if 'series_volume' contains some info, but 'volume' doesn't:
	if (empty($row['volume']) and !empty($row['series_volume']))
		$fieldParametersArray['Volume'] = $row['series_volume'];

	// set the fourth ODF custom field to a refbase database attribution string and the database URL:
	$fieldParametersArray['Custom4'] = "exported from " . $officialDatabaseName . " (" . $databaseBaseURL . ")";

	// set the fifth ODF custom field to the record's permanent database URL:
	$fieldParametersArray['Custom5'] = $databaseBaseURL . "show.php?record=" . $row['serial'];

	// apply universal search & replace actions, encode special chars and charset conversions to every field that shall be exported:
	$convertToUTF8 = ($convertExportDataToUTF8 == "yes" and $contentTypeCharset != "UTF-8");

	foreach ($fieldParametersArray as $fieldName => $fieldValue)
	{
		if (empty($fieldValue))
			continue; // nothing to process

		// perform universal search & replace actions:
		if (!empty($universalSearchReplaceActionsArray))
			$fieldValue = searchReplaceText($universalSearchReplaceActionsArray, $fieldValue, true); // function 'searchReplaceText()' is defined in 'include.inc.php'

		// we only convert those special chars to entities which are supported by XML:
		$fieldValue = encodeHTMLspecialchars($fieldValue); // function 'encodeHTMLspecialchars()' is defined in 'include.inc.php'

		// convert field data to UTF-8 (if '$convertExportDataToUTF8' is set to "yes" in 'ini.inc.php' and character encoding is not UTF-8 already):
		// (note that charset conversion can only be done *after* the cite key has been generated, otherwise cite key generation will produce garbled text!)
		if ($convertToUTF8)
			$fieldValue = convertToCharacterEncoding("UTF-8", "IGNORE", $fieldValue); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'

		$fieldParametersArray[$fieldName] = $fieldValue;
	}

	// apply field-specific search & replace 'actions' to all fields that are listed in the 'fields' element
	// of the arrays contained in '$fieldSpecificSearchReplaceActionsArray':
	foreach ($fieldSpecificSearchReplaceActionsArray as $fieldActionsArray)
	{
		foreach ($fieldParametersArray as $fieldName => $fieldValue)
		{
			if (in_array($fieldName, $fieldActionsArray['fields']))
				$fieldParametersArray[$fieldName] = searchReplaceText($fieldActionsArray['actions'], $fieldValue, true); // function 'searchReplaceText()' is defined in 'include.inc.php'
		}
	}

	return $fieldParametersArray;
}
// Creates the Atom 'feed' root element, fills it with the feed-level tags (title, subtitle,
// updated, author, generator, icon, logo & the OpenSearch/unAPI links) and returns it.
//   $atomOperation - if set to "Error", a fixed "Search error!" subtitle is used and the
//                    database name is not converted to UTF-8; for any other value, the
//                    subtitle is derived from the 'WHERE' clause of the global '$query'
// Returns the XML object that represents the 'feed' element.
function atomGenerateBaseTags($atomOperation)
{
	// these variables are specified in 'ini.inc.php' (except for '$query'):
	global $officialDatabaseName, $databaseBaseURL, $feedbackEmail,
	       $contentTypeCharset, $convertExportDataToUTF8,
	       $logoImageURL, $faviconImageURL,
	       $query;

	$isErrorFeed = ($atomOperation == "Error");

	// ----------------------------------------------------------

	// Create the root element and declare all namespaces that are used within the feed:
	// NOTE: is the unAPI namespace ok? Or should we use "http://unapi.info/specs/", or maybe something like "http://purl.org/unapi/ns/" ?
	$namespaceAttributes = array(
		"xmlns"            => "http://www.w3.org/2005/Atom",
		"xmlns:opensearch" => "http://a9.com/-/spec/opensearch/1.1/",
		"xmlns:unapi"      => "http://unapi.info/",
		"xmlns:dc"         => "http://purl.org/dc/elements/1.1/",
		"xmlns:dcterms"    => "http://purl.org/dc/terms/",
		"xmlns:prism"      => "http://prismstandard.org/namespaces/1.2/basic/"
	);

	$feed = new XML("feed");

	foreach ($namespaceAttributes as $attributeName => $namespaceURI)
		$feed->setTagAttribute($attributeName, $namespaceURI);

	// Prepare the database name for inclusion in the feed:
	$feedName = encodeHTMLspecialchars($officialDatabaseName); // function 'encodeHTMLspecialchars()' is defined in 'include.inc.php'

	// - convert database name to UTF-8 (only for non-error feeds, and only if '$convertExportDataToUTF8'
	//   is set to "yes" in 'ini.inc.php' and character encoding is not UTF-8 already):
	if (!$isErrorFeed and $convertExportDataToUTF8 == "yes" and $contentTypeCharset != "UTF-8")
		$feedName = convertToCharacterEncoding("UTF-8", "IGNORE", $feedName); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'

	// ----------------------------------------------------------

	// Add feed-level tags:
	// (not yet used: category, contributor, rights)

	// - 'title':
	addNewBranch($feed, "title", array("type" => "text"), $feedName);

	// - 'subtitle':
	if ($isErrorFeed)
	{
		$feedSubtitle = "Search error!";
	}
	else
	{
		// construct a feed description from a natural-language version (well, sort of) of the query's 'WHERE' clause:
		// TODO: For Atom XML, the query string should not get HTML encoded!
		$whereClause = extractWHEREclause($query); // function 'extractWHEREclause()' is defined in 'include.inc.php'
		$feedSubtitle = "Displays records where " . encodeHTML(explainSQLQuery($whereClause)); // functions 'encodeHTML()' and 'explainSQLQuery()' are defined in 'include.inc.php'
	}

	addNewBranch($feed, "subtitle", array(), $feedSubtitle);

	// - 'updated':
	//   (TODO: the timestamp in the 'updated' element should really only get updated if any of the matching records was updated, right?)
	addNewBranch($feed, "updated", array(), generateISO8601TimeStamp()); // function 'generateISO8601TimeStamp()' is defined in 'include.inc.php'

	// - 'author':
	$feedAuthor = new XMLBranch("author");
	$feedAuthor->setTagContent($feedName, "author/name");
	$feedAuthor->setTagContent($feedbackEmail, "author/email");
	$feedAuthor->setTagContent($databaseBaseURL, "author/uri");
	$feed->addXMLBranch($feedAuthor);

	// - 'generator', 'icon', 'logo':
	addNewBranch($feed, "generator", array("uri" => "http://www.refbase.net/", "version" => "0.9.5"), "Web Reference Database (http://refbase.sourceforge.net)");
	addNewBranch($feed, "icon", array(), $databaseBaseURL . $faviconImageURL);
	addNewBranch($feed, "logo", array(), $databaseBaseURL . $logoImageURL);

	// - 'link' (more links will be added in function 'atomCollection()'):
	//   - link to OpenSearch Description file:
	atomLink($feed, $databaseBaseURL . "opensearch.php?operation=explain", "search", "OpenSearch", $feedName);

	//   - link to unAPI server:
	atomLink($feed, $databaseBaseURL . "unapi.php", "unapi:unapi-server", "unAPI", "unAPI");

	return $feed;
}
function oaidcRecord($row, $metadataPrefix = "oai_dc", $addNameSpaceInfo = true) { global $databaseBaseURL; // these variables are defined in 'ini.inc.php' global $contentTypeCharset; global $fileVisibility; global $fileVisibilityException; global $filesBaseURL; global $convertExportDataToUTF8; global $defaultCiteStyle; global $citeStyle; global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php' // The array '$transtab_refbase_unicode' contains search & replace patterns for conversion from refbase markup to Unicode entities. global $transtab_refbase_unicode; // defined in 'transtab_refbase_unicode.inc.php' // The array '$transtab_refbase_ascii' contains search & replace patterns for conversion from refbase markup to plain text. global $transtab_refbase_ascii; // defined in 'transtab_refbase_ascii.inc.php' // Define inline text markup to generate a plain text citation string: // (to be included within a 'dcterms:bibliographicCitation' element) $markupPatternsArrayPlain = array("bold-prefix" => "", "bold-suffix" => "", "italic-prefix" => "", "italic-suffix" => "", "underline-prefix" => "", "underline-suffix" => "", "endash" => "-", "emdash" => "-", "ampersand" => "&", "double-quote" => '"', "double-quote-left" => '"', "double-quote-right" => '"', "single-quote" => "'", "single-quote-left" => "'", "single-quote-right" => "'", "less-than" => "<", "greater-than" => ">", "newline" => "\n"); // This is a stupid hack that maps the names of the '$row' array keys to those used // by the '$formVars' array (which is required by function 'generateCiteKey()') // (eventually, the '$formVars' array should use the MySQL field names as names for its array keys) $formVars = buildFormVarsArray($row); // function 'buildFormVarsArray()' is defined in 'include.inc.php' // Generate or extract the cite key for this record: // (to be included 
within a 'dc:identifier' element) $citeKey = generateCiteKey($formVars); // function 'generateCiteKey()' is defined in 'include.inc.php' // Generate OpenURL data: // (to be included within a 'dc:identifier' element) $openURL = openURL($row, "openurl:"); // function 'openURL()' is defined in 'openurl.inc.php' // Encode special chars and perform charset conversions: foreach ($row as $rowFieldName => $rowFieldValue) { // We only convert those special chars to entities which are supported by XML: // function 'encodeHTMLspecialchars()' is defined in 'include.inc.php' $row[$rowFieldName] = encodeHTMLspecialchars($row[$rowFieldName]); // Convert field data to UTF-8: // (if '$convertExportDataToUTF8' is set to "yes" in 'ini.inc.php' and character encoding is not UTF-8 already) // (Note that charset conversion can only be done *after* the cite key has been generated, otherwise cite key // generation will produce garbled text!) // function 'convertToCharacterEncoding()' is defined in 'include.inc.php' if ($convertExportDataToUTF8 == "yes" and $contentTypeCharset != "UTF-8") { $row[$rowFieldName] = convertToCharacterEncoding("UTF-8", "IGNORE", $row[$rowFieldName]); } } // Defines field-specific search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element: // (If you don't want to perform any search and replace actions, specify an empty array, like: '$fieldSpecificSearchReplaceActionsArray = array();'. // Note that the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.) 
// "/Search Pattern/" => "Replace Pattern" $fieldSpecificSearchReplaceActionsArray = array(); if ($convertExportDataToUTF8 == "yes") { $fieldSpecificSearchReplaceActionsArray[] = array('fields' => array("title", "publication", "abbrev_journal", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), 'actions' => $transtab_refbase_unicode); } // Apply field-specific search & replace 'actions' to all fields that are listed in the 'fields' element of the arrays contained in '$fieldSpecificSearchReplaceActionsArray': foreach ($fieldSpecificSearchReplaceActionsArray as $fieldActionsArray) { foreach ($row as $rowFieldName => $rowFieldValue) { if (in_array($rowFieldName, $fieldActionsArray['fields'])) { $row[$rowFieldName] = searchReplaceText($fieldActionsArray['actions'], $rowFieldValue, true); } } } // function 'searchReplaceText()' is defined in 'include.inc.php' // Fetch the name of the citation style file that's associated with the style given in '$citeStyle': $citeStyleFile = getStyleFile($citeStyle); // function 'getStyleFile()' is defined in 'include.inc.php' if (empty($citeStyleFile)) { $citeStyle = $defaultCiteStyle; // if the given cite style could not be found, we'll use the default cite style which is defined by the '$defaultCiteStyle' variable in 'ini.inc.php' $citeStyleFile = getStyleFile($citeStyle); } // Include the found citation style file *once*: include_once "cite/" . 
$citeStyleFile; // Generate a proper citation for this record, ordering attributes according to the chosen output style & record type: // - Plain text version of citation string: $recordCitationPlain = citeRecord($row, $citeStyle, "", $markupPatternsArrayPlain, false); // function 'citeRecord()' is defined in the citation style file given in '$citeStyleFile' (which, in turn, must reside in the 'styles' directory of the refbase root directory) // Convert any refbase markup that remains in the citation string (such as _italic_ or **bold**) to plain text: $recordCitationPlain = searchReplaceText($transtab_refbase_ascii, $recordCitationPlain, true); // Convert any remaining refbase markup in the 'title', 'keywords' & 'abstract' fields to plain text: $row['title'] = searchReplaceText($transtab_refbase_ascii, $row['title'], true); $row['keywords'] = searchReplaceText($transtab_refbase_ascii, $row['keywords'], true); $row['abstract'] = searchReplaceText($transtab_refbase_ascii, $row['abstract'], true); // Strip any " (ed)" or " (eds)" suffix from author/editor string: if (preg_match("/ *\\(eds?\\)\$/", $row['author'])) { $row['author'] = preg_replace("/[ \r\n]*\\(eds?\\)/i", "", $row['author']); } if (preg_match("/ *\\(eds?\\)\$/", $row['editor'])) { $row['editor'] = preg_replace("/[ \r\n]*\\(eds?\\)/i", "", $row['editor']); } // Include a link to any corresponding file if one of the following conditions is met: // - the variable '$fileVisibility' (defined in 'ini.inc.php') is set to 'everyone' // - the variable '$fileVisibility' is set to 'login' AND the user is logged in // - the variable '$fileVisibility' is set to 'user-specific' AND the 'user_permissions' session variable contains 'allow_download' // - the array variable '$fileVisibilityException' (defined in 'ini.inc.php') contains a pattern (in array element 1) that matches the contents of the field given (in array element 0) // // TODO: - the URL-generating code should be made into a dedicated function (since it's 
shared with 'modsxml.inc.php' and 'atomxml.inc.php') $printURL = false; if ($fileVisibility == "everyone" or $fileVisibility == "login" and isset($_SESSION['loginEmail']) or $fileVisibility == "user-specific" and (isset($_SESSION['user_permissions']) and preg_match("/allow_download/", $_SESSION['user_permissions'])) or !empty($fileVisibilityException) and preg_match($fileVisibilityException[1], $row[$fileVisibilityException[0]])) { if (!empty($row['file'])) { if (preg_match('#^(https?|ftp|file)://#i', $row['file'])) { $URLprefix = ""; // we don't alter the URL given in the 'file' field } else { // use the base URL of the standard files directory as prefix: if (preg_match('#^/#', $filesBaseURL)) { // absolute path -> file dir is located outside of refbase root dir $URLprefix = 'http://' . $_SERVER['HTTP_HOST'] . $filesBaseURL; } else { // relative path -> file dir is located within refbase root dir $URLprefix = $databaseBaseURL . $filesBaseURL; } } $printURL = true; } } // ---------------------------------------------------------- // Start OAI_DC XML record: if (!empty($metadataPrefix)) { $recordPrefix = $metadataPrefix . ":"; } $record = new XML($recordPrefix . 
"dc"); // create an XML object for a single record if ($addNameSpaceInfo) { if ($metadataPrefix == "oai_dc") { $record->setTagAttribute("xmlns:oai_dc", "http://www.openarchives.org/OAI/2.0/oai_dc/"); } elseif ($metadataPrefix == "srw_dc") { $record->setTagAttribute("xmlns:srw_dc", "info:srw/schema/1/dc-v1.1"); } $record->setTagAttribute("xmlns:dc", "http://purl.org/dc/elements/1.1/"); if ($metadataPrefix == "oai_dc") { $record->setTagAttribute("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"); $record->setTagAttribute("xsi:schemaLocation", "http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"); } elseif ($metadataPrefix == "srw_dc") { $record->setTagAttribute("xmlns:prism", "http://prismstandard.org/namespaces/1.2/basic/"); } } // Add Dublin Core elements: // NOTE: With a few exceptions, we try to adhere to the guidelines given at // "Using simple Dublin Core to describe eprints" by Andy Powell et al. // See: <http://eprints-uk.rdn.ac.uk/project/docs/simpledc-guidelines/> // - 'dc:title': if (!empty($row['title'])) { addMetaElement($record, "dc", "title", array(), $row['title']); } // function 'addMetaElement()' is defined in 'webservice.inc.php' // - 'dc:creator': if (!empty($row['author']) and $row['author'] != $row['editor']) { addMetaElement($record, "dc", "creator", array(), $row['author']); } // - 'dc:creator': // TODO: add refbase corporate author(s) as 'dc:creator' // - 'dc:contributor': if (!empty($row['editor'])) { addMetaElement($record, "dc", "contributor", array(), $row['editor']); } // - 'dc:description': if (!empty($row['abstract'])) { addMetaElement($record, "dc", "description", array(), $row['abstract']); } // - 'dc:identifier': // - DOI: if (!empty($row['doi'])) { addMetaElement($record, "dc", "identifier", array(), $row['doi'], "doi"); } // - PMID: if (!empty($row['notes']) and preg_match("/PMID *: *\\d+/i", $row['notes'])) { addMetaElement($record, "dc", "identifier", array(), $row['notes'], 
"pmid"); } // - arXiv: if (!empty($row['notes']) and preg_match("/arXiv *: *[^ ;]+/i", $row['notes'])) { addMetaElement($record, "dc", "identifier", array(), $row['notes'], "arxiv"); } // - ISBN: if (!empty($row['isbn'])) { addMetaElement($record, "dc", "identifier", array(), $row['isbn'], "isbn"); } // - OpenURL: addMetaElement($record, "dc", "identifier", array(), $openURL, "openurl"); // - refbase ID: addMetaElement($record, "dc", "identifier", array(), $databaseBaseURL . generateURL("show.php", "html", array("record" => $row['serial']), true), "url"); // - Cite key: addMetaElement($record, "dc", "identifier", array(), $citeKey, "citekey"); // - Bibliographic citation: // NOTE: In 'atomxml.inc.php', the bibliographic citation is put into a // 'dcterms:bibliographicCitation' element so that it can be uniquely // identified and extracted easily. However, in case of simple Dublin // Core output, we just put it into a 'dc:identifier' element and // use a "citation:" prefix. addMetaElement($record, "dc", "identifier", array(), encodeHTMLspecialchars($recordCitationPlain), "citation"); // - 'dc:source': // NOTE: - In <http://eprints-uk.rdn.ac.uk/project/docs/simpledc-guidelines/>, // Andy Powell et al. recommend that this element should NOT be used! // However, we use 'dc:source' elements for publication & series info // (publication/series title plus volume & issue) to provide a dedicated // source string that's easily readable and parsable. // Example: <dc:source>Polar Biology, Vol. 25, No. 10</dc:source> // - While we could also append the page info to the publication // 'dc:source' element, this info is more pertinent to the article // itself and is thus not included. For 'srw_dc:dc' output, page info is // included in PRISM elements (see below). // - All metadata (including the page info) are also provided as a machine // parsable citation in form of an OpenURL ContextObject (see above). 
// - Publication info: // NOTE: We only include the 'dc:source' element for 'oai_dc:dc' output. In case of 'srw_dc:dc' // output, we use the more fine-grained PRISM elements instead (see below) if ($metadataPrefix == "oai_dc" and (!empty($row['publication']) or !empty($row['abbrev_journal']))) { if (!empty($row['publication'])) { $source = $row['publication']; } elseif (!empty($row['abbrev_journal'])) { $source = $row['abbrev_journal']; } if (!empty($row['volume'])) { $source .= ", Vol. " . $row['volume']; } if (!empty($row['issue'])) { $source .= ", No. " . $row['issue']; } if (!empty($source)) { addMetaElement($record, "dc", "source", array(), $source); } } // - Series info: if (!empty($row['series_title']) or !empty($row['abbrev_series_title'])) { if (!empty($row['series_title'])) { $series = $row['series_title']; } elseif (!empty($row['abbrev_series_title'])) { $series = $row['abbrev_series_title']; } if (!empty($row['series_volume'])) { $series .= ", Vol. " . $row['series_volume']; } if (!empty($row['series_issue'])) { $series .= ", No. " . $row['series_issue']; } if (!empty($series)) { addMetaElement($record, "dc", "source", array(), $series); } // NOTE: To distinguish between regular publication & series info, // should we better use a "series:" prefix here? 
If so, use: // addMetaElement($record, "dc", "source", array(), $series, "series"); } // - ISSN: // NOTE: for 'srw_dc:dc' output, we put the ISSN into the 'prism:issn' element if ($metadataPrefix == "oai_dc" and !empty($row['issn'])) { addMetaElement($record, "dc", "source", array(), $row['issn'], "issn"); } // - 'dc:date': if (!empty($row['year'])) { addMetaElement($record, "dc", "date", array(), $row['year']); } // - 'dc:type': if (!empty($row['type'])) { addMetaElement($record, "dc", "type", array(), $row['type'], $row['thesis']); } // In case of a thesis, we add another 'dc:type' element with the actual thesis type: if (!empty($row['thesis'])) { addMetaElement($record, "dc", "type", array(), $row['thesis']); } // - 'dc:format': // TODO: ideally, we should parse the content of the refbase 'medium' field and map it // to a media-type term from <http://www.iana.org/assignments/media-types/> if (!empty($row['medium'])) { $mediaType = $row['medium']; } else { $mediaType = "text"; } addMetaElement($record, "dc", "format", array(), $mediaType); // - 'dc:subject': // TODO: add user-specific keywords (from field 'user_keys') if the user is logged in if (!empty($row['keywords'])) { addMetaElement($record, "dc", "subject", array(), $row['keywords']); } // - 'dc:coverage': // TODO: should we add contents from the refbase 'area' field as 'dc:coverage' element(s)? // - 'dc:relation': // - Related URL: if (!empty($row['url'])) { addMetaElement($record, "dc", "relation", array(), $row['url'], "url"); } // - Related FILE: if ($printURL) { addMetaElement($record, "dc", "relation", array(), $URLprefix . $row['file'], "file"); } // - 'dc:publisher': if (!empty($row['publisher'])) { addMetaElement($record, "dc", "publisher", array(), $row['publisher']); } // - 'dc:language': // TODO: convert to ISO notation (i.e. 
"en" instead of "English", etc) if (!empty($row['language'])) { addMetaElement($record, "dc", "language", array(), $row['language']); } // ---------------------------------------------------------- // Add PRISM elements: // NOTE: When using the 'srw_dc' namespace (i.e. 'info:srw/schema/1/dc-v1.1' as detailed at // <http://www.loc.gov/standards/sru/resources/dc-schema.html>), I don't think it's allowed // to include anything but the fifteen elements from simple Dublin Core. Is this correct? // If so, then: // // TODO: Do we need to put the PRISM elements in <extraRecordData> instead? Or can we put them within // a separate branch outside of (and next to) the '<srw_dc:dc>' element? Or shall we better omit // them entirely? // More info on SRU Extra Data>: <http://www.loc.gov/standards/sru/specs/extra-data.html> // // See also "Mixing DC metadata with other metadata schemas" in "Guidelines for implementing // Dublin Core in XML" <http://dublincore.org/documents/dc-xml-guidelines/> if ($metadataPrefix == "srw_dc") { // - 'prism:issn': if (!empty($row['issn'])) { addMetaElement($record, "prism", "issn", array(), $row['issn']); } // - 'prism:publicationName': if (!empty($row['publication'])) { addMetaElement($record, "prism", "publicationName", array(), $row['publication']); } elseif (!empty($row['abbrev_journal'])) { addMetaElement($record, "prism", "publicationName", array(), $row['abbrev_journal']); } // - 'prism:publicationDate': if (!empty($row['year'])) { addMetaElement($record, "prism", "publicationDate", array(), $row['year']); } // - 'prism:volume': if (!empty($row['volume'])) { addMetaElement($record, "prism", "volume", array(), $row['volume']); } // - 'prism:number': if (!empty($row['issue'])) { addMetaElement($record, "prism", "number", array(), $row['issue']); } // - 'prism:startingPage', 'prism:endingPage': // TODO: Similar code is used in 'include.in.php', 'modsxml.inc.php' and 'openurl.inc.php', // so this should be made into a dedicated function! 
if (!empty($row['pages']) and preg_match("/\\d+/i", $row['pages'])) { $pages = preg_replace("/^\\D*(\\d+)( *[{$dash}]+ *\\d+)?.*/i{$patternModifiers}", "\\1\\2", $row['pages']); // extract page range (if there's any), otherwise just the first number $startPage = preg_replace("/^\\D*(\\d+).*/i", "\\1", $row['pages']); // extract starting page $endPage = extractDetailsFromField("pages", $pages, "/\\D+/", "[-1]"); // extract ending page (function 'extractDetailsFromField()' is defined in 'include.inc.php') // NOTE: To extract the ending page, we'll use function 'extractDetailsFromField()' // instead of just grabbing a matched regex pattern since it'll also work // when just a number but no range is given (e.g. when startPage = endPage) // - 'prism:startingPage': if (preg_match("/\\d+ *[{$dash}]+ *\\d+/i{$patternModifiers}", $row['pages'])) { // if there's a page range addMetaElement($record, "prism", "startingPage", array(), $startPage); } // - 'prism:endingPage': addMetaElement($record, "prism", "endingPage", array(), $endPage); } } return $record; }