/**
 * Wraps a sample string in each punctuation/whitespace character in turn
 * and checks that String::trimPunctuation() strips the wrapping character
 * from both ends while leaving the dot inside the sample untouched.
 *
 * @covers String::trimPunctuation
 */
public function testTrimPunctuation() {
	$expected = 'trim.med';
	$wrappingCharacters = array(' ', ',', '.', ';', ':', '!', '?', '(', ')', '[', ']', '\\', '/');

	foreach ($wrappingCharacters as $character) {
		$wrapped = $character . $expected . $character;
		self::assertEquals($expected, String::trimPunctuation($wrapped));
	}
}
/**
 * Looks up a book by ISBN through the ISBNdb web service and converts
 * the response into an NLM citation description.
 *
 * @see Filter::process()
 * @param $isbn string
 * @return MetadataDescription a looked up citation description
 *  or null if the web service call or the result transformation fails
 */
function &process($isbn) {
	$failure = null;

	// Parameters of the web service request
	$requestParameters = array(
		'access_key' => $this->getApiKey(),
		'index1' => 'isbn',
		'results' => 'details,authors',
		'value1' => $isbn
	);

	// Query the web service; a null result means the call failed
	$responseDom =& $this->callWebService(ISBNDB_WEBSERVICE_URL, $requestParameters);
	if (is_null($responseDom)) {
		return $failure;
	}

	// Transform the response with the XSL stylesheet that sits next to this file
	$stylesheet = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'isbndb.xsl';
	$metadata =& $this->transformWebServiceResults($responseDom, $stylesheet);
	if (is_null($metadata)) {
		return $failure;
	}

	// The service delivers place and publisher in one combined entry:
	// the part before the colon is taken as the place, the part after
	// the colon (up to a comma) as the publisher name.
	$location = String::regexp_replace('/^(.+):.*/', '\\1', $metadata['place-publisher']);
	$metadata['publisher-loc'] = String::trimPunctuation($location);
	$publisher = String::regexp_replace('/.*:([^,]+),?.*/', '\\1', $metadata['place-publisher']);
	$metadata['publisher-name'] = String::trimPunctuation($publisher);
	unset($metadata['place-publisher']);

	// Reduce the publication date to the four-digit year that follows
	// any leading non-numeric text
	$metadata['date'] = String::regexp_replace('/^[^\\d{4}]+(\\d{4}).*/', '\\1', $metadata['date']);

	// Keep only digits and 'X' from the ISBN that was passed in
	$metadata['isbn'] = String::regexp_replace('/[^\\dX]*/', '', $isbn);

	// Everything looked up here is a book
	$metadata['[@publication-type]'] = NLM_PUBLICATION_TYPE_BOOK;

	return $this->getNlmCitationDescriptionFromMetadataArray($metadata);
}
/**
 * Converts a string with a single person
 * to an NLM name description.
 *
 * An optional leading title and optional trailing degrees are removed
 * from the string first and stored together as the 'suffix' statement.
 * The remainder is then tried against a list of name patterns; the
 * first pattern that matches determines given names, initials,
 * surname, prefix and suffix.
 *
 * TODO: add initials from all given names to initials
 * element
 *
 * @param $personString string
 * @param $title boolean true to parse for title
 * @param $degrees boolean true to parse for degrees
 * @return MetadataDescription an NLM name description; it carries
 *  no name statements when none of the patterns matched
 */
function &_parsePersonString($personString, $title, $degrees) {
	// Expressions to parse person strings, ported from CiteULike person
	// plugin, see http://svn.citeulike.org/svn/plugins/person.tcl
	static $personRegex = array(
		'title' => '(?:His (?:Excellency|Honou?r)\\s+|Her (?:Excellency|Honou?r)\\s+|The Right Honou?rable\\s+|The Honou?rable\\s+|Right Honou?rable\\s+|The Rt\\.? Hon\\.?\\s+|The Hon\\.?\\s+|Rt\\.? Hon\\.?\\s+|Mr\\.?\\s+|Ms\\.?\\s+|M\\/s\\.?\\s+|Mrs\\.?\\s+|Miss\\.?\\s+|Dr\\.?\\s+|Sir\\s+|Dame\\s+|Prof\\.?\\s+|Professor\\s+|Doctor\\s+|Mister\\s+|Mme\\.?\\s+|Mast(?:\\.|er)?\\s+|Lord\\s+|Lady\\s+|Madam(?:e)?\\s+|Priv\\.-Doz\\.\\s+)+',
		'degrees' => '(,\\s+(?:[A-Z\\.]+))+',
		'initials' => '(?:(?:[A-Z]\\.){1,3}[A-Z]\\.?)|(?:(?:[A-Z]\\.\\s){1,3}[A-Z]\\.?)|(?:[A-Z]{1,4})|(?:(?:[A-Z]\\.-?){1,4})|(?:(?:[A-Z]\\.-?){1,3}[A-Z]\\.?)|(?:(?:[A-Z]-){1,3}[A-Z])|(?:(?:[A-Z]\\s){1,3}[A-Z]\\.?)|(?:(?:[A-Z]-){1,3}[A-Z]\\.?)',
		'prefix' => 'Dell(?:[a|e])?(?:\\s|$)|Dalle(?:\\s|$)|D[a|e]ll\'(?:\\s|$)|Dela(?:\\s|$)|Del(?:\\s|$)|[Dd]e(?:\\s|$)(?:La(?:\\s|$)|Los(?:\\s|$))?|[Dd]e(?:\\s|$)|[Dd][a|i|u](?:\\s|$)|L[a|e|o](?:\\s|$)|[D|L|O]\'|St\\.?(?:\\s|$)|San(?:\\s|$)|[Dd]en(?:\\s|$)|[Vv]on(?:\\s|$)(?:[Dd]er(?:\\s|$))?|(?:[Ll][ea](?:\\s|$))?[Vv]an(?:\\s|$)(?:[Dd]e(?:n|r)?(?:\\s|$))?',
		'givenName' => '(?:[^ \\t\\n\\r\\f\\v,.;()]{2,}|[^ \\t\\n\\r\\f\\v,.;()]{2,}\\-[^ \\t\\n\\r\\f\\v,.;()]{2,})'
	);

	// Suffix and surname share the expression used for given names
	$personRegex['suffix'] = $personRegex['givenName'];
	$personRegex['surname'] = $personRegex['suffix'];
	$personRegex['double-surname'] = "(?:" . $personRegex['surname'] . "\\s)*" . $personRegex['surname'];

	// Building blocks: optional prefix followed by a single or a multi-part surname
	$optionalPrefix = "(?P<prefix>(?:" . $personRegex['prefix'] . ")?)";
	$prefixedSurname = $optionalPrefix . "(?P<surname>" . $personRegex['surname'] . ")";
	$prefixedDoubleSurname = $optionalPrefix . "(?P<surname>" . $personRegex['double-surname'] . ")";

	// The description that will receive the parsed name parts
	$personDescription = new MetadataDescription('lib.pkp.plugins.metadata.nlm30.schema.Nlm30NameSchema', $this->_assocType);

	$personString = trim($personString);

	// 1. Strip title and degrees from the person string; both end up in the suffix
	$honorifics = '';
	$found = array();

	$titlePattern = '/^(' . $personRegex['title'] . ')/i';
	if ($title && String::regexp_match_get($titlePattern, $personString, $found)) {
		$honorifics = trim($found[1], ',:; ');
		$personString = String::regexp_replace($titlePattern, '', $personString);
	}

	$degreesPattern = '/(' . $personRegex['degrees'] . ')$/i';
	if ($degrees && String::regexp_match_get($degreesPattern, $personString, $found)) {
		$cleanDegrees = array();
		foreach (explode(',', trim($found[1], ',')) as $position => $rawDegree) {
			$cleanDegrees[$position] = String::trimPunctuation($rawDegree);
		}
		// Degrees are appended after a ' - ' separator even when no title was found
		$honorifics .= ' - ' . implode('; ', $cleanDegrees);
		$personString = String::regexp_replace($degreesPattern, '', $personString);
	}

	if (!empty($honorifics)) {
		$personDescription->addStatement('suffix', $honorifics);
	}

	// Insert a space between an initial and a directly following capitalised name
	$personString = String::regexp_replace('/([A-Z])\\.([A-Z][a-z])/', '\\1. \\2', $personString);

	// 2. Extract names and initials from the person string.
	// The patterns are ordered from most to least specific and the first
	// match wins. Optional elements such as initials or middle names get
	// their own pattern instead of a ?-quantifier so that they cannot be
	// misinterpreted.
	$initials = $personRegex['initials'];
	$givenName = $personRegex['givenName'];
	$candidatePatterns = array(
		'/^' . $prefixedSurname . '$/i',
		'/^(?P<initials>' . $initials . ')\\s' . $prefixedSurname . '$/',
		'/^' . $prefixedSurname . ',?\\s(?P<initials>' . $initials . ')$/',
		'/^' . $prefixedDoubleSurname . ',\\s(?P<givenName>' . $givenName . ')\\s(?P<initials>' . $initials . ')$/',
		'/^(?P<givenName>' . $givenName . ')\\s(?P<initials>' . $initials . ')\\s' . $prefixedSurname . '$/',
		'/^' . $prefixedDoubleSurname . ',\\s(?P<givenName>(?:' . $givenName . '\\s)+)(?P<initials>' . $initials . ')$/',
		'/^(?P<givenName>(?:' . $givenName . '\\s)+)(?P<initials>' . $initials . ')\\s' . $prefixedSurname . '$/',
		'/^' . $prefixedDoubleSurname . ',(?P<givenName>(?:\\s' . $givenName . ')+)$/',
		'/^(?P<givenName>(?:' . $givenName . '\\s)+)' . $prefixedSurname . '$/',
		'/^\\s*(?P<surname>' . $personRegex['surname'] . ')(?P<suffix>(?:\\s+' . $personRegex['suffix'] . ')?)\\s*,\\s*(?P<initials>(?:' . $initials . ')?)\\s*\\((?P<givenName>(?:\\s*' . $givenName . ')+)\\s*\\)\\s*(?P<prefix>(?:' . $personRegex['prefix'] . ')?)$/',
		'/^(?P<givenName>' . $givenName . ')\\.(?P<surname>' . $personRegex['double-surname'] . ')$/',
		'/^(?P<surname>.*)$/'
	);

	$parts = array();
	foreach ($candidatePatterns as $candidatePattern) {
		if (!String::regexp_match_get($candidatePattern, $personString, $parts)) {
			// Not this pattern - try the next, less specific one
			continue;
		}

		// Given names: one statement per space-separated name
		if (!empty($parts['givenName'])) {
			foreach (explode(' ', trim($parts['givenName'])) as $singleGivenName) {
				$personDescription->addStatement('given-names', $singleGivenName);
			}
		}

		// Initials are stored as given names too, one statement per letter,
		// after removing dots, hyphens and spaces
		if (!empty($parts['initials'])) {
			$letters = str_replace(array('.', '-', ' '), '', $parts['initials']);
			$letterCount = String::strlen($letters);
			for ($index = 0; $index < $letterCount; $index++) {
				$personDescription->addStatement('given-names', $letters[$index]);
			}
		}

		// Surname: an all-upper-case surname is converted to capitalised words
		if (!empty($parts['surname'])) {
			$surname = $parts['surname'];
			if (strtoupper($surname) == $surname) {
				$surname = ucwords(strtolower($surname));
			}
			$personDescription->addStatement('surname', $surname);
		}

		// Prefix and suffix are stored trimmed
		foreach (array('prefix', 'suffix') as $propertyName) {
			if (!empty($parts[$propertyName])) {
				$personDescription->addStatement($propertyName, trim($parts[$propertyName]));
			}
		}

		// Only the first matching pattern is used
		break;
	}

	return $personDescription;
}
/**
 * Parses a raw citation string with the external ParaCite perl
 * script, maps the parser result to an OpenURL description and
 * crosswalks that description to NLM.
 *
 * @see Filter::process()
 * @param $citationString string
 * @return MetadataDescription the NLM description or null if perl is
 *  not configured/available, the parser returns nothing or the parser
 *  output cannot be loaded as XML
 */
function &process($citationString) {
	$nullVar = null;

	// Check the availability of perl
	$perlCommand = Config::getVar('cli', 'perl');
	if (empty($perlCommand) || !file_exists($perlCommand)) {
		return $nullVar;
	}

	// Convert to ASCII - Paracite doesn't handle UTF-8 well
	$citationString = String::utf8_to_ascii($citationString);

	// Call the paracite parser. Every argument is shell-escaped,
	// including the citation module name, so that no argument can
	// be interpreted by the shell.
	$wrapperScript = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'paracite.pl';
	$paraciteCommand = $perlCommand . ' ' . escapeshellarg($wrapperScript)
		. ' ' . escapeshellarg($this->getCitationModule())
		. ' ' . escapeshellarg($citationString);
	$xmlResult = shell_exec($paraciteCommand);
	if (empty($xmlResult)) {
		return $nullVar;
	}

	if (Config::getVar('i18n', 'charset_normalization') == 'On' && !String::utf8_compliant($xmlResult)) {
		$xmlResult = String::utf8_normalize($xmlResult);
	}

	// Create a temporary DOM document
	$resultDOM = new DOMDocument();
	$resultDOM->recover = true;
	if (!$resultDOM->loadXML($xmlResult) || is_null($resultDOM->documentElement)) {
		// The parser output could not be loaded even in recovery mode
		return $nullVar;
	}

	// Extract the parser results as an array
	$xmlHelper = new XMLHelper();
	$metadata = $xmlHelper->xmlToArray($resultDOM->documentElement);

	// We have to merge subtitle and title as neither OpenURL
	// nor NLM can handle subtitles.
	if (isset($metadata['subtitle'])) {
		if (isset($metadata['title'])) {
			$metadata['title'] .= '. ' . $metadata['subtitle'];
		} else {
			// Same value as appending to a missing title, without the
			// undefined index notice.
			$metadata['title'] = '. ' . $metadata['subtitle'];
		}
		unset($metadata['subtitle']);
	}

	// Break up the authors field
	if (isset($metadata['authors'])) {
		$metadata['authors'] = String::trimPunctuation($metadata['authors']);
		$metadata['authors'] = String::iterativeExplode(array(':', ';'), $metadata['authors']);
	}

	// Convert pages to integers
	foreach (array('spage', 'epage') as $pageProperty) {
		if (isset($metadata[$pageProperty])) {
			$metadata[$pageProperty] = (int) $metadata[$pageProperty];
		}
	}

	// Convert titles to title case
	foreach (array('title', 'chapter', 'publication') as $titleProperty) {
		if (isset($metadata[$titleProperty])) {
			$metadata[$titleProperty] = String::titleCase($metadata[$titleProperty]);
		}
	}

	// Map ParaCite results to OpenURL - null means
	// throw the value away.
	$metadataMapping = array(
		'genre' => 'genre',
		'_class' => null,
		'any' => null,
		'authors' => 'au',
		'aufirst' => 'aufirst',
		'aufull' => null,
		'auinit' => 'auinit',
		'aulast' => 'aulast',
		'atitle' => 'atitle',
		'cappublication' => null,
		'captitle' => null,
		'date' => 'date',
		'epage' => 'epage',
		'featureID' => null,
		'id' => null,
		'issue' => 'issue',
		'jnl_epos' => null,
		'jnl_spos' => null,
		'match' => null,
		'marked' => null,
		'num_of_fig' => null,
		'pages' => 'pages',
		'publisher' => 'pub',
		'publoc' => 'place',
		'ref' => null,
		'rest_text' => null,
		'spage' => 'spage',
		'targetURL' => 'url',
		'text' => null,
		'ucpublication' => null,
		'uctitle' => null,
		'volume' => 'volume',
		'year' => 'date'
	);

	// Ignore 'year' if 'date' is set
	if (isset($metadata['date'])) {
		$metadataMapping['year'] = null;
	}

	// Set default genre
	if (empty($metadata['genre'])) {
		$metadata['genre'] = OPENURL_GENRE_ARTICLE;
	}

	// Handle title, chapter and publication depending on
	// the (inferred) genre. Also instantiate the target schema.
	switch ($metadata['genre']) {
		case OPENURL_GENRE_BOOK:
		case OPENURL_GENRE_BOOKITEM:
		case OPENURL_GENRE_REPORT:
		case OPENURL_GENRE_DOCUMENT:
			$metadataMapping += array('publication' => 'btitle', 'chapter' => 'atitle');
			if (isset($metadata['title'])) {
				// The title fills the first free slot: book title, then chapter
				if (!isset($metadata['publication'])) {
					$metadata['publication'] = $metadata['title'];
				} elseif (!isset($metadata['chapter'])) {
					$metadata['chapter'] = $metadata['title'];
				}
				unset($metadata['title']);
			}
			$openUrlSchemaName = 'lib.pkp.classes.metadata.openurl.OpenUrlBookSchema';
			$openUrlSchemaClass = 'OpenUrlBookSchema';
			break;

		case OPENURL_GENRE_ARTICLE:
		case OPENURL_GENRE_JOURNAL:
		case OPENURL_GENRE_ISSUE:
		case OPENURL_GENRE_CONFERENCE:
		case OPENURL_GENRE_PROCEEDING:
		case OPENURL_GENRE_PREPRINT:
		default:
			$metadataMapping += array('publication' => 'jtitle');
			if (isset($metadata['title'])) {
				// The title fills the first free slot: journal title, then article title
				if (!isset($metadata['publication'])) {
					$metadata['publication'] = $metadata['title'];
				} elseif (!isset($metadata['atitle'])) {
					$metadata['atitle'] = $metadata['title'];
				}
				unset($metadata['title']);
			}
			$openUrlSchemaName = 'lib.pkp.classes.metadata.openurl.OpenUrlJournalSchema';
			$openUrlSchemaClass = 'OpenUrlJournalSchema';
			break;
	}

	// Instantiate an OpenURL description
	$openUrlDescription = new MetadataDescription($openUrlSchemaName, ASSOC_TYPE_CITATION);
	$openUrlSchema = new $openUrlSchemaClass();

	// Map the ParaCite result to OpenURL
	foreach ($metadata as $paraciteElementName => $paraciteValue) {
		if (!empty($paraciteValue)) {
			// Trim punctuation
			if (is_string($paraciteValue)) {
				$paraciteValue = String::trimPunctuation($paraciteValue);
			}

			// Transfer the value to the OpenURL result array
			assert(array_key_exists($paraciteElementName, $metadataMapping));
			$openUrlPropertyName = $metadataMapping[$paraciteElementName];
			if (!is_null($openUrlPropertyName) && $openUrlSchema->hasProperty($openUrlPropertyName)) {
				if (is_array($paraciteValue)) {
					foreach ($paraciteValue as $singleValue) {
						$success = $openUrlDescription->addStatement($openUrlPropertyName, $singleValue);
						assert($success);
					}
				} else {
					$success = $openUrlDescription->addStatement($openUrlPropertyName, $paraciteValue);
					assert($success);
				}
			}
		}
	}

	// Crosswalk to NLM
	$crosswalkFilter = new OpenUrlNlmCitationSchemaCrosswalkFilter();
	$nlmDescription =& $crosswalkFilter->execute($openUrlDescription);
	assert(is_a($nlmDescription, 'MetadataDescription'));

	// Add 'rest_text' as NLM comment (if given)
	if (isset($metadata['rest_text'])) {
		$nlmDescription->addStatement('comment', String::trimPunctuation($metadata['rest_text']));
	}

	// Set display name and sequence id in the meta-data description
	// to the corresponding values from the filter. This is important
	// so that we later know which result came from which filter.
	$nlmDescription->setDisplayName($this->getDisplayName());
	$nlmDescription->setSeq($this->getSeq());

	return $nlmDescription;
}
/**
 * Fills the given citation object with
 * meta-data retrieved from PubMed.
 *
 * The article data comes from an eFetch call; a second call to eLink
 * is used to find a full text link that is not marked as requiring
 * a subscription, membership or fee.
 *
 * Elements that are missing from the PubMed response are skipped
 * (or yield a null value for title and source) instead of being
 * dereferenced.
 *
 * @param $pmid string
 * @param $citationDescription MetadataDescription
 * @return MetadataDescription the result of adding the retrieved
 *  meta-data to the given description or null if the eFetch call fails
 */
function &_lookup($pmid, &$citationDescription) {
	$nullVar = null;

	// Use eFetch to get XML metadata for the given PMID
	$lookupParams = array('db' => 'pubmed', 'mode' => 'xml', 'tool' => 'pkp-wal', 'id' => $pmid);
	if (!is_null($this->getEmail())) {
		$lookupParams['email'] = $this->getEmail();
	}

	// Call the eFetch URL and get an XML result
	if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) {
		return $nullVar;
	}

	// DOMNodeList::item() returns null for a missing element, so
	// check before reading the text content.
	$articleTitleNode = $resultDOM->getElementsByTagName("ArticleTitle")->item(0);
	$medlineTaNode = $resultDOM->getElementsByTagName("MedlineTA")->item(0);
	$metadata = array(
		'pub-id[@pub-id-type="pmid"]' => $pmid,
		'article-title' => (is_null($articleTitleNode) ? null : $articleTitleNode->textContent),
		'source' => (is_null($medlineTaNode) ? null : $medlineTaNode->textContent)
	);

	$volumeNode = $resultDOM->getElementsByTagName("Volume")->item(0);
	if (!is_null($volumeNode)) {
		$metadata['volume'] = $volumeNode->textContent;
	}
	$issueNode = $resultDOM->getElementsByTagName("Issue")->item(0);
	if (!is_null($issueNode)) {
		$metadata['issue'] = $issueNode->textContent;
	}

	// get list of author full names
	$nlmNameSchema = new NlmNameSchema();
	foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
		if (!isset($metadata['person-group[@person-group-type="author"]'])) {
			$metadata['person-group[@person-group-type="author"]'] = array();
		}

		// Instantiate an NLM name description
		$authorDescription = new MetadataDescription($nlmNameSchema, ASSOC_TYPE_AUTHOR);

		// Surname (an author entry without a LastName element gets no surname statement)
		$lastNameNode = $authorNode->getElementsByTagName("LastName")->item(0);
		if (!is_null($lastNameNode)) {
			$authorDescription->addStatement('surname', $lastNameNode->textContent);
		}

		// Given names: FirstName is preferred, ForeName is the fallback
		$givenNamesString = '';
		$firstNameNode = $authorNode->getElementsByTagName("FirstName")->item(0);
		if (!is_null($firstNameNode)) {
			$givenNamesString = $firstNameNode->textContent;
		} else {
			$foreNameNode = $authorNode->getElementsByTagName("ForeName")->item(0);
			if (!is_null($foreNameNode)) {
				$givenNamesString = $foreNameNode->textContent;
			}
		}
		if (!empty($givenNamesString)) {
			foreach (explode(' ', $givenNamesString) as $givenName) {
				$authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
			}
		}

		// Suffix
		$suffixNode = $authorNode->getElementsByTagName("Suffix")->item(0);
		if (!is_null($suffixNode)) {
			$authorDescription->addStatement('suffix', $suffixNode->textContent);
		}

		// Include collective names
		/*if ($resultDOM->getElementsByTagName("CollectiveName")->length > 0 && $authorNode->getElementsByTagName("CollectiveName")->item(0)->textContent != '') {
			// FIXME: This corresponds to an NLM-citation <collab> tag and should be part of the Metadata implementation
		}*/

		$metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
		unset($authorDescription);
	}

	// Extract pagination
	$pages = array();
	$medlinePgnNode = $resultDOM->getElementsByTagName("MedlinePgn")->item(0);
	if (!is_null($medlinePgnNode) && String::regexp_match_get("/^[:p\\.\\s]*(?P<fpage>[Ee]?\\d+)(-(?P<lpage>\\d+))?/", $medlinePgnNode->textContent, $pages)) {
		$fPage = (int) $pages['fpage'];
		$metadata['fpage'] = $fPage;
		if (!empty($pages['lpage'])) {
			$lPage = (int) $pages['lpage'];

			// Deal with shortcuts like '382-7': the last page replaces
			// the trailing digits of the first page
			if ($lPage < $fPage) {
				$lPage = (int) (String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])) . $pages['lpage']);
			}

			$metadata['lpage'] = $lPage;
		}
	}

	// Get publication date
	// TODO: The publication date could be in multiple places
	$articleDateNode = $resultDOM->getElementsByTagName("ArticleDate")->item(0);
	if (!is_null($articleDateNode)) {
		// Assemble Year-Month-Day, stopping at the first missing part
		// so that an incomplete date does not end in dangling hyphens.
		$dateParts = array();
		foreach (array('Year', 'Month', 'Day') as $dateElement) {
			$datePartNode = $articleDateNode->getElementsByTagName($dateElement)->item(0);
			if (is_null($datePartNode)) {
				break;
			}
			$dateParts[] = $datePartNode->textContent;
		}
		if (!empty($dateParts)) {
			$metadata['date'] = implode('-', $dateParts);
		}
	}

	// Get publication type
	foreach ($resultDOM->getElementsByTagName("PublicationType") as $publicationType) {
		// The vast majority of items on PubMed are articles so catch these...
		if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
			$metadata['[@publication-type]'] = NLM_PUBLICATION_TYPE_JOURNAL;
			break;
		}
	}

	// Get DOI if it exists
	foreach ($resultDOM->getElementsByTagName("ArticleId") as $idNode) {
		if ($idNode->getAttribute('IdType') == 'doi') {
			$metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
		}
	}

	// Use eLink utility to find fulltext links
	$lookupParams = array('dbfrom' => 'pubmed', 'cmd' => 'llinks', 'tool' => 'pkp-wal', 'id' => $pmid);
	if (!is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $lookupParams))) {
		// Get a list of possible links
		$links = array();
		foreach ($resultDOM->getElementsByTagName("ObjUrl") as $linkOut) {
			$attributes = '';
			foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) {
				$attributes .= String::strtolower($attribute->textContent) . ' / ';
			}

			// Only add links to open access resources
			if (String::strpos($attributes, "subscription") === false
					&& String::strpos($attributes, "membership") === false
					&& String::strpos($attributes, "fee") === false
					&& $attributes != "") {
				$urlNode = $linkOut->getElementsByTagName("Url")->item(0);
				if (!is_null($urlNode)) {
					$links[] = $urlNode->textContent;
				}
			}
		}

		// Take the first link if we have any left (presumably pubmed returns them in preferential order)
		if (isset($links[0])) {
			$metadata['uri'] = $links[0];
		}
	}

	return $this->addMetadataArrayToNlmCitationDescription($metadata, $citationDescription);
}
/**
 * Parses a raw citation string with regular expressions.
 *
 * Embedded URLs, PMIDs, DOIs and access dates are extracted and
 * removed first. The remaining text is then tried as a book citation,
 * a journal citation and finally a web citation ("... URL: ..."). Page
 * numbers are read from the unparsed tail of the matched citation.
 *
 * @see Filter::process()
 * @param $citationString string
 * @return MetadataDescription
 */
function &process($citationString) {
	$metadata = array();
	$found = array();

	// An embedded URL is assumed to link to the cited resource.
	$urlPattern = '(<?(https?://([-\\w\\.]+)+(:\\d+)?(/([\\w/_\\.,]*(\\?[^\\s>]+)?)?)?)>?)';
	if (String::regexp_match_get($urlPattern, $citationString, $found)) {
		$url = $found[1];
		$metadata['uri'] = $url;

		// Take the URL out of the citation string
		$citationString = String::regexp_replace($urlPattern, '', $citationString);

		// A PubMed link also gives us the PMID; the first matching
		// expression wins.
		$pubMedUrlPatterns = array(
			'/list_uids=(?P<pmId>\\d+)/i',
			'/pubmed.*details_term=(?P<pmId>\\d+)/i',
			'/pubmedid=(?P<pmId>\\d+)/i'
		);
		foreach ($pubMedUrlPatterns as $pubMedUrlPattern) {
			if (String::regexp_match_get($pubMedUrlPattern, $url, $pmIdMatches)) {
				$metadata['pub-id[@pub-id-type="pmid"]'] = $pmIdMatches['pmId'];
				break;
			}
		}
	}

	// Embedded PMID, DOI and access date (in this order): store the
	// first capture group and remove the whole match from the
	// citation string.
	$embeddedValuePatterns = array(
		'pub-id[@pub-id-type="pmid"]' => '/pmid:?\\s*(\\d+)/i',
		'pub-id[@pub-id-type="doi"]' => '/doi:?\\s*(\\S+)/i',
		'access-date' => '/accessed:?\\s*([\\s\\w]+)/i'
	);
	foreach ($embeddedValuePatterns as $propertyName => $embeddedValuePattern) {
		if (String::regexp_match_get($embeddedValuePattern, $citationString, $found)) {
			$metadata[$propertyName] = $found[1];
			$citationString = String::regexp_replace($embeddedValuePattern, '', $citationString);
		}
	}

	// Remove bracketed database markers
	$citationString = String::regexp_replace('/\\[(\\s*(pubmed|medline|full text)\\s*)*]/i', '', $citationString);

	$bookPattern = "/\\s*(?P<authors>[^\\.]+)\\.\\s*(?P<source>.*?)\\s*(?P<publisherLoc>[^\\.]*):\\s*(?P<publisherName>[^:]*?);\\s*(?P<date>\\d\\d\\d\\d.*?)(?P<tail>.*)/";
	$journalPattern = "/\\s*(?P<authors>[^\\.]+)\\.\\s*(?P<titleSource>.*)\\s*(?P<date>\\d\\d\\d\\d.*?);(?P<volumeAndIssue>[^:]+):(?P<tail>.*)/";
	$webPattern = "/\\s*(?P<citationSource>.*?)\\s*URL:\\s*(?P<tail>.*)/";

	$unparsedTail = '';
	if (String::regexp_match_get($bookPattern, $citationString, $found)) {
		// Book citation
		$metadata['[@publication-type]'] = NLM30_PUBLICATION_TYPE_BOOK;
		$metadata['author'] = $found['authors'];
		$metadata['source'] = $found['source'];
		$metadata['publisher-loc'] = $found['publisherLoc'];
		$metadata['publisher-name'] = $found['publisherName'];
		$metadata['date'] = $found['date'];
		$unparsedTail = $found['tail'];

	} elseif (String::regexp_match_get($journalPattern, $citationString, $found)) {
		// Journal citation
		$metadata['[@publication-type]'] = NLM30_PUBLICATION_TYPE_JOURNAL;
		$metadata['author'] = $found['authors'];

		// Split the combined title/source at the last sentence-ending character
		$titleAndSource = array();
		if (String::regexp_match_get("/(.*[\\.!\\?])(.*)/", trim($found['titleSource'], " ."), $titleAndSource)) {
			$metadata['article-title'] = $titleAndSource[1];
			$metadata['source'] = $titleAndSource[2];
		}

		$metadata['date'] = $found['date'];

		// Volume, optionally followed by the issue in parentheses
		$volumeIssueParts = array();
		if (String::regexp_match_get("/([^\\(]+)(\\(([^\\)]+)\\))?/", $found['volumeAndIssue'], $volumeIssueParts)) {
			$metadata['volume'] = $volumeIssueParts[1];
			if (isset($volumeIssueParts[3])) {
				$metadata['issue'] = $volumeIssueParts[3];
			}
		}

		$unparsedTail = $found['tail'];

	} elseif (String::regexp_match_get($webPattern, $citationString, $found)) {
		// Web citation with or without authors
		$unparsedTail = $found['tail'];
		$sentences = explode(".", trim($found['citationSource'], '. '));
		$sentenceCount = count($sentences);

		if ($sentenceCount == 0) {
			// This case should never occur...
			assert(false);
		} elseif ($sentenceCount == 1) {
			// Assume this to be a title for the web site.
			$metadata['article-title'] = $sentences[0];
		} elseif ($sentenceCount == 2) {
			// Assume the format: Authors. Title.
			$metadata['author'] = $sentences[0];
			$metadata['article-title'] = $sentences[1];
		} else {
			// Assume the format: Authors. Article Title. Journal Title.
			// First part: authors; last part: journal title; everything
			// in between belongs to the article title.
			$metadata['author'] = array_shift($sentences);
			$metadata['source'] = array_pop($sentences);
			$metadata['article-title'] = implode('.', $sentences);
		}
	}

	// TODO: Handle in-ref titles, eg. with editor lists

	// Extract page numbers if possible
	$pagesPattern = "/^[:p\\.\\s]*(?P<fpage>[Ee]?\\d+)(-(?P<lpage>\\d+))?/";
	if (!empty($unparsedTail) && String::regexp_match_get($pagesPattern, $unparsedTail, $found)) {
		$metadata['fpage'] = $found['fpage'];
		if (isset($found['lpage'])) {
			$metadata['lpage'] = $found['lpage'];
		}

		// Whatever follows the page numbers is kept as a comment
		// so it doesn't get lost.
		$remainder = String::regexp_replace($pagesPattern, '', $unparsedTail);
		$comment = String::trimPunctuation($remainder);
		if (!empty($comment)) {
			$metadata['comment'] = $comment;
		}
	}

	// Make the meta-data fully NLM citation compliant
	$metadata =& $this->postProcessMetadataArray($metadata);

	// Create the NLM citation description
	return $this->getNlm30CitationDescriptionFromMetadataArray($metadata);
}
/**
 * Trims punctuation from every string in a (possibly nested)
 * metadata array. Nested arrays are processed recursively, values
 * that are neither arrays nor strings are left untouched.
 *
 * @param $metadataArray array the array to clean, modified in place
 * @return array the cleaned array
 */
function &_recursivelyTrimPunctuation(&$metadataArray) {
	assert(is_array($metadataArray));

	foreach ($metadataArray as $entryName => $entry) {
		if (is_array($entry)) {
			// Descend into nested arrays
			$metadataArray[$entryName] = $this->_recursivelyTrimPunctuation($entry);
		} elseif (is_string($entry)) {
			// Only string scalars are trimmed
			$metadataArray[$entryName] = String::trimPunctuation($entry);
		}
		// Integers, composite values, etc. stay as they are.
	}

	return $metadataArray;
}
/**
 * Retrieves meta-data for a PubMed ID and returns it
 * as an NLM citation description.
 *
 * The article data comes from an eFetch call; a second call to eLink
 * is used to find a full text link that is not marked as requiring
 * a subscription, membership or fee.
 *
 * Volume and issue are only set when the corresponding element is
 * present in the response. Other missing elements are skipped (or
 * yield a null value for title and source) instead of being
 * dereferenced.
 *
 * @param $pmid string
 * @return MetadataDescription the citation description or null
 *  if the eFetch call fails
 */
function &_lookup($pmid) {
	$nullVar = null;

	// Use eFetch to get XML metadata for the given PMID
	$lookupParams = array('db' => 'pubmed', 'mode' => 'xml', 'tool' => 'pkp-wal', 'id' => $pmid);
	if (!is_null($this->getEmail())) {
		$lookupParams['email'] = $this->getEmail();
	}

	// Call the eFetch URL and get an XML result
	if (is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_EFETCH, $lookupParams))) {
		return $nullVar;
	}

	// DOM methods do not return by reference, so plain assignment is
	// used throughout. DOMNodeList::item() returns null for a missing
	// element, so check before reading the text content.
	$articleTitleFirstNode = $resultDOM->getElementsByTagName("ArticleTitle")->item(0);
	$medlineTaFirstNode = $resultDOM->getElementsByTagName("MedlineTA")->item(0);
	$metadata = array(
		'pub-id[@pub-id-type="pmid"]' => $pmid,
		'article-title' => (is_null($articleTitleFirstNode) ? null : $articleTitleFirstNode->textContent),
		'source' => (is_null($medlineTaFirstNode) ? null : $medlineTaFirstNode->textContent)
	);

	// Volume and issue are optional
	$volumeFirstNode = $resultDOM->getElementsByTagName("Volume")->item(0);
	if (!is_null($volumeFirstNode)) {
		$metadata['volume'] = $volumeFirstNode->textContent;
	}
	$issueFirstNode = $resultDOM->getElementsByTagName("Issue")->item(0);
	if (!is_null($issueFirstNode)) {
		$metadata['issue'] = $issueFirstNode->textContent;
	}

	// Get list of author full names
	foreach ($resultDOM->getElementsByTagName("Author") as $authorNode) {
		if (!isset($metadata['person-group[@person-group-type="author"]'])) {
			$metadata['person-group[@person-group-type="author"]'] = array();
		}

		// Instantiate an NLM name description
		$authorDescription = new MetadataDescription('lib.pkp.plugins.metadata.nlm30.schema.Nlm30NameSchema', ASSOC_TYPE_AUTHOR);

		// Surname (an author entry without a LastName element gets no surname statement)
		$lastNameFirstNode = $authorNode->getElementsByTagName("LastName")->item(0);
		if (!is_null($lastNameFirstNode)) {
			$authorDescription->addStatement('surname', $lastNameFirstNode->textContent);
		}

		// Given names: FirstName is preferred, ForeName is the fallback
		$givenNamesString = '';
		$firstNameFirstNode = $authorNode->getElementsByTagName("FirstName")->item(0);
		if (!is_null($firstNameFirstNode)) {
			$givenNamesString = $firstNameFirstNode->textContent;
		} else {
			$foreNameFirstNode = $authorNode->getElementsByTagName("ForeName")->item(0);
			if (!is_null($foreNameFirstNode)) {
				$givenNamesString = $foreNameFirstNode->textContent;
			}
		}
		if (!empty($givenNamesString)) {
			foreach (explode(' ', $givenNamesString) as $givenName) {
				$authorDescription->addStatement('given-names', String::trimPunctuation($givenName));
			}
		}

		// Suffix
		$suffixFirstNode = $authorNode->getElementsByTagName("Suffix")->item(0);
		if (!is_null($suffixFirstNode)) {
			$authorDescription->addStatement('suffix', $suffixFirstNode->textContent);
		}

		// Include collective names
		// FIXME: This corresponds to an NLM-citation <collab> tag and should be part of the Metadata implementation
		/*if ($resultDOM->getElementsByTagName("CollectiveName")->length > 0 && $authorNode->getElementsByTagName("CollectiveName")->item(0)->textContent != '') {
		}*/

		$metadata['person-group[@person-group-type="author"]'][] =& $authorDescription;
		unset($authorDescription);
	}

	// Extract pagination
	$pages = array();
	$medlinePgnFirstNode = $resultDOM->getElementsByTagName("MedlinePgn")->item(0);
	if (!is_null($medlinePgnFirstNode) && String::regexp_match_get("/^[:p\\.\\s]*(?P<fpage>[Ee]?\\d+)(-(?P<lpage>\\d+))?/", $medlinePgnFirstNode->textContent, $pages)) {
		$fPage = (int) $pages['fpage'];
		$metadata['fpage'] = $fPage;
		if (!empty($pages['lpage'])) {
			$lPage = (int) $pages['lpage'];

			// Deal with shortcuts like '382-7': the last page replaces
			// the trailing digits of the first page
			if ($lPage < $fPage) {
				$lPage = (int) (String::substr($pages['fpage'], 0, -String::strlen($pages['lpage'])) . $pages['lpage']);
			}

			$metadata['lpage'] = $lPage;
		}
	}

	// Get publication date (can be in several places in PubMed).
	$dateNode = null;
	$articleDateNodes = $resultDOM->getElementsByTagName("ArticleDate");
	if ($articleDateNodes->length > 0) {
		$dateNode = $articleDateNodes->item(0);
	} else {
		$pubDateNodes = $resultDOM->getElementsByTagName("PubDate");
		if ($pubDateNodes->length > 0) {
			$dateNode = $pubDateNodes->item(0);
		}
	}

	// Retrieve the data parts and assemble date.
	if (!is_null($dateNode)) {
		$publicationDate = '';
		$requiresNormalization = false;
		foreach (array('Year' => 4, 'Month' => 2, 'Day' => 2) as $dateElement => $padding) {
			$dateElementNodes = $dateNode->getElementsByTagName($dateElement);
			if ($dateElementNodes->length > 0) {
				// Invert date parts
				if (!empty($publicationDate)) {
					$publicationDate .= '-';
				}
				$dateElementFirstNode = $dateElementNodes->item(0);
				$datePart = str_pad($dateElementFirstNode->textContent, $padding, '0', STR_PAD_LEFT);
				// A non-numeric part (e.g. a month name) needs normalization below
				if (!is_numeric($datePart)) {
					$requiresNormalization = true;
				}
				$publicationDate .= $datePart;
			} else {
				// Stop at the first missing part
				break;
			}
		}

		// Normalize the date to NLM standard if necessary.
		if ($requiresNormalization) {
			$dateFilter = new DateStringNormalizerFilter();
			$publicationDate = $dateFilter->execute($publicationDate);
		}

		if (!empty($publicationDate)) {
			$metadata['date'] = $publicationDate;
		}
	}

	// Get publication type
	foreach ($resultDOM->getElementsByTagName("PublicationType") as $publicationType) {
		// The vast majority of items on PubMed are articles so catch these...
		if (String::strpos(String::strtolower($publicationType->textContent), 'article') !== false) {
			$metadata['[@publication-type]'] = NLM30_PUBLICATION_TYPE_JOURNAL;
			break;
		}
	}

	// Get DOI if it exists
	foreach ($resultDOM->getElementsByTagName("ArticleId") as $idNode) {
		if ($idNode->getAttribute('IdType') == 'doi') {
			$metadata['pub-id[@pub-id-type="doi"]'] = $idNode->textContent;
		}
	}

	// Use eLink utility to find fulltext links
	$lookupParams = array('dbfrom' => 'pubmed', 'cmd' => 'llinks', 'tool' => 'pkp-wal', 'id' => $pmid);
	if (!is_null($resultDOM = $this->callWebService(PUBMED_WEBSERVICE_ELINK, $lookupParams))) {
		// Get a list of possible links
		$links = array();
		foreach ($resultDOM->getElementsByTagName("ObjUrl") as $linkOut) {
			$attributes = '';
			foreach ($linkOut->getElementsByTagName("Attribute") as $attribute) {
				$attributes .= String::strtolower($attribute->textContent) . ' / ';
			}

			// Only add links to open access resources
			if (String::strpos($attributes, "subscription") === false
					&& String::strpos($attributes, "membership") === false
					&& String::strpos($attributes, "fee") === false
					&& $attributes != "") {
				$urlFirstNode = $linkOut->getElementsByTagName("Url")->item(0);
				if (!is_null($urlFirstNode)) {
					$links[] = $urlFirstNode->textContent;
				}
			}
		}

		// Take the first link if we have any left (presumably pubmed returns them in preferential order)
		if (isset($links[0])) {
			$metadata['uri'] = $links[0];
		}
	}

	return $this->getNlm30CitationDescriptionFromMetadataArray($metadata);
}