/**
 * @brief Parse a citation string
 *
 * Three formats are tried in order; later formats are only attempted when
 * the earlier ones did not match:
 *
 *  1. A plain article citation containing a four digit year, a title ending
 *     in a full stop, a journal name, a volume, an optional "(issue)" and a
 *     page range.
 *  2. A pipe-delimited template of "key = value" rows (Wikipedia cite
 *     template). Everything before the first pipe is ignored.
 *  3. The older Wikipedia style that starts with "{{aut|". For this format
 *     the return value is only true when volume and pagination were found,
 *     although journal, title, year and authors may still be populated.
 *
 * @param citation Citation string to be parsed
 * @param reference Reference object to be populated
 *
 * @return True if citation successfully parsed, false otherwise
 */
function parse_citation($citation, &$reference)
{
    $matched = false;

    $citation = str_replace("\n", '', trim($citation));

    // 1. Plain article citation
    if (preg_match('/ (?<year>[0-9]{4}) ([a-z])? .? ((?<title>[^\\.]+|(?R))*) (\\.) \\s* (?<journal>[^\\d]+|(?R)) \\s* (?<volume>[0-9]+) \\s* (\\((?<issue>[0-9]+)\\))? \\s* [:|,] \\s* (?<spage>[0-9]+) [-|–] (?<epage>[0-9]+) /xu', $citation, $matches)) {
        $matched = true;

        foreach ($matches as $k => $v) {
            $matches[$k] = trim($v);
        }

        $reference->genre = 'article';
        $reference->title = $matches['title'];

        // The journal capture may end with the comma that separates it from the volume
        $reference->secondary_title = preg_replace('/,$/', '', $matches['journal']);

        $reference->volume = $matches['volume'];

        // An optional group that did not participate in the match is still
        // present as '' (later groups matched), so test for content, not isset()
        if (isset($matches['issue']) && $matches['issue'] !== '') {
            $reference->issue = $matches['issue'];
        }

        $reference->spage = $matches['spage'];
        $reference->epage = $matches['epage'];
        $reference->year = $matches['year'];
    }

    // 2. Wikipedia cite
    if (!$matched) {
        $rows = preg_split("/\\|/", $citation);

        // Skip whatever precedes the first pipe
        foreach (array_slice($rows, 1) as $row) {
            if (!preg_match('/(?<key>([A-Za-z0-9_]+))\\s*=\\s*(?<value>(.*))/', $row, $matches)) {
                continue;
            }

            // Any "key = value" row counts as a successful parse
            $matched = true;

            $value = trim($matches['value']);
            // Drop the template terminator, then trim again so that
            // "1999 }}" becomes "1999" rather than "1999 "
            $value = trim(preg_replace('/}}$/', '', $value));
            // Unwrap a [[wiki link]]
            $value = preg_replace('/^\\[\\[/', '', $value);
            $value = preg_replace('/\\]\\]$/', '', $value);

            if ($value === '') {
                continue;
            }

            $key = trim($matches['key']);

            switch ($key) {
                case 'journal':
                    $reference->secondary_title = $value;
                    $reference->genre = 'article';
                    break;

                case 'pages':
                    if (preg_match('/(?<spage>[0-9]+)[\\-|–](?<epage>[0-9]+)/u', $value, $match)) {
                        $reference->spage = $match['spage'];
                        $reference->epage = $match['epage'];
                    }
                    break;

                default:
                    // Every other key is copied onto the reference under its own name
                    $reference->{$key} = $value;
                    break;
            }
        }
    }

    // 3. Older Wikipedia-style
    if (!$matched) {
        if (preg_match('/^{{aut\\|/', $citation)) {
            // Collect the ''...'' delimited spans; the last one is used as the journal
            if (preg_match_all("/\n\t\t\t\t([^']''([^'']+|(?R))*'')\n\t\t\t\t/x", $citation, $matches)) {
                $reference->secondary_title = $matches[2][count($matches[2]) - 1];

                // get volume and pagination
                if (preg_match("/\n\t\t\t\t\t'''(?<volume>[0-9]+)'''\n\t\t\t\t\t\\s*\n\t\t\t\t\t(\\((?<issue>[0-9]+)\\))?\n\t\t\t\t\t\\s*\n\t\t\t\t\t:\n\t\t\t\t\t\\s*\n\t\t\t\t\t(?<spage>[0-9]+)\n\t\t\t\t\t[-|–]\n\t\t\t\t\t(?<epage>[0-9]+)\n\t\t\t\t\t/xu", $citation, $matches)) {
                    $reference->volume = $matches['volume'];

                    // Same caveat as above: an absent issue is '' rather than unset
                    if (isset($matches['issue']) && $matches['issue'] !== '') {
                        $reference->issue = $matches['issue'];
                    }

                    $reference->spage = $matches['spage'];
                    $reference->epage = $matches['epage'];
                    $reference->genre = 'article';

                    $matched = true;
                }

                // get title and year
                // (the pattern used to append an undefined $obj->journal, which
                // only ever contributed an empty string plus a warning)
                if (preg_match("/\\(?(?<year>[0-9]{4})\\)?:?\\s*(?<title>.*)''/", $citation, $matches)) {
                    $reference->year = $matches['year'];
                    $reference->title = trim($matches['title']);

                    // clean up title: the greedy capture also swallows the
                    // quoted journal name and, possibly, the volume
                    $reference->title = str_replace("'", "", $reference->title);
                    $reference->title = str_replace($reference->secondary_title, "", $reference->title);

                    // Only strip a trailing volume when one is actually known
                    if (isset($reference->volume)) {
                        $reference->title = preg_replace('/' . preg_quote((string) $reference->volume, '/') . '$/', '', trim($reference->title));
                    }

                    $reference->title = preg_replace('/\\.$/', '', trim($reference->title));
                }

                // journal may be a link to a wiki page, which may have a different name...
                if (preg_match('/\\[\\[((?<link>[A-Za-zÂÊÁÈÉËÍÎÌÏÓÔÒÛÚÙÜâêáèéëíîìïóôòûúùüÖÜÅÔØ0-9\\(\\)\\.,_\\- ]+)\\|(?<name>[A-Za-zÂÊÁÈÉËÍÎÌÏÓÔÒÛÚÙÜâêáèéëíîìïóôòûúùüÖÜÅÔØ0-9\\(\\)\\.,_\\- ]+))([^\\]\\]]+|(?R))*\\]\\]/', $reference->secondary_title, $matches)) {
                    $reference->secondary_title = $matches['name'];
                }

                // ...or the same name
                $reference->secondary_title = preg_replace('/^\\[\\[(.*)\\]\\]$/', "\$1", $reference->secondary_title);

                // Authors live inside the {{aut|...}} template, separated by ";", "&" or "|"
                if (preg_match_all("/\n\t\t\t\t\t(\\{\\{aut\\|([^\\}\\}]+|(?R))*\\}\\})\n\t\t\t\t\t/x", $citation, $a)) {
                    $author_string = $a[2][0];
                    $author_string = str_replace(";", "|", $author_string);
                    $author_string = str_replace("&", "|", $author_string);

                    $authors = explode("|", $author_string);

                    $reference->authors = array();
                    foreach ($authors as $author) {
                        reference_add_author_from_string($reference, $author);
                    }
                }
            }
        }
    }

    return $matched;
}
/**
 * @brief Clean a citation string and parse it into a reference object
 *
 * The citation is first normalised: a "(Ser. N)" or "Series N," marker is
 * removed (and remembered), a leading "ZOOTAXA" is dropped, and a trailing
 * "<digits><separator><digits>." page range is rewritten as
 * "<digits>-<digits>". It is then matched against an article pattern and,
 * failing that, a book pattern.
 *
 * When neither pattern matches, "FAILED TO PARSE" is echoed and the returned
 * object carries only the cleaned citation.
 *
 * @param citation Citation string to be parsed
 *
 * @return Object with member "citation" (the cleaned string) plus, when the
 * citation was parsed, genre, title, year and whichever of secondary_title,
 * volume, issue, spage, epage, publisher, publoc, pages and series were
 * found. Authors are added through reference_add_author_from_string().
 */
function process_citation($citation)
{
    $debug = false;

    $matched = false;
    $matches = array();

    $series = '';

    // Clean
    if (preg_match('/\\(Ser\\. (?<series>\\d+)\\)/', $citation, $matches)) {
        $series = $matches['series'];
        $citation = preg_replace('/\\(Ser\\. \\d+\\)/', '', $citation);
    }

    // Detection and removal must use the same literal: the previous pattern
    // began with "\S", i.e. "any non-space character followed by eries",
    // which also fired on words such as "Fisheries"
    if (preg_match('/Series (?<series>\\d+),/', $citation, $matches)) {
        $series = $matches['series'];
        $citation = preg_replace('/Series \\d+,/', '', $citation);
    }

    $citation = preg_replace('/^ZOOTAXA/', "", $citation);

    // Fix pagination character from ZootaxaPDF (this also drops the final full stop)
    $citation = preg_replace('/(\\d+)([|-|—|–])(\\d+)\\.$/u', "\$1-\$3", $citation);

    // Article: authors (year) title. journal, volume(issue), spage-epage
    if (!$matched) {
        if (preg_match('/ (?<authorstring>.*) \\s+ \\((?<year>[0-9]{4})[a-z]?(\\s+"[0-9]{4}")?\\)[\\.|:]? \\s+ (?<title>(([^\\.]+|(?R))(\\.))+) (?<secondary_title>.*), \\s+ (?<volume>(\\d+|([L|X|I|V]+))) (\\s*\\((?<issue>\\d+([-|\\/]\\d+)?)\\))? , \\s+ (?<spage>[e]?\\d+) ( [-||-|—|–| ] (?<epage>\\d+) )? /xu', $citation, $matches)) {
            if ($debug) {
                echo __LINE__ . "\n";
                print_r($matches);
            }
            $matches['genre'] = 'article';
            $matched = true;
        }
    }

    /*
    // Rec Aust Mus
    if (!$matched) {
        if (preg_match('/ (?<authorstring>.*) \s+ (?<year>[0-9]{4})[a-z]? \. \s+ (?<title>(([^\.]+|(?R))(\.))+) (?<secondary_title>.*) \s+ (?<volume>(\d+|([L|X|I|V]+))) (\s*\((?<issue>\d+([-|\/]\d+)?)\))? : \s+ (?<spage>[e]?\d+) ( [-||-|—|–| ] (?<epage>\d+) )? /xu', $citation, $matches)) {
            if ($debug) {
                echo __LINE__ . "\n";
                print_r($matches);
            }
            $matches['genre'] = 'article';
            $matched = true;
        }
    }
    */

    if (!$matched) {
        // Peters, J.A. 1964. Dictionary of Herpetology: A Brief and Meaningful Definition of Words and Terms Used in Herpetology. Hafner Publishing Company, New York. 393 pp.
        if (preg_match('/ (?<authorstring>.*) \\s+ [\\(]?(?<year>[0-9]{4})[a-z]?(\\s+"[0-9]{4}")?[\\)]? \\.? \\s+ (?<title>(([^\\.]+|(?R))(\\.))+) \\s+ (?<publisher>.*), (?<publoc>.*) [\\.|,] \\s+ (?<pages>\\d+) \\s+ pp. /xu', $citation, $matches)) {
            if ($debug) {
                echo __LINE__ . "\n";
                print_r($matches);
            }
            $matches['genre'] = 'book';
            $matched = true;
        }
    }

    // Post process
    $ref = new stdclass();
    $ref->citation = $citation;

    if (!$matched) {
        echo "FAILED TO PARSE\n----------------------------------\n";
        return $ref;
    }

    if ($series != '') {
        $ref->series = $series;
    }

    foreach ($matches as $k => $v) {
        // Only named groups are fields. Skipping the numbered ones explicitly
        // matters on PHP < 8, where the integer key 0 loosely equals every
        // non-numeric case label and would otherwise land in the first case.
        if (!is_string($k)) {
            continue;
        }

        switch ($k) {
            case 'genre':
            case 'secondary_title':
            case 'issue':
            case 'spage':
            case 'epage':
            case 'year':
            case 'publisher':
            case 'publoc':
            case 'pages':
                // Optional groups that did not match come back as ''
                if ($v != '') {
                    $ref->{$k} = trim($v);
                }
                break;

            case 'title':
                // The title capture includes its terminating full stop
                $v = preg_replace('/\\.$/', '', trim($v));
                $ref->{$k} = $v;
                break;

            case 'volume':
                // Clean up volume (if Roman convert to Arabic)
                if (!is_numeric($v)) {
                    if (preg_match('/^[MDCLXVI]+$/', $v)) {
                        $v = arabic($v);
                    }
                }
                $ref->{$k} = $v;
                break;

            case 'authorstring':
                // Normalise the author list to a "|" separated string
                $v = preg_replace('/&/', '|', $v);
                $v = preg_replace('/(Jr\\.[,]?\\s+)/', "", $v);
                // A comma after an initial ("J.,") separates two authors
                $v = preg_replace('/([A-Z]\\.),/', "\$1|", $v);
                // Collapse empty entries
                $v = preg_replace('/\\|\\s*\\|/', '|', $v);
                $v = preg_replace('/\\|$/', "", $v);

                $authors = explode("|", $v);
                foreach ($authors as $a) {
                    reference_add_author_from_string($ref, $a);
                }
                break;

            default:
                break;
        }
    }

    return $ref;
}