Example #1
0
/**
 * @brief Parse a citation string
 *
 * Three formats are tried in order, stopping at the first one that matches:
 *  1. Plain text: "<year>[a-z]. <title>. <journal> <volume>(<issue>): <spage>-<epage>"
 *  2. Pipe-delimited "key = value" rows (Wikipedia cite template)
 *  3. Older Wikipedia style starting with "{{aut|", with the journal in ''italics''
 *     and the volume in '''bold'''
 *
 * @param string $citation  Citation string to be parsed
 * @param object $reference Reference object to be populated (modified in place)
 *
 * @return bool True if citation successfully parsed, false otherwise
 */
function parse_citation($citation, &$reference)
{
    $matched = false;
    $citation = str_replace("\n", '', trim($citation));

    // 1. Plain-text article citation
    if (preg_match('/
        (?<year>[0-9]{4})
        ([a-z])?
        .?
        ((?<title>[^\\.]+|(?R))*)
        (\\.)
        \\s*
        (?<journal>[^\\d]+|(?R))
        \\s*
        (?<volume>[0-9]+)
        \\s*
        (\\((?<issue>[0-9]+)\\))?
        \\s*
        [:|,]
        \\s*
        (?<spage>[0-9]+)
        [-|–]
        (?<epage>[0-9]+)
        /xu', $citation, $parts)) {
        $matched = true;
        foreach ($parts as $name => $text) {
            $parts[$name] = trim($text);
        }
        $reference->genre = 'article';
        $reference->title = $parts['title'];
        // Drop a trailing comma left between journal name and volume
        $reference->secondary_title = preg_replace('/,$/', '', trim($parts['journal']));
        $reference->volume = $parts['volume'];
        // An optional group that did not participate is still reported as ''
        // (later groups matched), so isset() alone cannot detect a missing issue.
        if (isset($parts['issue']) && $parts['issue'] !== '') {
            $reference->issue = $parts['issue'];
        }
        $reference->spage = $parts['spage'];
        $reference->epage = $parts['epage'];
        $reference->year = $parts['year'];
    }

    // 2. Wikipedia cite: "{{cite ... |key=value |key=value}}"
    if (!$matched) {
        $rows = preg_split("/\\|/", $citation);
        foreach ($rows as $index => $row) {
            // The first segment is the template name, not a key/value pair
            if ($index === 0) {
                continue;
            }
            if (!preg_match('/(?<key>([A-Za-z0-9_]+))\\s*=\\s*(?<value>(.*))/', $row, $pair)) {
                continue;
            }
            $matched = true;
            // Strip the template terminator and any [[wiki link]] brackets
            $value = trim($pair['value']);
            $value = preg_replace('/}}$/', '', $value);
            $value = preg_replace('/^\\[\\[/', '', $value);
            $value = preg_replace('/\\]\\]$/', '', $value);
            if ($value == '') {
                continue;
            }
            $key = trim($pair['key']);
            switch ($key) {
                case 'journal':
                    $reference->secondary_title = $value;
                    $reference->genre = 'article';
                    break;
                case 'pages':
                    if (preg_match('/(?<spage>[0-9]+)[\\-|–](?<epage>[0-9]+)/u', $value, $range)) {
                        $reference->spage = $range['spage'];
                        $reference->epage = $range['epage'];
                    }
                    break;
                default:
                    // Any other key is copied verbatim onto the reference
                    $reference->{$key} = $value;
                    break;
            }
        }
    }

    // 3. Older Wikipedia-style
    if (!$matched && preg_match('/^{{aut\\|/', $citation)) {
        if (preg_match_all("/\n\t\t\t\t([^']''([^'']+|(?R))*'')\n\t\t\t\t/x", $citation, $italics)) {
            // The journal is taken from the last ''italic'' run in the citation
            $reference->secondary_title = $italics[2][count($italics[2]) - 1];

            // get volume and pagination; only this step marks the citation as parsed
            if (preg_match("/\n\t\t\t\t\t'''(?<volume>[0-9]+)'''\n\t\t\t\t\t\\s*\n\t\t\t\t\t(\\((?<issue>[0-9]+)\\))?\n\t\t\t\t\t\\s*\n\t\t\t\t\t:\n\t\t\t\t\t\\s*\n\t\t\t\t\t(?<spage>[0-9]+)\n\t\t\t\t\t[-|–]\n\t\t\t\t\t(?<epage>[0-9]+)\n\t\t\t\t\t/xu", $citation, $locator)) {
                $reference->volume = $locator['volume'];
                if (isset($locator['issue']) && $locator['issue'] !== '') {
                    $reference->issue = $locator['issue'];
                }
                $reference->spage = $locator['spage'];
                $reference->epage = $locator['epage'];
                $reference->genre = 'article';
                $matched = true;
            }

            // get title and year (everything between the year and the last '' marker)
            if (preg_match("/\\(?(?<year>[0-9]{4})\\)?:?\\s*(?<title>.*)''/", $citation, $head)) {
                $reference->year = $head['year'];
                $reference->title = trim($head['title']);
                // clean up title: remove quote markup, the journal name and a trailing volume
                $reference->title = str_replace("'", "", $reference->title);
                $reference->title = str_replace($reference->secondary_title, "", $reference->title);
                if (isset($reference->volume)) {
                    $reference->title = preg_replace(
                        '/' . preg_quote($reference->volume, '/') . '$/',
                        '',
                        trim($reference->title)
                    );
                }
                $reference->title = preg_replace('/\\.$/', '', trim($reference->title));
            }

            // journal may be a link to a wiki page, which may have a different name...
            if (preg_match('/\\[\\[((?<link>[A-Za-zÂÊÁÈÉËÍÎÌÏÓÔÒÛÚÙÜâêáèéëíîìïóôòûúùüÖÜÅÔØ0-9\\(\\)\\.,_\\- ]+)\\|(?<name>[A-Za-zÂÊÁÈÉËÍÎÌÏÓÔÒÛÚÙÜâêáèéëíîìïóôòûúùüÖÜÅÔØ0-9\\(\\)\\.,_\\- ]+))([^\\]\\]]+|(?R))*\\]\\]/', $reference->secondary_title, $link)) {
                $reference->secondary_title = $link['name'];
            }
            // ...or the same name
            $reference->secondary_title = preg_replace('/^\\[\\[(.*)\\]\\]$/', "\$1", $reference->secondary_title);

            // Authors: contents of the first {{aut|...}} template, split on ";", "&" and "|"
            if (preg_match_all("/\n\t\t\t\t\t(\\{\\{aut\\|([^\\}\\}]+|(?R))*\\}\\})\n\t\t\t\t\t/x", $citation, $a)) {
                $author_string = str_replace(array(";", "&"), "|", $a[2][0]);
                $reference->authors = array();
                foreach (explode("|", $author_string) as $author) {
                    reference_add_author_from_string($reference, $author);
                }
            }
        }
    }

    return $matched;
}
Example #2
0
/**
 * Parse a free-text citation into a reference object.
 *
 * The string is first cleaned (series designation extracted and removed, a
 * leading "ZOOTAXA" dropped, exotic dash characters in a trailing page range
 * normalised), then matched against an article pattern and, failing that, a
 * book pattern. Captured fields are copied onto a new stdclass object; authors
 * are added through reference_add_author_from_string() and Roman-numeral
 * volumes are converted with arabic() (both defined outside this function).
 *
 * When nothing matches, "FAILED TO PARSE" is echoed and the returned object
 * carries only the cleaned citation.
 *
 * @param string $citation Raw citation text
 *
 * @return stdclass Reference object; always has ->citation (the cleaned string)
 */
function process_citation($citation)
{
    $debug = false;
    $matched = false;
    $matches = array();
    $series = '';

    // Clean: pull out a series designation, "(Ser. 2)" or "Series 2,"
    if (preg_match('/\\(Ser\\. (?<series>\\d+)\\)/', $citation, $matches)) {
        $series = $matches['series'];
        $citation = preg_replace('/\\(Ser\\. \\d+\\)/', '', $citation);
    }
    // Literal "Series": a leading backslash here would turn "\S" into the
    // non-whitespace escape and match words such as "Faeries".
    if (preg_match('/Series (?<series>\\d+),/', $citation, $matches)) {
        $series = $matches['series'];
        $citation = preg_replace('/Series \\d+,/', '', $citation);
    }
    $citation = preg_replace('/^ZOOTAXA/', "", $citation);

    // Fix pagination character from ZootaxaPDF: a soft hyphen (U+00AD), "|",
    // em dash or en dash between the final page numbers becomes "-", and the
    // trailing period is dropped.
    $citation = preg_replace('/(\\d+)([\\x{00AD}|—–])(\\d+)\\.$/u', "\$1-\$3", $citation);

    // Article: Authors (year) Title. Journal, volume(issue), spage-epage
    if (preg_match('/
        (?<authorstring>.*)
        \\s+
        \\((?<year>[0-9]{4})[a-z]?(\\s+"[0-9]{4}")?\\)[\\.|:]?
        \\s+
        (?<title>(([^\\.]+|(?R))(\\.))+)
        (?<secondary_title>.*),
        \\s+
        (?<volume>(\\d+|([L|X|I|V]+)))
        (\\s*\\((?<issue>\\d+([-|\\/]\\d+)?)\\))?
        ,
        \\s+
        (?<spage>[e]?\\d+)
        (
        [-|\\x{00AD}—– ]
        (?<epage>\\d+)
        )?
        /xu', $citation, $matches)) {
        if ($debug) {
            echo __LINE__ . "\n";
            print_r($matches);
        }
        $matches['genre'] = 'article';
        $matched = true;
    }

    if (!$matched) {
        // Peters, J.A. 1964. Dictionary of Herpetology: A Brief and Meaningful Definition of Words and Terms Used in Herpetology. Hafner Publishing Company, New York. 393 pp.
        if (preg_match('/
            (?<authorstring>.*)
            \\s+
            [\\(]?(?<year>[0-9]{4})[a-z]?(\\s+"[0-9]{4}")?[\\)]?
            \\.?
            \\s+
            (?<title>(([^\\.]+|(?R))(\\.))+)
            \\s+
            (?<publisher>.*),
            (?<publoc>.*)
            [\\.|,]
            \\s+
            (?<pages>\\d+)
            \\s+
            pp.
            /xu', $citation, $matches)) {
            if ($debug) {
                echo __LINE__ . "\n";
                print_r($matches);
            }
            $matches['genre'] = 'book';
            $matched = true;
        }
    }

    // Post process
    $ref = new stdclass();
    $ref->citation = $citation;

    if (!$matched) {
        echo "FAILED TO PARSE\n----------------------------------\n";
        return $ref;
    }

    if ($series != '') {
        $ref->series = $series;
    }

    foreach ($matches as $k => $v) {
        // preg_match also returns every group under its numeric index. Skip
        // those: under PHP < 8 the loose switch comparison treats 0 as equal
        // to any non-numeric string and would file the whole match under the
        // first case.
        if (!is_string($k)) {
            continue;
        }
        switch ($k) {
            case 'genre':
            case 'secondary_title':
            case 'issue':
            case 'spage':
            case 'epage':
            case 'year':
            case 'publisher':
            case 'publoc':
            case 'pages':
                // Optional groups that did not match arrive as ''
                if ($v != '') {
                    $ref->{$k} = trim($v);
                }
                break;
            case 'title':
                // The title pattern captures its terminating period
                $ref->{$k} = preg_replace('/\\.$/', '', trim($v));
                break;
            case 'volume':
                // Clean up volume (if Roman convert to Arabic)
                if (!is_numeric($v) && preg_match('/^[MDCLXVI]+$/', $v)) {
                    $v = arabic($v);
                }
                $ref->{$k} = $v;
                break;
            case 'authorstring':
                // Normalise every author separator to "|" before splitting
                $v = preg_replace('/&/', '|', $v);
                $v = preg_replace('/(Jr\\.[,]?\\s+)/', "", $v);
                $v = preg_replace('/([A-Z]\\.),/', "\$1|", $v);
                $v = preg_replace('/\\|\\s*\\|/', "|", $v);
                $v = preg_replace('/\\|$/', "", $v);
                foreach (explode("|", $v) as $a) {
                    reference_add_author_from_string($ref, $a);
                }
                break;
            default:
                break;
        }
    }

    return $ref;
}