Exemplo n.º 1
0
function url2meta($url)
{
    global $config;
    $url = urldecode($url);
    $item = new stdClass();
    $item->status = 'failed';
    $item->authors = array();
    //echo $url;
    $match = array();
    //------------------------------------------------------------------------------
    // CiNii
    if (preg_match('/http:\\/\\/ci.nii.ac.jp\\/naid\\//', $url)) {
        $url = preg_replace('/\\/$/', '', $url);
        //echo $url;
        if (cinii_rdf($url . '/rdf', $item) == 0) {
            $item->status = 'ok';
            // fix this, some may have DOIs
            /*
            $doi = search_for_doi($item->issn, $item->volume, $item->spage, 'article', $item);		
            if ($doi != '')
            {
            	$item->doi = $doi;
            }
            */
        }
    }
    //------------------------------------------------------------------------------
    // DSpace
    if (preg_match('/\\/dspace\\//', $url)) {
        // AMNH
        if (preg_match('/http:\\/\\/digitallibrary.amnh.org\\/dspace\\//', $url)) {
            // rewrite URL
            $url = str_replace('http://digitallibrary.amnh.org/dspace/', '', $url);
            $url = str_replace('handle/', '', $url);
            $url = str_replace('bitstream/', '', $url);
            $url = preg_replace('/\\/1\\/N.*/', '', $url);
            $item->status = 'ok';
            $item->comment = 'url';
            $item->hdl = $url;
        }
        if (preg_match('/https:\\/\\/qir.kyushu-u.ac.jp\\/dspace\\//', $url)) {
            // rewrite URL
            $url = str_replace('https://qir.kyushu-u.ac.jp/dspace/', '', $url);
            $url = str_replace('handle/', '', $url);
            $url = str_replace('bitstream/', '', $url);
            $url = preg_replace('/\\/1\\/.*/', '', $url);
            $item->status = 'ok';
            $item->comment = 'url';
            $item->hdl = $url;
        }
    }
    //------------------------------------------------------------------------------
    // Blackwells
    if (preg_match('/http:\\/\\/www.blackwell-synergy.com\\//', $url)) {
        list($prefix, $suffix) = split('/doi/', $url);
        $suffix = str_replace("abs/", "", $suffix);
        $suffix = str_replace("full/", "", $suffix);
        $doi = preg_replace('/\\?.*/', '', $suffix);
        $item->status = 'ok';
        $item->doi = $doi;
    }
    //------------------------------------------------------------------------------
    // Cambridge
    if (preg_match('/http:\\/\\/journals.cambridge.org\\//', $url)) {
        fetchCambridge($url, $item);
    }
    //------------------------------------------------------------------------------
    // Ingenta
    if (preg_match('/http:\\/\\/www.ingentaconnect.com\\//', $url)) {
        fetchIngenta($url, $item);
    }
    //------------------------------------------------------------------------------
    // Informa
    if (preg_match('/http:\\/\\/www.informaworld.com\\//', $url)) {
        if ('' == $config['proxy_name']) {
            fetchInformaworld($url, $item);
        } else {
            if (preg_match('/http:\\/\\/www.informaworld.com\\/index\\/([0-9]+).pdf/', $url, $match)) {
                //echo $url;
                $url = 'http://www.informaworld.com/smpp/content~db=all~content=a' . $match[1];
                fetchInformaworld($url, $item);
            } else {
                fetchInformaworld($url, $item);
            }
        }
    }
    //------------------------------------------------------------------------------
    // T&F
    if (preg_match('/http:\\/\\/taylorandfrancis.metapress.com\\//', $url)) {
        // We get bounced to Informaworld, which has DC meta
        if ('' == $config['proxy_name']) {
            fetchInformaworld($url, $item);
        } else {
            if (preg_match('/http:\\/\\/www.informaworld.com\\/index\\/([0-9]+).pdf/', $url, $match)) {
                //echo $url;
                $url = 'http://www.informaworld.com/smpp/content~db=all~content=a' . $match[1];
                fetchInformaworld($url, $item);
            }
        }
    }
    //------------------------------------------------------------------------------
    // Wiley
    if (preg_match('/http:\\/\\/doi.wiley.com\\//', $url)) {
        $url = str_replace('http://doi.wiley.com/', '', $url);
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $url;
    }
    //------------------------------------------------------------------------------
    // Wiley
    if (preg_match('/http:\\/\\/onlinelibrary.wiley.com\\/doi\\/(?<doi>.*)\\/abstract/Uu', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match['doi'];
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/apsjournals.apsnet.org\\/doi\\/abs\\//', $url)) {
        $url = str_replace('http://apsjournals.apsnet.org/doi/abs/', '', $url);
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $url;
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/biology.plosjournals.org\\/perlserv(\\/)?\\?request=get\\-document&doi=(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match[2];
    }
    //------------------------------------------------------------------------------
    // http://www.journals.uchicago.edu/doi/pdf/10.1086/376890
    if (preg_match('/http:\\/\\/www.journals.uchicago.edu\\/doi\\/(abs|full|pdf)\\/(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match[2];
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/arjournals.annualreviews.org\\/doi\\/(abs|full)\\/(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match[2];
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/www.journals.uchicago.edu\\/cgi-bin\\/resolve\\?id=doi:(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match[1];
    }
    // New style...
    // http://www.bioone.org/doi/abs/10.1651/08-3058a.1?ai=tr&af=R
    // http://www.bioone.org/doi/pdf/10.3099/0006-9698-515.1.1
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/www.bioone.org\\/doi\\/(abs|full|pdf)\\/(?<doi>.*)[\\?]?/', $url, $match)) {
        //echo $match['doi'] . "\n";
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match['doi'];
        $pos = strpos($item->doi, '?');
        if ($pos === false) {
        } else {
            $item->doi = substr($item->doi, 0, $pos);
        }
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/pinnacle.allenpress.com\\/doi\\/(abs|full|pdf)\\/(?<doi>.*)\\??/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match['doi'];
        //print_r($item);
        $pos = strpos($item->doi, '?');
        if ($pos === false) {
        } else {
            $item->doi = substr($item->doi, 0, $pos);
        }
        //print_r($item);
    }
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/www.bioone.org\\/perlserv\\/\\?request=get-(abstract|document)&doi=(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $item->doi = $match[2];
        //echo $match[1];
        $cite_url = 'http://www.bioone.org/perlserv/?request=cite-builder&doi=' . urlencode($match[1]);
        //echo $cite_url;
        // Harvest as DOI may be broken...
        $html = get($cite_url);
        if ($html != '') {
            //echo $html;
            preg_match('/<a href="(\\?request=download-citation&#38;t=refman&#38;f=([0-9]{4}\\-[0-9]{3}([0-9]|X))(.*))">Reference Manager/', $html, $match);
            //print_r($match);
            if (isset($match[1])) {
                $issn = $match[2];
                $item->issn = $match[2];
                $ris_url = 'http://www.bioone.org/perlserv/' . $match[1];
                $ris_url = str_replace('&#38;', '&', $ris_url);
                //echo '<b>', $ris_url, '</b><br/>';
                $ris = get($ris_url);
                parseRIS($ris, $item);
                // Make DOI-safe URL
                $item->url = sprintf('http://www.bioone.org/perlserv/?request=get-abstract' . '&issn=%s' . '&volume=%03d' . '&issue=%02d' . '&page=%04d', $item->issn, $item->volume, $item->issue, $item->spage);
            }
        }
    }
    //------------------------------------------------------------------------------
    // Non-DOI version of BioOne URL
    if (preg_match('/http:\\/\\/www.bioone.org\\/perlserv\\/\\?request=get-abstract&issn=(.*)/', $url, $match)) {
        $item->status = 'ok';
        $item->comment = 'url';
        $html = get($url);
        if ($html != '') {
            if (preg_match('/<a href="\\?request=cite-builder&doi=(.*)">Create Reference/', $html, $match)) {
                $cite_url = 'http://www.bioone.org/perlserv/?request=cite-builder&doi=' . $match[1];
                // Harvest as DOI may be broken...
                $html = get($cite_url);
                if ($html != '') {
                    //echo $html;
                    preg_match('/<a href="(\\?request=download-citation&#38;t=refman&#38;f=([0-9]{4}\\-[0-9]{3}([0-9]|X))(.*))">Reference Manager/', $html, $match);
                    //print_r($match);
                    if (isset($match[1])) {
                        $issn = $match[2];
                        $item->issn = $match[2];
                        $ris_url = 'http://www.bioone.org/perlserv/' . $match[1];
                        $ris_url = str_replace('&#38;', '&', $ris_url);
                        //echo '<b>', $ris_url, '</b><br/>';
                        $ris = get($ris_url);
                        parseRIS($ris, $item);
                        // Make DOI-safe URL
                        $item->url = sprintf('http://www.bioone.org/perlserv/?request=get-abstract' . '&issn=%s' . '&volume=%03d' . '&issue=%02d' . '&page=%04d', $item->issn, $item->volume, $item->issue, $item->spage);
                    }
                }
            }
        }
    }
    //------------------------------------------------------------------------------
    // Springer (metapress)
    if (preg_match('/http:\\/\\/www.springerlink.com\\//', $url)) {
        $url = str_replace('http://www.springerlink.com/index/', '', $url);
        $url = str_replace('.pdf', '', $url);
        $url = 'http://www.springerlink.com/content/' . $url;
        //print $url;
        $html = get($url);
        //echo $html;
        if ($html != '') {
            $item->status = 'ok';
            $match = array();
            preg_match('/<td class="labelName">DOI<\\/td><td class="labelValue">(.*)<\\/td>/', $html, $match);
            if (isset($match[1])) {
                $item->doi = $match[1];
            }
        }
    }
    //------------------------------------------------------------------------------
    // Royal Society (Metapress)
    if (preg_match('/http:\\/\\/www.journals.royalsoc.ac.uk\\//', $url)) {
        // rewrite URL
        $url = str_replace('http://www.journals.royalsoc.ac.uk/index/', '', $url);
        $url = str_replace('.pdf', '', $url);
        $url = 'http://journals.royalsociety.org/content/' . $url;
        // look for DOI
        $html = get($url);
        if ($html != '') {
            $item->status = 'ok';
            $match = array();
            preg_match('/DOI<\\/td><td class="labelValue">(.*)<\\/td>/', $html, $match);
            if (isset($match[1])) {
                $item->doi = $match[1];
            }
        }
    }
    //------------------------------------------------------------------------------
    // NCBI
    if (preg_match('/http:\\/\\/www.ncbi.nlm.nih.gov/', $url)) {
        $match = array();
        preg_match('/list_uids=([0-9]+)/', $url, $match);
        if (isset($match[1])) {
            $item->status = 'ok';
            $item->comment = 'url';
            $item->pmid = $match[1];
        }
        if (preg_match('/http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/([0-9]+)/', $url, $match)) {
            $item->status = 'ok';
            $item->comment = 'url';
            $item->pmid = $match[1];
        }
    }
    #http://www.ncbi.nlm.nih.gov/pubmed/11029000
    //------------------------------------------------------------------------------
    // Allen Press (another DOI-buggering organisation)
    if (preg_match('/http:\\/\\/apt.allenpress.com\\//', $url)) {
        // Unpack URL
        //echo 'allen';
        parseAllenUrl($url, $item);
    }
    //------------------------------------------------------------------------------
    // Raffles
    if (preg_match('/http:\\/\\/rmbr.nus.edu.sg\\/rbz\\//', $url)) {
        // extract details from URL
        $match = array();
        preg_match('/http:\\/\\/rmbr.nus.edu.sg\\/rbz\\/biblio\\/([0-9]+)\\/([0-9]+)rbz([0-9]+)(\\-([0-9]+))?.pdf/', $url, $match);
        /*	echo '<pre>';
        		print_r($match);
        		echo '</pre>';*/
        if (6 == count($match)) {
            $item->status = 'ok';
            $spage = $match[3];
            $spage = preg_replace('/^0*/', '', $spage);
            $epage = $match[5];
            $epage = preg_replace('/^0*/', '', $epage);
            $item->spage = $spage;
            $item->epage = $epage;
            $item->volume = $match[1];
            $item->issn = '0217-2445';
            $item->url = $url;
        }
    }
    //------------------------------------------------------------------------------
    // CSIRO
    // http://www.publish.csiro.au/nid/150/paper/display/citation/paper/SB9910229.htm
    //echo $url;
    if (preg_match('/(http:\\/\\/www.publish.csiro.au\\/\\?paper=(.*)|http:\\/\\/www.publish.csiro.au\\/paper\\/(.*).htm|http:\\/\\/www.publish.csiro.au\\/nid\\/[0-9]+\\/display\\/citation\\/paper\\/(.*).htm|http:\\/\\/www.publish.csiro.au\\/\\?act=view_file&file_id=(.*).pdf|http:\\/\\/www.publish.csiro.au\\/nid\\/[0-9]+\\/paper\\/(.*).htm)/', $url, $match)) {
        //print_r($match);
        //echo '<br/>';
        $id = '';
        if (isset($match[2])) {
            $id = $match[2];
        }
        if (isset($match[3])) {
            $id = $match[3];
        }
        if (isset($match[5])) {
            $id = $match[5];
        }
        $url = 'http://www.publish.csiro.au/?paper=' . $id;
        $html = get($url);
        if ($html != '') {
            $item->status = 'ok';
            $match = array();
            $ms = '/doi:((.*)\\/' . $id . ')/';
            //echo $ms, '<br/>';
            preg_match($ms, $html, $match);
            //print_r($match);
            if (isset($match[1])) {
                $item->doi = $match[1];
            }
        }
        /*		print_r($match);
        		
        		$ris_url = 'http://www.publish.csiro.au/view/journals/dsp_journal_retrieve_citation.cfm?ct='
        		 . $match[2] . '.ris';
        		 
        		$ris = get($ris_url);
        		
        		if ($ris == '')
        		{
        			$item->status = 'failed';
        		}
        		else
        		{
        			$item->status = 'ok';
        			parseRIS($ris, $item);
        		}*/
    }
    //http://www.amjbot.org/cgi/content/short/96/7/1348?rss=1
    //------------------------------------------------------------------------------
    if (preg_match('/http:\\/\\/sjg.lyellcollection.org\\/content\\/(?<item>([0-9]+\\/[0-9]+\\/[0-9]+))/', $url, $match)) {
        print_r($match);
        $ris_url = 'http://sjg.lyellcollection.org/citmgr?type=refman&gca=sjg;' . $match['item'];
        $ris = get($ris_url);
        //echo $ris;
        if ($ris == '') {
            $item->status = 'failed';
        } else {
            $item->status = 'ok';
            parseRIS($ris, $item);
        }
    }
    //------------------------------------------------------------------------------
    // Highwire Press
    if (preg_match('/http:\\/\\/(?<prefix>(www.))?(?<journal>.*).org\\/((content\\/(abstract|short))|(reprint))\\/(?<item>([0-9]+\\/[0-9]+\\/[0-9]+))/', $url, $match)) {
        //print_r($match);
        $ris_url = 'http://' . $match['prefix'] . $match['journal'] . '.org/cgi/citmgr?type=refman&gca=';
        //echo $match['journal'], '<br/>';
        switch ($match['journal']) {
            case 'aob.oxfordjournals':
                $ris_url .= 'annbot';
                $item->issn = '0305-7364';
                break;
            case 'biolbull':
                $ris_url .= 'biolbull';
                $item->issn = '0006-3185';
                break;
            case 'sciencemag':
                $ris_url .= 'sci';
                $item->issn = '0036-8075';
                break;
            case 'nar.oxfordjournals':
                $ris_url .= 'nar';
                $item->issn = '0305-1048';
                break;
            case 'aem.asm':
                $ris_url .= 'aem';
                $item->issn = '0099-2240';
                break;
            case 'ijs.sgmjournals':
            case 'ijsb.sgmjournals':
                $ris_url .= 'ijs';
                $item->issn = '1466-5026';
                break;
            case 'bioinformatics.oxfordjournals':
                $ris_url .= 'bioinfo';
                $item->issn = '1367-4803';
                break;
            case 'icb.oxfordjournals':
                $ris_url .= 'icbiol';
                $item->issn = '1540-7063';
                break;
            case 'jeb.biologists':
                $ris_url .= 'jexbio';
                $item->issn = '0022-0949';
                break;
            case 'jvi.asm':
                $ris_url .= 'jvi';
                $item->issn = '0022-538X';
                break;
            case 'jwildlifedis':
                $ris_url .= 'wildlifedis';
                $item->issn = '0090-3558';
                break;
            case 'mbe.oxfordjournals':
                $ris_url .= 'molbiolevol';
                $item->issn = '0737-4038';
                break;
            case 'mollus.oxfordjournals':
                $ris_url .= 'mollus';
                $item->issn = '0260-1230';
                break;
            case 'studiesinmycology':
                $ris_url .= 'simycol';
                $item->issn = '0166-0616';
                break;
            case 'beheco.oxfordjournals':
                $ris_url .= 'beheco';
                $item->issn = '1045-2249';
                break;
            case 'aje.oxfordjournals':
                $ris_url .= 'amjepid';
                $item->issn = '0002-9262';
                break;
            case 'reproduction-online':
                $ris_url .= 'reprod';
                $item->issn = '1470-1626';
                break;
            case 'sjg.lyellcollection':
                $ris_url .= 'sjg';
                $item->issn = '0036-9276';
                break;
            default:
                $ris_url .= $match['journal'];
                break;
        }
        $ris_url .= ';' . $match['item'];
        //echo $ris_url;
        $ris = get($ris_url);
        //echo $ris;
        if ($ris == '') {
            $item->status = 'failed';
        } else {
            $item->status = 'ok';
            parseRIS($ris, $item);
        }
    }
    // JSTOR stable
    // http://www.jstor.org/stable/pdfplus/1446094.pdf
    if (preg_match('/http:\\/\\/www.jstor.org\\/(pss|stable)\\/(pdfplus\\/)?(?<id>[0-9]+)(\\.pdf)?/', $url, $match)) {
        //print_r($match);
        $id = $match['id'];
        $html = '';
        //echo $html;
        if ('' == $config['proxy_name']) {
            // Outside Glasgow so we get metadata directly
            $html = get($url);
        } else {
            // At Glasgow, one more step
            $u = 'http://www.jstor.org/stable/info/' . $id . '?seq=1';
            $html = get($u);
        }
        //echo $html;
        // Add line feeds so regular expression works
        $html = str_replace('<meta', "\n<meta", $html);
        // Pull out the meta tags
        preg_match_all("|<meta\\s*name=\"(dc.[A-Za-z]*)\"\\s*(scheme=\"(.*)\")?\\s*(content=\"(.*)\")><\\/meta>|", $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        /*
        echo "<pre>";
        print_r($out);
        echo "</pre>";
        */
        parseDcMeta($out, $item);
        $html = str_replace("\n", "", $html);
        if (preg_match('/
		<title>
		JSTOR:\\s+
		(?<title>(.*)),
		\\s+
		Vol.\\s+(?<volume>\\d+),
		(\\s+No.\\s+(?<issue>\\d+([-\\/]\\d+)?)\\s+)?
		\\(((.*),\\s+)?(?<year>[0-9]{4})\\),\\s+
		pp.\\s+(?<spage>\\d+)-(?<epage>\\d+)
		<\\/title>
		/Ux', $html, $matches)) {
            //print_r($matches);
            $item->title = $matches['title'];
            $item->volume = $matches['volume'];
            $item->issue = $matches['issue'];
            $item->spage = $matches['spage'];
            $item->epage = $matches['epage'];
            $item->year = $matches['year'];
        }
        // Kill DOI on assumption we wouldn't be harvesting stable URL if DOI worked
        //unset($item->doi);
        if (!isset($item->issn)) {
            $issn = issn_from_journal_title($item->title);
            if ($issn != '') {
                $item->issn = $issn;
            }
        }
        if (isset($item->issn)) {
            $doi = search_for_doi($item->issn, $item->volume, $item->spage, 'article', $item);
            if ($doi == '') {
                unset($item->doi);
            } else {
                $item->doi = $doi;
            }
        }
        $item->status = 'ok';
        $item->url = 'http://www.jstor.org/stable/' . $id;
    }
    //------------------------------------------------------------------------------
    // JSTOR
    if (preg_match('/http:\\/\\/(www|links).jstor.org\\/sici\\?sici=(.*)/', $url, $match)) {
        $sici = $match[2];
        $html = get($url);
        //echo $html;
        if ('' == $config['proxy_name']) {
            // Outside Glasgow so we get metadata directly
        } else {
            // Inside Glasgow, we are licensed, so we need one more step
            // Extract stable identifier
            preg_match('/&amp;suffix=([0-9]+)/', $html, $match);
            //print_r($match);
            if (isset($match[1])) {
                $item->stable = $match[1];
                $item->url = 'http://www.jstor.org/stable/' . $match[1];
                // ok, harvest
                $html = get('http://www.jstor.org/stable/info/' . $match[1]);
            }
        }
        // Add line feeds so regular expression works
        $html = str_replace('<meta', "\n<meta", $html);
        // Pull out the meta tags
        preg_match_all("|<meta\\s*name=\"(dc.[A-Za-z]*)\"\\s*(scheme=\"(.*)\")?\\s*(content=\"(.*)\")><\\/meta>|", $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        /*echo "<pre>";
        		echo htmlentities($r);
        		echo "</pre>";*/
        parseDcMeta($out, $item);
        // Can we get anything more out of SICI?
        $out = unpack_sici($sici);
        if (isset($out['issn'])) {
            $item->issn = $out['issn'];
        }
        if (isset($out['year'])) {
            $item->year = $out['year'];
        }
        if (isset($out['volume'])) {
            $item->volume = $out['volume'];
        }
        if (isset($out['issue'])) {
            $item->issue = $out['issue'];
        }
        if (isset($out['site'])) {
            $item->spage = $out['site'];
        }
        $item->status = 'ok';
        $item->sici = $sici;
    }
    //------------------------------------------------------------------------------
    // Elsevier
    // http://linkinghub.elsevier.com/retrieve/pii/S0885576501903558
    if (preg_match('/http:\\/\\/linkinghub.elsevier.com/', $url)) {
        $html = get($url);
        //echo $html;
        preg_match('/doi:(.*)<\\/a>&nbsp;/', $html, $match);
        if (isset($match[1])) {
            $item->status = 'ok';
            $item->doi = $match[1];
        }
        if (preg_match('/http:\\/\\/linkinghub.elsevier.com\\/retrieve\\/pii\\/(?<pii>S(.*))/', $url, $match)) {
            $item->pii = $match['pii'];
        }
        $item->status = 'ok';
    }
    //------------------------------------------------------------------------------
    //http://www.sciencedirect.com/science?_ob=GatewayURL&_origin=IRSSSEARCH&_method=citationSearch&_piikey=S072320200800043X&_version=1&md5=952f39fe9bd26f49f29dd4282cc2f224
    if (preg_match('/http:\\/\\/www.sciencedirect.com/', $url)) {
        $html = get($url);
        //echo $html;
        preg_match('/doi:(.*)<\\/a>&nbsp;/', $html, $match);
        if (isset($match[1])) {
            $item->status = 'ok';
            $item->doi = $match[1];
        }
        $item->status = 'ok';
    }
    // http://rparticle.web-p.cisti.nrc.ca/rparticle/AbstractTemplateServlet?calyLang=eng&journal=cjz&volume=56&year=0&issue=3&msno=z78-059
    if (preg_match('/http:\\/\\/rparticle.web\\-p.cisti.nrc.ca/', $url)) {
        $html = get($url);
        $item->url = $url;
        // NRC does things a little differently...
        preg_match_all("|<meta\\s*name=\"(?<key>((?<namespace>[A-Za-z]+)\\.)?(?<value>[A-Za-z]+))\"\\s*(content=\"(?<content>.*)\")\\s*\\/>|", $html, $out, PREG_PATTERN_ORDER);
        //		$r = print_r ($out, true);
        //print_r($out);
        $authors = array();
        foreach ($out['key'] as $k => $v) {
            //echo $k . ' ' . $v . " " . $out['content'][$k] . "\n";
            switch ($v) {
                case 'dc.title':
                    $atitle = $out['content'][$k];
                    $atitle = html_entity_decode($atitle, ENT_QUOTES, "utf-8");
                    $atitle = strip_tags($atitle);
                    $item->atitle = $atitle;
                    break;
                case 'title.journal':
                    if (!isset($item->title)) {
                        $item->title = $out['content'][$k];
                        switch ($item->title) {
                            case 'Can. J. Zool.':
                                $item->issn = '0008-4301';
                                break;
                            default:
                                break;
                        }
                    }
                    break;
                case 'author':
                    if (count($authors) == 0) {
                        $a = $out['content'][$k];
                        // split string
                        // Protect Jr
                        $a = str_replace(", JR", " Jr", $a);
                        $authors = str_replace(", Jr", " Jr", $a);
                        $a = preg_replace("/,? and /", "|", $a);
                        $a = str_replace(", ", "|", $a);
                        $authors = explode("|", $a);
                    }
                    break;
                case 'identifier.doi':
                    $item->doi = $out['content'][$k];
                    break;
                case 'identifier.volume':
                    $item->volume = $out['content'][$k];
                    break;
                case 'identifier.issue':
                    $item->issue = $out['content'][$k];
                    break;
                case 'identifier.startpage':
                    $item->spage = $out['content'][$k];
                    break;
                case 'date.published':
                    $item->year = $out['content'][$k];
                    break;
                default:
                    $item->status = 'ok';
                    break;
            }
        }
        // Abstract
        $html = str_replace("\n", " ", $html);
        $html = str_replace("\r", " ", $html);
        $match = array();
        if (preg_match('/<a name="abs_english" id="abs_english"><!--abs_english--><\\/a>(.*)<a name="abs_french" id="abs_french">/', $html, $match)) {
            print_r($match);
            $abstract = $match[1];
            $abstract = str_replace("<b>Abstract: </b>", "", $abstract);
            $abstract = html_entity_decode($abstract, ENT_QUOTES, "utf-8");
            $abstract = strip_tags($abstract);
            $abstract = trim($abstract);
            $item->abstract = $abstract;
        }
        // Clean up authors
        $item->authors = array();
        $authors = array_unique($authors);
        foreach ($authors as $a) {
            $a = mb_convert_case($a, MB_CASE_TITLE, mb_detect_encoding($a));
            // Get parts of name
            $parts = parse_name($a);
            $author = new stdClass();
            if (isset($parts['last'])) {
                $author->lastname = $parts['last'];
            }
            if (isset($parts['suffix'])) {
                $author->suffix = $parts['suffix'];
            }
            if (isset($parts['first'])) {
                $author->forename = $parts['first'];
                if (array_key_exists('middle', $parts)) {
                    $author->forename .= ' ' . $parts['middle'];
                }
            }
            array_push($item->authors, $author);
        }
    }
    // Get some metadata from Naturalis (because their OAI service is broken)
    if (preg_match('/http:\\/\\/www.repository.naturalis.nl\\/record/', $url)) {
        $html = get($url);
        //echo $html;
        $item->url = $url;
        $match = array();
        if (preg_match("/Pages<\\/td><td class=\"value\">(?<spage>[0-9]+)(\\-(?<epage>[0-9]+))?/", $html, $match)) {
            //print_r($match);
            $item->spage = $match['spage'];
            $item->epage = $match['epage'];
        }
    }
    //------------------------------------------------------------------------------
    // Scielo
    // http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1679-62252009000100001&lng=en&nrm=iso&tlng=en
    if (preg_match('/http:\\/\\/www.scielo.br/', $url)) {
        //echo $url;
        if (preg_match('/pid=(?<id>S(?<issn>[0-9]{4}\\-[0-9]{3}[0-9|X])[0-9]+)/', $url, $match)) {
            //print_r($match);
            $url = 'http://www.scielo.br/scieloOrg/php/articleXML.php?pid=' . $match['id'] . '&lang=en';
            //echo "Url:$url\n";
            $xml = get($url);
            //echo $xml;
            $xp = new XsltProcessor();
            $xsl = new DomDocument();
            $xsl->load('xsl/scielo.xsl');
            $xp->importStylesheet($xsl);
            $xml_doc = new DOMDocument();
            $xml_doc->loadXML($xml);
            $json = $xp->transformToXML($xml_doc);
            $item = json_decode($json);
            // Handle cases like spage = 01
            if (isset($item->spage)) {
                $item->spage = preg_replace('/^0/', '', $item->spage);
            }
            // ISSN is set in XSL, but make sure it is correct in case we forget...
            if (isset($item->issn)) {
                $item->issn = $match['issn'];
            }
            //print_r($obj);
            $item->status = 'ok';
        }
        /*
        		
        		$html = get($url);
        	
        		//echo $html;		
        			
        		preg_match('/doi:\s*(?<doi>10.[0-9]+\/(S[0-9]{4}\-[0-9X]*))/', $html, $match);
        		if (isset($match[1]))
        		{
        			$item->status = 'ok';
        			$item->doi = $match['doi'];
        			$item->url = $url;
        			
        			//print_r($match);
        		}
        		preg_match('/pid=(S[0-9]{4}\-[0-9X]*)&/', $url, $match);
        		if (isset($match[1]))
        		{
        			$item->publisher_id = $match[1];
        			$item->xml_url = 'http://www.scielo.br/scieloOrg/php/articleXML.php?pid=' . $item->publisher_id . '&lang=en';
        			
        			// If we haven't gotten a DOI do something here...
        		}
        		
        		$item->status = 'ok';
        */
    }
    //------------------------------------------------------------------------------
    // Scielo Chile
    if (preg_match('/http:\\/\\/www.scielo.cl/', $url)) {
        $html = get($url);
        //echo $html;
        preg_match('/doi:\\s*(?<doi>10.[0-9]+\\/(S[0-9]{4}\\-[0-9X]*))/', $html, $match);
        if (isset($match[1])) {
            $item->status = 'ok';
            $item->doi = $match['doi'];
            $item->url = $url;
            //print_r($match);
        }
        preg_match('/pid=(S[0-9]{4}\\-[0-9X]*)&/', $url, $match);
        if (isset($match[1])) {
            $item->publisher_id = $match[1];
            $item->xml_url = 'http://www.scielo.cl/scieloOrg/php/articleXML.php?pid=' . $item->publisher_id . '&lang=en';
            // If we haven't gotten a DOI do something here...
        }
        $item->status = 'ok';
    }
    //------------------------------------------------------------------------------
    // hindawi
    // http://www.hindawi.com/GetArticle.aspx?doi=10.1155/1875/15751
    if (preg_match('/http:\\/\\/www.hindawi.com/', $url, $match)) {
        if (preg_match('/doi=(.*)$/', $url, $match)) {
            //print_r($match);
            $item->status = 'ok';
            $item->doi = $match[1];
        } else {
            $html = get($url);
            //echo $html;
            preg_match("/doi:(.*)<\\/pre>/", $html, $match);
            if (isset($match[1])) {
                $item->status = 'ok';
                $item->doi = $match[1];
                $item->url = $url;
                //print_r($match);
            }
        }
    }
    //------------------------------------------------------------------------------
    // Zookeys
    if (preg_match('/http:\\/\\/pensoftonline.net\\/zookeys\\/(.*)\\/(?<article>([0-9]+))$/', $url, $match)) {
        $url = 'http://pensoftonline.net/zookeys/index.php/journal/article/viewArticle/' . $match['article'];
        //echo $url;
        $html = get($url);
        //echo $html;
        // Get meta tags (may have useful info, such as issn)
        preg_match_all('|<meta[^>]+name=\\"([^\\"]*)\\"\\s*(scheme=\\"([^\\"]*)\\"\\s*)*content=\\"([^\\"]*)\\"[^>]+>|', $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        //print_r($r);
        parseDcMeta($out, $item);
        //print_r($item);
        //exit();
        $item->status = 'ok';
    }
    //------------------------------------------------------------------------------
    // Nature
    // http://www.hindawi.com/GetArticle.aspx?doi=10.1155/1875/15751
    if (preg_match('/http:\\/\\/www.nature.com\\//', $url, $match)) {
        $html = get($url);
        //echo $html;
        // Get meta tags (may have useful info, such as issn)
        preg_match_all('|<meta[^>]+name=\\"([^\\"]*)\\"\\s*(scheme=\\"([^\\"]*)\\"\\s*)*content=\\"([^\\"]*)\\"[^>]+>|', $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        //print_r($r);
        parseDcMeta($out, $item);
        //print_r($item);
        //exit();
        $item->status = 'ok';
    }
    //
    // http://www.akademiai.com/content/w1t1p0022802658q/
    if (preg_match('/http:\\/\\/www.akademiai.com\\/content\\//', $url, $match)) {
        $html = get($url);
        //echo $html;
        if (preg_match('/DOI<\\/td><td class="labelValue">(?<doi>(.*))<\\/td>/', $html, $match)) {
            if (isset($match['doi'])) {
                $item->status = 'ok';
                $item->doi = $match['doi'];
                $item->url = $url;
            }
        }
        $item->status = 'ok';
    }
    // New Wiley
    // http://www3.interscience.wiley.com/journal/118594989/abstract
    if (preg_match('/http:\\/\\/www3.interscience.wiley.com\\/journal\\/[0-9]+\\/abstract/', $url, $match)) {
        $html = get($url);
        preg_match_all('|<meta[^>]+name=\\"([^\\"]*)\\"\\s*(scheme=\\"([^\\"]*)\\"\\s*)*content=\\"([^\\"]*)\\"[^>]+>|', $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        $citation = parse_google_citation($out);
        /*		if (isset($citation['citation_doi']))
        		{
        			$item->status = 'ok';
        			$item->doi = $citation['citation_doi'];
        			$item->url = $url;
        		}			
        */
        if (isset($citation['citation_issn'])) {
            $item->atitle = $citation['citation_title'];
            $item->title = $citation['citation_journal_title'];
            $item->issn = $citation['citation_issn'];
            $item->volume = $citation['citation_volume'];
            $item->issue = $citation['citation_issue'];
            $item->spage = $citation['citation_firstpage'];
            $item->epage = $citation['citation_lastpage'];
            $item->year = $citation['citation_date'];
            $item->publisher_id = $citation['citation_id'];
            $item->url = $url;
            if (preg_match("/[0-9]{4}\\/[0-9]{1,2}\\/[0-9]{1,2}/", $item->year)) {
                // Save the date (taxonomists care about this)
                $item->date = $item->year;
                if (-1 != strtotime($date)) {
                    $item->year = date("Y", strtotime($item->year));
                }
            }
            $item->doi = $citation['citation_doi'];
            $author_string = $citation['citation_authors'];
            $a = explode(";", trim($author_string));
            foreach ($a as $value) {
                // Get parts of name
                $parts = parse_name($value);
                $author = new stdClass();
                if (isset($parts['last'])) {
                    $author->lastname = $parts['last'];
                }
                if (isset($parts['suffix'])) {
                    $author->suffix = $parts['suffix'];
                }
                if (isset($parts['first'])) {
                    $author->forename = $parts['first'];
                    if (array_key_exists('middle', $parts)) {
                        $author->forename .= ' ' . $parts['middle'];
                    }
                }
                $item->authors[] = $author;
            }
            $item->status = 'ok';
        }
        //print_r($citation);
        //print_r($item);
        /*if (preg_match('/<span class="doi">(?<doi>.*)<\/span>/', $html, $match))
        		{
        			if (isset($match['doi']))
        			{
        				$item->status = 'ok';
        				$item->doi = $match['doi'];
        				$item->url = $url;
        			}
        		}
        		*/
        //$item->status = 'ok';
    }
    //
    // http://cat.inist.fr/?aModele=afficheN&cpsidt=6892704
    if (preg_match('/http:\\/\\/cat.inist.fr\\/\\?aModele=afficheN&cpsidt=/', $url, $match)) {
        $html = get($url);
        //echo $url;
        //echo $html;
        $html = str_replace('<meta', "\n<meta", $html);
        //echo $html;
        // <meta name="citation_issue" content="4">
        preg_match_all('|<meta[^>]+name=\\"([^\\"]*)\\"\\s+(scheme=\\"(.*)\\")?content=\\"(.*)\\">|U', $html, $out, PREG_PATTERN_ORDER);
        $r = print_r($out, true);
        /*echo "<pre>";
        		print_r($out);
        		echo "</pre>";*/
        $citation = parse_google_citation($out);
        /*if (isset($citation['citation_doi']))
        		{
        			$item->status = 'ok';
        			$item->doi = $citation['citation_doi'];
        			$item->url = $url;
        		}	
        		*/
        if (isset($citation['citation_issn'])) {
            $item->atitle = $citation['citation_title'];
            $item->title = $citation['citation_journal_title'];
            $item->issn = $citation['citation_issn'];
            $item->volume = $citation['citation_volume'];
            $item->issue = $citation['citation_issue'];
            $item->spage = $citation['citation_firstpage'];
            $item->epage = $citation['citation_lastpage'];
            $item->year = $citation['citation_date'];
            $item->publisher_id = $citation['citation_id'];
            $item->url = $url;
            $author_string = $citation['citation_authors'];
            $a = explode(";", trim($author_string));
            foreach ($a as $value) {
                // Get parts of name
                $parts = parse_name($value);
                $author = new stdClass();
                if (isset($parts['last'])) {
                    $author->lastname = $parts['last'];
                }
                if (isset($parts['suffix'])) {
                    $author->suffix = $parts['suffix'];
                }
                if (isset($parts['first'])) {
                    $author->forename = $parts['first'];
                    if (array_key_exists('middle', $parts)) {
                        $author->forename .= ' ' . $parts['middle'];
                    }
                }
                $item->authors[] = $author;
            }
            $item->status = 'ok';
        }
    }
    /*
    //http://pensoftonline.net/zookeys/index.php/journal/article/viewArticle/448
    
    if (preg_match('/http:\/\/www.pensoft.net\/journals\/zookeys\/article\//', $url))
    {
    	$html = get($url);
    	//echo $url;
    	//echo $html;
    	
    	//$html = str_replace("\n", ' ', $html);
    	preg_match_all('/<meta name="(?<name>.*)"\s+content="(?<content>.*)"\s+\/>/Uum',  $html, $m);
    	
    	echo '<pre>';
    	print_r($m);
    	echo '</pre>';
    	
    	$n = count($m[0]);
    	
    	foreach ($i=0;$i<$n;$i++)
    	{
    		switch ($m['name'][$i])
    		{
    			case 'dc.identifier':
    				$item->doi = $m['content'][$i];
    				break;
    				
    			case 'prism.issn':
    				$item->issn = $m['content'][$i];
    				break;
    			case 'prism.publicationName':
    				$item->title = $m['content'][$i];
    				break;
    				
    			default:
    				break;
    		}
    			
    	}
    	
    	print_r($item);
    				
    	
    	
    	
    }
    */
    //echo __LINE__ . "\n";
    //print_r($item);
    return $item;
}
Exemplo n.º 2
0
/**
 * Look up an article on JSTOR from its SICI and copy what is found into $item.
 *
 * The SICI is resolved through links.jstor.org; the returned HTML is scanned
 * for "dc.*" <meta> tags, which are handed to parseDcMeta(). ISSN, year,
 * volume, issue and starting page are then taken from the unpacked SICI, a
 * stable JSTOR URL is derived from the DOI (if any), and the DOI is checked
 * with doi_metadata() and dropped when that lookup fails.
 *
 * @param string $sici SICI identifying the article.
 * @param object $item Metadata object, modified in place (passed by reference).
 *
 * @return bool false when JSTOR answers with its "We're Sorry" page,
 *              true otherwise.
 */
function jstor_metadata($sici, &$item)
{
    global $config;
    global $debug;
    $found = false;
    $url = 'http://links.jstor.org/sici?sici=' . urlencode($sici);
    //echo $url;
    $html = get($url);
    if ($debug) {
        echo '<pre style="text-align: left;border: 1px solid #c7cfd5;background: #f1f5f9;padding:15px;">';
        echo $url . "\n";
        echo htmlentities($html);
        echo "</pre>";
    }
    // Check for any error messages
    if (preg_match("/<h2>We're Sorry<\\/h2>/", $html)) {
        return $found;
    }
    $found = true;
    // A non-empty proxy name means we are inside Glasgow and licensed, so the
    // metadata sits one step further on, at the "stable/info" page. Outside
    // Glasgow (no proxy configured) the metadata is read from $html directly.
    if (isset($config['proxy_name']) && '' != $config['proxy_name']) {
        // Extract stable identifier
        if (preg_match('/stable\\/info\\/(?<jstorid>\\d+)\\?/', $html, $match)) {
            $jstor_id = $match['jstorid'];
            $item->url = 'http://www.jstor.org/stable/' . $jstor_id;
            // ok, harvest
            $html = get('http://www.jstor.org/stable/info/' . $jstor_id);
        }
    }
    //echo "url=" . $item->url;
    // Add line feeds so the regular expression (whose "." does not cross
    // newlines) matches one <meta> tag at a time
    $html = str_replace('<meta', "\n<meta", $html);
    // Pull out the meta tags
    preg_match_all("|<meta\\s*name=\"(dc.[A-Za-z]*)\"\\s*(scheme=\"(.*)\")?\\s*(content=\"(.*)\")><\\/meta>|", $html, $out, PREG_PATTERN_ORDER);
    parseDcMeta($out, $item);
    if ($debug) {
        echo '<h3>metadata</h3>';
        print_r($out);
    }
    // From here on $out holds the components of the SICI, not the meta tags
    $out = unpack_sici($sici);
    //print_r($out);
    if (isset($out['issn'])) {
        $item->issn = $out['issn'];
    }
    if (isset($out['year'])) {
        $item->year = $out['year'];
    }
    // Some JSTOR articles, such as Copeia, have all three elements in the enumeration,
    // so that the volume and issue are the second and third elements
    if (isset($out['locn'])) {
        // Guard on the key that is actually read ('issue'); 'locn' is already
        // known to be set in this branch.
        if (isset($out['issue'])) {
            $item->volume = $out['issue'];
            $item->issue = $out['locn'];
        }
    } else {
        if (isset($out['volume'])) {
            $item->volume = $out['volume'];
        }
        if (isset($out['issue'])) {
            $item->issue = $out['issue'];
        }
    }
    if (isset($out['site'])) {
        $item->spage = $out['site'];
    }
    // Handle identifiers
    // Make stable URL
    if (isset($item->doi)) {
        $item->url = 'http://www.jstor.org/stable/' . str_replace("10.2307/", "", $item->doi);
    }
    // Is the DOI valid? (not all DOIs in the HTML metadata are valid)
    if (isset($item->doi)) {
        $crossref_item = new stdClass();
        $exists = doi_metadata($item->doi, $crossref_item);
        if ($exists) {
            // DOI is cool, so add journal name
            if (isset($crossref_item->title)) {
                $item->title = $crossref_item->title;
            }
        } else {
            // Dud DOI, so remove it from the metadata
            unset($item->doi);
        }
    }
    // Might not have journal name; fall back to an ISSN lookup, but only when
    // an ISSN was actually obtained
    if (!isset($item->title) && isset($item->issn)) {
        $title = journal_title_from_issn($item->issn);
        if ($title != '') {
            $item->title = $title;
        }
    }
    return $found;
}