/**
 * Parse the local copy of the PubChem substances directory.
 *
 * Every entry of "<indir>/substances/" (other than "." and "..") is handed to
 * parse_substance_file(), with output written to
 * "<outdir>/substances/<name>.<output_format>", where <name> is the entry's
 * basename with a trailing ".xml.gz" removed. After all entries are processed,
 * a dataset description (source + output DataResource records) is written to
 * the Bio2RDF release file in the same output directory.
 *
 * Terminates the script via exit if the input directory cannot be opened.
 */
function parse_substances()
{
    $ignore = array(".", "..");
    $input_dir = $this->getParameterValue('indir') . "/substances/";

    $this->CreateDirectory($this->getParameterValue('outdir') . "/substances/");

    parent::setDatasetURI("bio2rdf_dataset:bio2rdf-" . $this->getPcsPrefix() . "-" . date("Ymd"));

    // Remember the current graph URI so it can be restored once parsing is done.
    $graph_uri = parent::getGraphURI();

    // set graph URI to dataset uri
    if (parent::getParameterValue('dataset_graph') == true) {
        parent::setGraphURI(parent::getDatasetURI());
    }

    // The output format is the same for every file, so resolve it (and the
    // gzip flag derived from it) once, up front. Doing this outside the loop
    // also keeps the dataset description correct when the directory is empty.
    $output_format = parent::getParameterValue('output_format');
    $gz = strpos((string) $output_format, "gz") !== false;

    // "H" gives a zero-padded hour; gmdate() makes the literal "Z" (UTC) suffix true.
    $timestamp_format = "Y-m-d\\TH:i:s\\Z";

    $handle = opendir($input_dir);
    if ($handle === false) {
        echo "unable to read directory contents: " . $input_dir . "\n";
        exit;
    }

    $output_dir = realpath($this->getParameterValue('outdir')) . "/substances/";

    while (false !== ($file = readdir($handle))) {
        if (in_array($file, $ignore, true)) {
            continue;
        }

        echo "Processing file: " . $input_dir . $file . PHP_EOL;
        $outfile = $output_dir . basename($file, ".xml.gz") . "." . $output_format;
        echo "... into " . $outfile . PHP_EOL;

        parent::setCheckpoint('file');

        $this->setWriteFile($outfile, $gz);
        $this->parse_substance_file($input_dir, $file);
        $this->getWriteFile()->close();
    }
    closedir($handle);

    // Describe the upstream source; its retrieval date is taken from the
    // modification time of the local input directory.
    $source_file = (new DataResource($this))
        ->setURI("http://www.ncbi.nlm.nih.gov/pcsubstance")
        ->setTitle("PubChem Substance")
        ->setRetrievedDate(gmdate($timestamp_format, filemtime($input_dir)))
        ->setFormat("text/xml")
        ->setFormat("application/zip")
        ->setPublisher("http://ncbi.nlm.nih.gov/")
        ->setHomepage("http://pubchem.ncbi.nlm.nih.gov/")
        ->setRights("use")
        ->setRights("restricted-by-source-license")
        ->setLicense("ftp://ftp.ncbi.nlm.nih.gov/pubchem/README")
        ->setDataset("http://identifiers.org/pubchem.substance/");

    $prefix = $this->getPcsPrefix();
    $bVersion = parent::getParameterValue('bio2rdf_release');
    $date = gmdate($timestamp_format);

    // Describe the generated RDF output.
    $output_file = (new DataResource($this))
        ->setURI("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}")
        ->setTitle("Bio2RDF v{$bVersion} RDF version of {$prefix} (generated at {$date})")
        ->setSource($source_file->getURI())
        ->setCreator("https://github.com/bio2rdf/bio2rdf-scripts/blob/master/pubchem/pubchem.php")
        ->setCreateDate($date)
        ->setHomepage("http://download.bio2rdf.org/release/{$bVersion}/{$prefix}/{$prefix}.html")
        ->setPublisher("http://bio2rdf.org")
        ->setRights("use-share-modify")
        ->setRights("by-attribution")
        ->setRights("restricted-by-source-license")
        ->setLicense("http://creativecommons.org/licenses/by/3.0/")
        ->setDataset(parent::getDatasetURI());

    if ($gz) {
        $output_file->setFormat("application/gzip");
    }
    if (strpos((string) $output_format, "nt") !== false) {
        $output_file->setFormat("application/n-triples");
    } else {
        $output_file->setFormat("application/n-quads");
    }

    $dataset_description = $source_file->toRDF() . $output_file->toRDF();

    // set graph URI back to default
    parent::setGraphURI($graph_uri);

    // write the dataset description
    $this->setWriteFile($this->getParameterValue('outdir') . "/substances/" . $this->getBio2RDFReleaseFile());
    $this->getWriteFile()->write($dataset_description);
    $this->getWriteFile()->close();
}