/**
 * Parses a single "key=value" option pair and applies it to the
 * validator options object.
 *
 * Recognised keys: vextwarning, output, lang, warning, medium, profile.
 * Unrecognised keys are silently ignored (original behaviour, kept).
 *
 * @param string $optionKeyValuePair e.g. "warning=2" or "output=soap12"
 * @return void
 */
private function parseOptionKeyValuePair($optionKeyValuePair)
{
    // Limit to 2 parts so a value that itself contains '=' is kept intact
    // (the original unbounded explode() silently truncated such values).
    $keyValue = explode('=', $optionKeyValuePair, 2);

    // Guard against input without '=' — the original code triggered an
    // undefined-index notice when reading $keyValue[1] in that case.
    if (count($keyValue) < 2) {
        return;
    }

    $key = trim($keyValue[0]);
    $value = trim($keyValue[1]);

    switch ($key) {
        case 'vextwarning':
            // Accepts "1"/"true"/"on"/"yes" as true (FILTER_VALIDATE_BOOLEAN
            // semantics); everything else becomes false.
            $this->options->setVendorExtensionIssuesAsWarnings(
                filter_var($value, FILTER_VALIDATE_BOOLEAN)
            );
            break;
        case 'output':
            $this->options->setOutputFormat($value);
            break;
        case 'lang':
            $this->options->setLanguage($value);
            break;
        case 'warning':
            // Non-numeric or negative values fall back to the default level 2.
            $warningLevel = filter_var(
                $value,
                FILTER_VALIDATE_INT,
                array('options' => array('min_range' => 0, 'default' => 2))
            );
            $this->options->setWarningLevel($warningLevel);
            break;
        case 'medium':
            $this->options->setMedium($value);
            break;
        case 'profile':
            $this->options->setProfile($value);
            break;
    }
}
<?php
/**
 * This file starts the DBpedia extraction process for abstracts.
 *
 * Warning: The script needs several days to complete on an average PC.
 */

// Surface every notice/warning during the long-running extraction run.
error_reporting(E_ALL);

// automatically loads required classes
require 'dbpedia.php';
// set $extractionDir and $extractionLanguages
require 'extractionconfig.php';

$manager = new ExtractionManager();

// loop over all languages — one extraction job per configured language.
// NOTE(review): the loop body continues past this chunk; its closing brace
// is not visible here.
foreach ($extractionLanguages as $currLanguage) {
    // Subsequent static lookups operate on the language set here.
    Options::setLanguage($currLanguage);

    // Iterator over all article titles for the current language.
    $pageTitles = new ArticlesSqlIterator($currLanguage);
    $job = new ExtractionJob(new DatabaseWikipediaCollection($currLanguage), $pageTitles);

    // Per-language output directory, e.g. <extractionDir>/en/ — created on demand.
    $extractionDirLang = $extractionDir . '/' . $currLanguage . '/';
    if (!is_dir($extractionDirLang)) {
        mkdir($extractionDirLang);
    }

    // AbstractExtractor has references to its two destinations, see below;
    // the group itself writes to a NullDestination.
    $group = new ExtractionGroup(new NullDestination());
    $shortDestination = new csvNTripleDestination($extractionDirLang . "shortabstract_" . $currLanguage);
    $longDestination = new csvNTripleDestination($extractionDirLang . "longabstract_" . $currLanguage);
    $extractorInstance = new AbstractExtractor();
    $extractorInstance->setDestinations($shortDestination, $longDestination);
    $group->addExtractor($extractorInstance);
    $job->addExtractionGroup($group);

    // Timestamp of the run start (RFC 822 format).
    $date = date(DATE_RFC822);
// Main processing loop: one iteration per article record from the iterator.
// NOTE(review): the loop body continues past this chunk — the trailing
// if-block and the foreach itself are closed outside the visible range.
foreach ($it as $key => $metainfo) {
    Timer::start("main::processing");

    //****PREPROCESSING*****
    //print_r($metainfo);
    $pageTitle = $metainfo['pageTitle'];
    $pageTitles = new ArrayObject(array($pageTitle));

    // Build the page URI; skip this article entirely if the title does not
    // yield a valid URI.
    $pageURI = '';
    try {
        $pageURI = RDFtriple::page($pageTitle);
    } catch (Exception $e) {
        Logger::warn('main: invalid uri for ' . $pageTitle);
        continue;
    }

    Logger::info("Title: {$pageTitle} " . mb_detect_encoding($pageTitle) . "");

    // Switch the global language to the one this record belongs to.
    $language = $metainfo['language'];
    Options::setLanguage($language);

    // Collect URIs for the "last articles" statistics handling below.
    $lastarticlestmp[] = $pageURI->getURI();

    // Preserve the full OAI identifier, then replace 'oaiId' with the id
    // value parsed out of that identifier.
    $metainfo['oaiidentifier'] = $metainfo['oaiId'];
    $metainfo['oaiId'] = Util::getOaiIDfromIdentifier($language, $metainfo['oaiidentifier']);
    Logger::info("oaiId " . $metainfo['oaiId']);

    //***MAINTAINANCE***
    $count++;

    //log statistics — every printStatInterval-th article.
    if ($count % Options::getOption('printStatInterval') == 0) {
        printAll($lastarticles, $language);
    }

    //50 last articles to statisticdir
    if ($count < 50) {
        $lastarticles = $lastarticlestmp;
    }
    if ($count % 50 == 0) {
 * the LiveWikipedia. The file outputs the generated triples
 * directly. This is the best way for developers to verify that
 * their extractors are working. Once the extractor is working on
 * an article, developers should use extract_dataset to produce a
 * full data set and see whether it works in a full extraction.
 *
 * See http://wiki.dbpedia.org/Documentation for an overview of
 * the DBpedia extraction framework.
 *
 * @author Jens Lehmann
 */
include 'dbpedia.php';

// configure settings
// change the Extractor class to your extractor
//this should be done in config/dbpedia.ini
Options::setLanguage('ko');
$language = Options::getOption('language');

// Pick the extractor under test; alternatives are kept commented out for
// quick switching during development.
//$extractor = new ActiveAbstractExtractor();
$extractor = new KoInfoboxExtractor();
/* $extractor = new InfoboxExtractor(); */
//$extractor = new SkosCategoriesExtractor();

//these are articles for testing
//$article[] = 'London';
//$article[] = 'Category:Pasta';

// NOTE(review): the first assignment is immediately overwritten — only the
// second title ('서울특별시') is actually used.
$t = '이탈리아';
$t = '서울특별시';
/* $t = 'Berlin'; */