/**
 * Downloads the abstract of a page and emits it as RDF quads.
 *
 * The request URL is built from LocalConfiguration::abstractPageUrlFormat,
 * the extractor's language and the encoded title of the node's root.
 * If the response is non-empty it is passed through StringUtil::htmlDecode();
 * a short form (from $this->short()) is emitted under SHORT_PROPERTY when it
 * is non-empty, and the full text is always emitted under LONG_PROPERTY.
 *
 * A failed or empty download is ignored silently (no quads are added).
 *
 * @param mixed $node        node whose root title selects the page and whose
 *                           source URI is attached to every quad
 * @param mixed $subjectUri  subject of the generated quads
 * @param mixed $pageContext not used by this method
 * @return bool always true
 */
public function extract($node, $subjectUri, $pageContext)
{
    $this->curl = curl_init();

    $url = sprintf(
        LocalConfiguration::abstractPageUrlFormat,
        $this->language,
        $node->getRoot()->getTitle()->encoded()
    );

    // Applied one by one, in this order, exactly like individual curl_setopt() calls.
    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_BINARYTRANSFER => true,
        CURLOPT_FAILONERROR    => true,
    );
    foreach ($options as $option => $value) {
        curl_setopt($this->curl, $option, $value);
    }

    $abstract = curl_exec($this->curl);

    // curl_exec() returns false on failure; empty() covers that case as well
    // as an empty response body. TODO: report the failure instead of ignoring it.
    if (empty($abstract)) {
        return true;
    }

    $abstract = StringUtil::htmlDecode($abstract);

    $shortAbstract = $this->short($abstract);
    if (!empty($shortAbstract)) {
        $this->destination->addQuad(new RdfQuad(
            $subjectUri,
            $this->ontology->getProperty(self::SHORT_PROPERTY),
            $shortAbstract,
            $node->getSourceUri()
        ));
    }

    $this->destination->addQuad(new RdfQuad(
        $subjectUri,
        $this->ontology->getProperty(self::LONG_PROPERTY),
        $abstract,
        $node->getSourceUri()
    ));

    return true;
}
/**
 * Turns a node into cleaned text.
 *
 * The node is stringified with self::nodeToString(), then passed through
 * WikiUtil::removeWikiEmphasis() and StringUtil::htmlDecode(), in that order.
 *
 * @param Node $node node to convert
 * @return mixed the cleaned text, or null when its length is zero
 */
public function parse(Node $node)
{
    $cleaned = StringUtil::htmlDecode(
        WikiUtil::removeWikiEmphasis(
            self::nodeToString($node)
        )
    );

    return strlen($cleaned) > 0 ? $cleaned : null;
}
/**
 * Generates a URI below a base URI and makes it unique within this generator.
 *
 * When $node is set, its text (from $this->nodeToText()) is normalized:
 * wiki emphasis and HTML entities are handled by WikiUtil/StringUtil, runs of
 * spaces are collapsed, parentheses and tags are dropped, the result is cut
 * to 50 bytes, trimmed, spaces become underscores and the whole is URL-encoded.
 * If the base URI ends with a prefix of that text, the overlapping prefix is
 * removed from the text, and the remainder is appended to the base URI after
 * a "__" separator. When $node is not set, the base URI itself is the candidate.
 *
 * Candidates are counted in $this->uris: the first occurrence is returned
 * unchanged, every later occurrence gets "__<n>" appended, where <n> is the
 * number of times the candidate was seen before.
 *
 * @param string $baseUri URI the generated URI is based on
 * @param mixed  $node    node supplying the text, or null for none
 * @return string the generated URI
 */
public function generate($baseUri, $node)
{
    if (isset($node)) {
        // Retrieve text
        $text = $this->nodeToText($node);

        // Normalize text
        $text = WikiUtil::removeWikiEmphasis($text);
        $text = StringUtil::htmlDecode($text);
        $text = preg_replace('/ +/', ' ', $text); // remove duplicate spaces
        $text = str_replace('(', ' ', $text);
        $text = str_replace(')', ' ', $text);
        $text = strip_tags($text);
        // NOTE(review): substr() counts bytes, so a multi-byte character may be
        // cut in half here if the text is UTF-8 - confirm the expected encoding.
        $text = substr($text, 0, 50);
        $text = trim($text);
        $text = str_replace(' ', '_', $text);
        $text = urlencode($text);

        // Test if the base URI ends with a prefix of text.
        // Suffixes are tried from shortest to longest; the first match wins.
        $baseLen = strlen($baseUri);
        $textLen = strlen($text);
        for ($i = $baseLen - 1; $i > 0 && $baseLen - $i < $textLen; $i--) {
            $suffixLen = $baseLen - $i;
            // Compare only $suffixLen characters. Comparing over the full text
            // length can never succeed, because the suffix is always shorter
            // than the text and substr_compare() then reports a length mismatch.
            if (substr_compare($baseUri, $text, $i, $suffixLen, true) === 0) {
                $text = substr($text, $suffixLen);
                break;
            }
        }

        // Remove leading underscore
        if (!empty($text) && $text[0] === '_') {
            $text = substr($text, 1);
        }

        // Generate URI
        $uri = $baseUri . '__' . $text;
    } else {
        $uri = $baseUri;
    }

    // Resolve collisions
    if (!isset($this->uris[$uri])) {
        // No collision: remember that this URI has been handed out once
        $this->uris[$uri] = 1;
    } else {
        // Collision found: append the current counter and advance it
        $index = $this->uris[$uri];
        $this->uris[$uri] = $index + 1;
        $uri .= '__' . $index;
    }

    return $uri;
}