/**
 * Extracts the image referenced in a page's source and records it as RDF triples.
 *
 * When an image is found and both generated URLs validate, five triples are added:
 *  - page             FOAF_DEPICTION  full-size image
 *  - full-size image  FOAF_THUMBNAIL  thumbnail
 *  - page             DBO_THUMBNAIL   thumbnail
 *  - thumbnail        DC_RIGHTS       Wikipedia image description page
 *  - full-size image  DC_RIGHTS       Wikipedia image description page
 * Otherwise the result is returned without any triples.
 *
 * @param $pageID     Identifier of the page being extracted
 * @param $pageTitle  Title of the page (not used by this extractor)
 * @param $pageSource Wiki source of the page
 * @return ExtractionResult The (possibly empty) extraction result
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $image_ar = $this->extract_image_url($pageSource);

    // Read both offsets defensively: when no image was found the helper may hand back
    // null or an incomplete array (NOTE(review): exact shape not visible here - confirm).
    // Accessing the offsets directly raises notices, and passing null to ucfirst()
    // is deprecated as of PHP 8.1.
    $imageName = isset($image_ar[0]) ? $image_ar[0] : '';
    $width = isset($image_ar[1]) ? $image_ar[1] : null;

    $image = ucfirst($imageName);
    if ($image == null) {
        return $result;
    }

    $ImageURL = $this->make_image_url($image, false, true);
    $ImageURLSmall = $this->make_image_url($image, $width);

    // The normalised name (underscores instead of spaces) is only used for the
    // description-page link below; the image URLs were built from the raw name.
    $image = str_replace(" ", "_", trim($image));

    if (!URI::validate($ImageURL) || !URI::validate($ImageURLSmall)) {
        return $result;
    }

    // Add fullsize image
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI(FOAF_DEPICTION),
        RDFtriple::URI($ImageURL)
    );

    // Add depiction has thumbnail
    $result->addTriple(
        RDFtriple::URI($ImageURL),
        RDFtriple::URI(FOAF_THUMBNAIL),
        RDFtriple::URI($ImageURLSmall)
    );

    // Add object has thumbnail
    $result->addTriple(
        RDFtriple::page($pageID),
        RDFtriple::URI(DBO_THUMBNAIL),
        RDFtriple::URI($ImageURLSmall)
    );

    // Add triples linking back to the Wikipedia image description
    $image = urlencode($image);
    $wikipediaImageDescription = 'http://' . $this->language . '.wikipedia.org/wiki/Image:' . $image;
    $result->addTriple(
        RDFtriple::URI($ImageURLSmall),
        RDFtriple::URI(DC_RIGHTS),
        RDFtriple::URI($wikipediaImageDescription)
    );
    $result->addTriple(
        RDFtriple::URI($ImageURL),
        RDFtriple::URI(DC_RIGHTS),
        RDFtriple::URI($wikipediaImageDescription)
    );

    return $result;
}
/**
 * Extracts the image URL of a page and records it as a FOAF depiction triple.
 *
 * The triple is only added when an image URL was found and it passes
 * URI validation; otherwise the result is returned without triples.
 *
 * @param $pageID     Identifier of the page being extracted
 * @param $pageTitle  Title of the page, forwarded to the image URL extraction
 * @param $pageSource Wiki source of the page
 * @return ExtractionResult The (possibly empty) extraction result
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $ImageURL = $this->extract_image_url($pageSource, $pageTitle);

    // Validation is skipped entirely when no URL was found.
    $hasUsableImage = ($ImageURL != null) && URI::validate($ImageURL);

    if ($hasUsableImage) {
        // Add fullsize image
        $subject = RDFtriple::page($pageID);
        $depiction = RDFtriple::URI("http://xmlns.com/foaf/0.1/depiction");
        $target = RDFtriple::URI($ImageURL);
        $result->addTriple($subject, $depiction, $target);
    }

    return $result;
}
/**
 * Extracts the external links of a page and records each valid one as a
 * "reference" triple on the page.
 *
 * @param $pageID     Identifier of the page being extracted
 * @param $pageTitle  Title of the page (not used by this extractor)
 * @param $pageSource Wiki source of the page
 * @return ExtractionResult Result holding one triple per valid external link
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, self::extractorID);

    $extlinks = $this->extract_external_links($pageSource, $this->language);

    // The link URL is the array key; the value (link name) is not needed here.
    // foreach replaces the former each()/list() loop: each() was deprecated in
    // PHP 7.2 and removed in PHP 8.0.
    foreach ($extlinks as $ExtURL => $ExtName) {
        if (!URI::validate($ExtURL)) {
            continue;
        }
        $result->addTriple(
            RDFtriple::page($pageID),
            RDFtriple::predicate("reference"),
            RDFtriple::URI($ExtURL)
        );
    }

    return $result;
}
/**
 * Extracts the external links of a page and records each valid one as a
 * DB_REFERENCE triple whose subject is the current page URI.
 *
 * Single quotes in a link are percent-encoded and backslashes are doubled
 * before the link is validated and stored.
 *
 * @param $pageID     Identifier of the page being extracted
 * @param $pageTitle  Title of the page (not used by this extractor)
 * @param $pageSource Wiki source of the page
 * @return ExtractionResult Result holding one triple per valid external link
 */
public function extractPage($pageID, $pageTitle, $pageSource) {
    $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

    $extlinks = $this->extract_external_links($pageSource, $this->language);

    // The link URL is the array key; the value (link name) is not needed here.
    // foreach replaces the former each()/list() loop: each() was deprecated in
    // PHP 7.2 and removed in PHP 8.0.
    foreach ($extlinks as $ExtURL => $ExtName) {
        // Replace single quotes with %27
        $ExtURL = str_replace("'", "%27", $ExtURL);
        // Double every backslash
        $ExtURL = str_replace("\\", "\\\\", $ExtURL);

        if (!URI::validate($ExtURL)) {
            continue;
        }

        $result->addTriple(
            $this->getPageURI(),
            RDFtriple::URI(DB_REFERENCE, false),
            RDFtriple::URI($ExtURL)
        );
    }

    return $result;
}
/**
 * Writes the triple + additional information such as language, whether an object is a reference
 * or a literal and the datatype into a global array ($parseResult)
 *
 * Reference objects ($object_is == 'r') that do not validate as a URI after
 * encodeLocalName() are dropped: nothing is appended and null is returned.
 *
 * @param subject: String containing the triples subject
 * @param predicate: String containing the triples predicate
 * @param object: String containing the triples object
 * @param file: Legacy, not used by this function; should be removed in the future
 * @param object_is: 'r' if object is a reference, 'l' if object is a literal, 'b' if object is a blanknode
 * @param dtype: String containing a literals XSD:datatype
 * @param lang: String containing a literals language
 * @return null when a reference object fails URI validation; no explicit return value otherwise
 *
 * TODO: Should encodeLocalName be used for the whole URL? Should URI objects be used?
 *
 */
function writeTripel($subject, $predicate, $object, $file = 'main', $object_is = 'r', $dtype = NULL, $lang = NULL) {
    global $parseResult;

    if ($object_is == 'r' && !URI::validate(encodeLocalName($object))) {
        return null;
    }

    // If $object_is == 'l', encodeLocalName shouldn't be used, the string will be encoded like e.g. \uBC18\uC57C
    if ($object_is != 'l') {
        $object = encodeLocalName($object);
    }

    $predicate = encodeLocalName($predicate);

    if (USE_PERCENT_ENCODING) {
        $predicate = str_replace("%", "_percent_", $predicate);
    } else {
        // Append an underscore when the predicate ends in a percent-encoded byte
        // (uppercase hex only). preg_match() replaces ereg(), which was removed
        // in PHP 7.0; like ereg(), the pattern is case-sensitive.
        if (preg_match('/%[A-F0-9]{2}/', substr($predicate, -3))) {
            $predicate .= "_";
        }
    }

    $parseResult[] = array(encodeLocalName($subject), $predicate, $object, $object_is, $dtype, $lang);
}
/**
 * Turns a link entry taken from Wiki source into a plain, validated URL.
 *
 * Accepted forms, tried in this order:
 *  1. a raw URL (website = http://hu-berlin.de)
 *  2. a URL lacking the scheme (Website = www.alabama.gov), which gets "http://" prepended
 *  3. an external link in wiki bracket syntax, optionally followed by a link title
 *
 * @param $link Link entry found in Wiki source (various formats possible)
 * @return Plain URL, or null if no valid URL could be derived
 */
private function parseLink($link) {
    // Template values such as 'None' or 'unknown' would otherwise become 'http://None'.
    // Anything without a single '.' is rejected up front (hoping nobody writes 'None.' or '...').
    if (strpos($link, '.') === false) {
        return null;
    }

    // Raw form first, then the same text with an http prefix.
    if (URI::validate($link)) {
        return $link;
    }
    $prefixed = "http://" . $link;
    if (URI::validate($prefixed)) {
        return $prefixed;
    }

    // External link in normal wiki syntax: group 1 is the URL, group 2 the optional title.
    $matches = array();
    if (!preg_match('~\\[(https?://\\S+)\\s?([^]]+)?\\]~i', $link, $matches)) {
        return null;
    }

    $candidate = $matches[1];

    if (count($matches) == 3) {
        $label = $matches[2];

        // If the title looks like a host name and occurs inside the URL, prefer the
        // title as a nicer base URL. This cuts off '/index.html' cruft.
        // TODO: But we may cut off important stuff...
        $looksLikeHost = preg_match('/\\w+\\.\\w+/', $label);
        if ($looksLikeHost && stristr($candidate, $label) !== false) {
            /* TBD: Add 'www' prefix, if not provided? */
            $candidate = "http://" . strtolower($label);
        }
    }

    return URI::validate($candidate) ? $candidate : null;
}
/**
 * Turns a link entry taken from Wiki source into a plain URL.
 *
 * A raw URL, or one that becomes valid once "http://" is prepended, is returned
 * as is. Otherwise the entry must be an external link in wiki bracket syntax;
 * the URL extracted from the brackets is returned without further validation.
 *
 * @param $link Link entry found in Wiki source (various formats possible)
 * @param $guessRoot Whether a title providing the link's domain root overrides the link,
 *        e.g. take www.microsoft.com when given "[http://www.microsoft.com/worldwide/ www.microsoft.com]"
 * @return Plain URL or null
 */
private function parseURL($link, $guessRoot = true) {
    // URLs may be provided in raw form within templates (website = http://hu-berlin.de)
    // or even without http prefix (Website = www.alabama.gov)
    $candidates = array($link, "http://" . $link);
    foreach ($candidates as $candidate) {
        if (URI::validate($candidate)) {
            return $candidate;
        }
    }

    // $match[1]: URL, $match[2]: link title (optional)
    $match = array();
    $found = preg_match('~\\[(http(?:s)?://[^ ]+)\\s?([^]]+)?\\]~i', $link, $match);
    if (!$found) {
        return null;
    }

    $target = $match[1];

    // Without root guessing, or without a title, the bracketed URL is the answer.
    if (!$guessRoot || count($match) != 3) {
        return $target;
    }

    // The title only overrides the URL when it looks like a host name and
    // occurs inside the URL itself.
    $label = $match[2];
    $labelIsRoot = preg_match('/\\w+\\.\\w+/', $label) && stristr($target, $label) !== false;

    /* TBD: Add 'www' prefix, if not provided? */
    return $labelIsRoot ? ("http://" . strtolower($label)) : $target;
}
/**
 * Appends a triple, together with its object kind, datatype and language,
 * to the global array $parseResult.
 *
 * A reference object ($object_is == 'r') that fails URI validation is dropped:
 * nothing is appended and null is returned. Every '%' in the predicate is
 * replaced by '_percent_' before the triple is stored.
 *
 * @param subject: String containing the triples subject
 * @param predicate: String containing the triples predicate
 * @param object: String containing the triples object
 * @param file: Legacy, not used by this function; should be removed in the future
 * @param object_is: 'r' if object is a reference, 'l' if object is a literal, 'b' if object is a blanknode
 * @param dtype: String containing a literals XSD:datatype
 * @param lang: String containing a literals language
 *
 */
function writeTripel($subject, $predicate, $object, $file = 'main', $object_is = 'r', $dtype = NULL, $lang = NULL) {
    global $parseResult;

    $isReference = ($object_is == 'r');
    if ($isReference && !URI::validate($object)) {
        return null;
    }

    $escapedPredicate = str_replace("%", "_percent_", $predicate);

    $entry = array($subject, $escapedPredicate, $object, $object_is, $dtype, $lang);
    $parseResult[] = $entry;
}