Example #1
0
 /**
  * Emits the image-related triples for a single page.
  *
  * Up to five triples are added: page -> FOAF_DEPICTION -> full-size image,
  * full-size image -> FOAF_THUMBNAIL -> thumbnail, page -> DBO_THUMBNAIL -> thumbnail,
  * and one DC_RIGHTS triple from each image URL to the Wikipedia image description page.
  *
  * @param   $pageID      Page identifier, used as the subject of the page-level triples
  * @param   $pageTitle   Not used by this method
  * @param   $pageSource  Wiki source handed to extract_image_url()
  * @return  ExtractionResult; it carries no triples when no image name is found or when
  *          either generated image URL fails URI::validate()
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());

     // The helper's return value is read as a pair: [0] image name, [1] thumbnail width.
     $imageInfo = $this->extract_image_url($pageSource);
     $imageName = ucfirst($imageInfo[0]);
     $thumbWidth = $imageInfo[1];
     if ($imageName == null) {
         return $result;
     }

     $fullSizeUrl = $this->make_image_url($imageName, false, true);
     $thumbnailUrl = $this->make_image_url($imageName, $thumbWidth);

     // Both URLs must be valid; the thumbnail is only checked if the full-size URL passed.
     if (!(URI::validate($fullSizeUrl) && URI::validate($thumbnailUrl))) {
         return $result;
     }

     // Page depicts the full-size image
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI(FOAF_DEPICTION),
         RDFtriple::URI($fullSizeUrl)
     );
     // Full-size image has the thumbnail
     $result->addTriple(
         RDFtriple::URI($fullSizeUrl),
         RDFtriple::URI(FOAF_THUMBNAIL),
         RDFtriple::URI($thumbnailUrl)
     );
     // Page has the thumbnail
     $result->addTriple(
         RDFtriple::page($pageID),
         RDFtriple::URI(DBO_THUMBNAIL),
         RDFtriple::URI($thumbnailUrl)
     );

     // Link both image URLs back to the Wikipedia image description page. The page name is
     // the trimmed image name with spaces turned into underscores, then URL-encoded.
     $descriptionName = urlencode(str_replace(" ", "_", trim($imageName)));
     $wikipediaImageDescription = 'http://' . $this->language . '.wikipedia.org/wiki/Image:' . $descriptionName;
     foreach (array($thumbnailUrl, $fullSizeUrl) as $imageUrl) {
         $result->addTriple(
             RDFtriple::URI($imageUrl),
             RDFtriple::URI(DC_RIGHTS),
             RDFtriple::URI($wikipediaImageDescription)
         );
     }

     return $result;
 }
Example #2
0
 /**
  * Adds a single foaf:depiction triple for the page's image, if one can be found.
  *
  * @param   $pageID      Page identifier, used as the triple's subject
  * @param   $pageTitle   Page title, forwarded to extract_image_url()
  * @param   $pageSource  Wiki source, forwarded to extract_image_url()
  * @return  ExtractionResult; it carries no triple when no image URL is found or the URL
  *          fails URI::validate()
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);

     $depictionUrl = $this->extract_image_url($pageSource, $pageTitle);
     $hasUsableUrl = $depictionUrl != null && URI::validate($depictionUrl);

     if ($hasUsableUrl) {
         // Page depicts the full-size image
         $result->addTriple(
             RDFtriple::page($pageID),
             RDFtriple::URI("http://xmlns.com/foaf/0.1/depiction"),
             RDFtriple::URI($depictionUrl)
         );
     }

     return $result;
 }
 /**
  * Adds one "reference" triple per valid external link found in the page source.
  *
  * The array returned by extract_external_links() is read as URL => link name; only the
  * URL (the key) is used here.
  *
  * @param   $pageID      Page identifier, used as the subject of every triple
  * @param   $pageTitle   Not used by this method
  * @param   $pageSource  Wiki source handed to extract_external_links()
  * @return  ExtractionResult holding a triple for each URL that passes URI::validate()
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, self::extractorID);
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach instead of each(): each() was deprecated in PHP 7.2 and removed in PHP 8.0.
     foreach ($extlinks as $ExtURL => $ExtName) {
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple(RDFtriple::page($pageID), RDFtriple::predicate("reference"), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
 /**
  * Adds one DB_REFERENCE triple per valid external link found in the page source.
  *
  * The array returned by extract_external_links() is read as URL => link name; only the
  * URL (the key) is used here. Each URL is escaped before it is validated and stored.
  *
  * @param   $pageID      Page identifier, passed to the ExtractionResult
  * @param   $pageTitle   Not used by this method
  * @param   $pageSource  Wiki source handed to extract_external_links()
  * @return  ExtractionResult holding a triple for each escaped URL that passes URI::validate()
  */
 public function extractPage($pageID, $pageTitle, $pageSource)
 {
     $result = new ExtractionResult($pageID, $this->language, $this->getExtractorID());
     $extlinks = $this->extract_external_links($pageSource, $this->language);
     // foreach instead of each(): each() was deprecated in PHP 7.2 and removed in PHP 8.0.
     foreach ($extlinks as $ExtURL => $ExtName) {
         // Replace single quotes with %27
         $ExtURL = str_replace("'", "%27", $ExtURL);
         // Double every backslash
         $ExtURL = str_replace("\\", "\\\\", $ExtURL);
         if (!URI::validate($ExtURL)) {
             continue;
         }
         $result->addTriple($this->getPageURI(), RDFtriple::URI(DB_REFERENCE, false), RDFtriple::URI($ExtURL));
     }
     return $result;
 }
Example #5
0
/**
 * Writes the triple + additional information such as language, whether an object is a reference
 * or a literal and the datatype into a global array ($parseResult)
 *
 * Subject and predicate are always passed through encodeLocalName(); the object is encoded
 * unless it is a literal. Reference objects whose encoded form fails URI::validate() are
 * dropped and nothing is written.
 *
 * @param subject: String containing the triples subject
 * @param predicate: String containing the triples predicate
 * @param object: String containing the triples object
 * @param file: Legacy, should be removed in the future (not used by this function)
 * @param object_is: 'r' if object is a reference, 'l' if object is a literal, 'b' if object is a blanknode
 * @param dtype: String containing a literals XSD:datatype
 * @param lang: String containing a literals language
 * @return null when a reference object is rejected; otherwise nothing is returned explicitly
 *
 * TODO: Should encodeLocalName be used for the whole URL? Should URI objects be used?
 *
 */
function writeTripel($subject, $predicate, $object, $file = 'main', $object_is = 'r', $dtype = NULL, $lang = NULL)
{
    global $parseResult;
    if ($object_is == 'r' && !URI::validate(encodeLocalName($object))) {
        return null;
    }
    // If $object_is == 'l', encodeLocalName shouldn't be used, the string will be encoded like e.g. \uBC18\uC57C
    if ($object_is != 'l') {
        $object = encodeLocalName($object);
    }
    $predicate = encodeLocalName($predicate);
    if (USE_PERCENT_ENCODING) {
        $predicate = str_replace("%", "_percent_", $predicate);
    } else {
        // Append "_" when the predicate ends in a percent-escape (%XX, uppercase hex).
        // preg_match() is used because the POSIX ereg() function no longer exists since
        // PHP 7.0; the match stays case-sensitive, as ereg() was.
        if (preg_match('/%[A-F0-9]{2}/', substr($predicate, -3))) {
            $predicate .= "_";
        }
    }
    $parseResult[] = array(encodeLocalName($subject), $predicate, $object, $object_is, $dtype, $lang);
}
Example #6
0
 /**
  * Turns a link value taken from Wiki source into a plain URL.
  *
  * Accepted inputs, tried in this order: a raw URL, a URL lacking the "http://" prefix,
  * and an external link in wiki bracket syntax ("[http://host/path optional title]").
  *
  * @param   $link       Link entry from Wiki source (various formats possible)
  * @return  Plain URL or null
  */
 private function parseLink($link)
 {
     // Template values such as 'None' or 'unknown' would otherwise become 'http://None'.
     // Anything without a single '.' is rejected up front (values like 'None.' or '...'
     // still slip through).
     if (strpos($link, '.') === false) {
         return null;
     }

     // Raw URL inside a template (website = http://hu-berlin.de) ...
     if (URI::validate($link)) {
         return $link;
     }
     // ... or the same without the scheme (Website = www.alabama.gov)
     $withScheme = "http://" . $link;
     if (URI::validate($withScheme)) {
         return $withScheme;
     }

     // External link in normal wiki syntax: [1] is the URL, [2] the optional link title
     if (!preg_match('~\\[(https?://\\S+)\\s?([^]]+)?\\]~i', $link, $pieces)) {
         return null;
     }

     $url = $pieces[1];
     if (isset($pieces[2])) {
         $title = $pieces[2];
         // If the title looks like a host name and occurs inside the URL, prefer the title
         // as base URL. This drops '/index.html' cruft. TODO: it may also drop important
         // parts of the URL.
         $titleLooksLikeHost = preg_match('/\\w+\\.\\w+/', $title);
         if ($titleLooksLikeHost && stristr($url, $title) !== false) {
             /* TBD: Add 'www' prefix, if not provided? */
             $url = "http://" . strtolower($title);
         }
     }

     return URI::validate($url) ? $url : null;
 }
Example #7
0
 /**
  * Turns a link value taken from Wiki source into a plain URL.
  *
  * A raw URL (with or without the "http://" prefix) is returned once it passes
  * URI::validate(). Otherwise the value must be an external link in wiki bracket syntax;
  * the URL taken from the brackets is returned without further validation.
  *
  * @param   $link       Link entry from Wiki source (various formats possible)
  * @param   $guessRoot  Whether a title providing the link's domain root overrides the link,
  *                      e.g. take www.microsoft.com when given "[http://www.microsoft.com/worldwide/ www.microsoft.com]"
  * @return  Plain URL or null
  */
 private function parseURL($link, $guessRoot = true)
 {
     // Raw URL inside a template (website = http://hu-berlin.de), or the same value
     // without the scheme (Website = www.alabama.gov)
     $candidates = array($link, "http://" . $link);
     for ($i = 0; $i < 2; $i++) {
         if (URI::validate($candidates[$i])) {
             return $candidates[$i];
         }
     }

     // Bracket syntax: [1] is the URL, [2] the optional link title
     if (!preg_match('~\\[(http(?:s)?://[^ ]+)\\s?([^]]+)?\\]~i', $link, $pieces)) {
         return null;
     }

     // The title wins when root guessing is enabled, the title looks like a host name,
     // and the title occurs inside the URL.
     $titleOverridesUrl = $guessRoot
         && isset($pieces[2])
         && preg_match('/\\w+\\.\\w+/', $pieces[2])
         && stristr($pieces[1], $pieces[2]) !== false;

     if ($titleOverridesUrl) {
         /* TBD: Add 'www' prefix, if not provided? */
         return "http://" . strtolower($pieces[2]);
     }

     return $pieces[1];
 }
Example #8
0
/**
 * Appends a triple, together with its object kind, datatype and language, to the global
 * array $parseResult.
 *
 * Reference objects that fail URI::validate() are dropped and nothing is appended. Every
 * "%" in the predicate is stored as "_percent_".
 *
 * @param subject: String containing the triples subject
 * @param predicate: String containing the triples predicate
 * @param object: String containing the triples object
 * @param file: Legacy, should be removed in the future (not used by this function)
 * @param object_is: 'r' if object is a reference, 'l' if object is a literal, 'b' if object is a blanknode
 * @param dtype: String containing a literals XSD:datatype
 * @param lang: String containing a literals language
 * @return null when a reference object is rejected; otherwise nothing is returned explicitly
 *
 */
function writeTripel($subject, $predicate, $object, $file = 'main', $object_is = 'r', $dtype = NULL, $lang = NULL)
{
    global $parseResult;

    // Only reference objects are validated; literals and blank nodes pass through.
    $isRejectedReference = $object_is == 'r' && !URI::validate($object);
    if ($isRejectedReference) {
        return null;
    }

    $parseResult[] = array(
        $subject,
        str_replace("%", "_percent_", $predicate),
        $object,
        $object_is,
        $dtype,
        $lang
    );
}