Esempio n. 1
0
/**
 * Extract hCalendar event data from an HTML fragment.
 *
 * The fragment is wrapped in a minimal <html><body> shell, parsed, and every
 * element whose class contains "vevent" is searched for descendants carrying
 * the classes "dtstart", "dtend", "description" and "location".
 *
 * @param string $h HTML fragment to scan.
 *
 * @return array Any of the keys 'start', 'finish', 'adjust', 'desc' and
 *               'location' that could be found; an empty array when the
 *               fragment cannot be parsed.
 */
function parse_event($h)
{
    require_once 'include/Scrape.php';
    require_once 'library/HTMLPurifier.auto.php';
    // The extension was missing here, which makes require_once fail fatally.
    require_once 'include/html2bbcode.php';
    $h = '<html><body>' . $h . '</body></html>';
    $ret = array();
    // Initialise so the check below is well-defined when parsing throws.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($h);
    } catch (DOMException $e) {
        logger('parse_event: parse error: ' . $e);
    }
    if (!$dom) {
        return $ret;
    }
    $items = $dom->getElementsByTagName('*');
    foreach ($items as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vevent')) {
            continue;
        }
        $level2 = $item->getElementsByTagName('*');
        foreach ($level2 as $x) {
            $class = $x->getAttribute('class');
            if (attribute_contains($class, 'dtstart') && $x->getAttribute('title')) {
                $ret['start'] = $x->getAttribute('title');
                // A timestamp without a 'Z' marker is flagged via 'adjust'.
                // Compare against false explicitly: strpos() returns 0 for a
                // match at the first character, which is falsy.
                if (strpos($ret['start'], 'Z') === false) {
                    $ret['adjust'] = true;
                }
            }
            if (attribute_contains($class, 'dtend') && $x->getAttribute('title')) {
                $ret['finish'] = $x->getAttribute('title');
            }
            if (attribute_contains($class, 'description')) {
                $ret['desc'] = $x->textContent;
            }
            if (attribute_contains($class, 'location')) {
                $ret['location'] = $x->textContent;
            }
        }
    }
    // sanitise: free-text fields that still contain angle brackets are run
    // through HTMLPurifier and then converted to BBCode.
    foreach (array('desc', 'location') as $field) {
        if (!x($ret, $field)) {
            continue;
        }
        if (strpos($ret[$field], '<') === false && strpos($ret[$field], '>') === false) {
            continue;
        }
        $config = HTMLPurifier_Config::createDefault();
        $config->set('Cache.DefinitionImpl', null);
        $purifier = new HTMLPurifier($config);
        $ret[$field] = html2bbcode($purifier->purify($ret[$field]));
    }
    // Pass both timestamps through datetime_convert() with UTC as source and target.
    if (x($ret, 'start')) {
        $ret['start'] = datetime_convert('UTC', 'UTC', $ret['start']);
    }
    if (x($ret, 'finish')) {
        $ret['finish'] = datetime_convert('UTC', 'UTC', $ret['finish']);
    }
    return $ret;
}
Esempio n. 2
0
 /**
  * Fetch a profile page and collect its DFRN links and hCard fields.
  *
  * @param string $url Address of the page to fetch.
  *
  * @return array Every <link> whose rel starts with "dfrn-" (keyed by that
  *               rel value, holding its href) plus the hCard keys 'fn',
  *               'photo' and 'key' when present. Empty when the page cannot
  *               be fetched or parsed.
  */
 function scrape_dfrn($url)
 {
     $ret = array();
     $s = fetch_url($url);
     if (!$s) {
         return $ret;
     }
     // Guard the parser the same way the other scrapers do, so that a
     // malformed remote page yields an empty result instead of an uncaught
     // exception.
     $dom = null;
     try {
         $dom = HTML5_Parser::parse($s);
     } catch (DOMException $e) {
         logger('scrape_dfrn: parse error: ' . $e);
     }
     if (!$dom) {
         return $ret;
     }
     // get DFRN link elements
     $items = $dom->getElementsByTagName('link');
     foreach ($items as $item) {
         $rel = $item->getAttribute('rel');
         if (substr($rel, 0, 5) == "dfrn-") {
             $ret[$rel] = $item->getAttribute('href');
         }
     }
     // Pull out hCard profile elements
     $items = $dom->getElementsByTagName('*');
     foreach ($items as $item) {
         if (!attribute_contains($item->getAttribute('class'), 'vcard')) {
             continue;
         }
         $level2 = $item->getElementsByTagName('*');
         foreach ($level2 as $node) {
             $class = $node->getAttribute('class');
             if (attribute_contains($class, 'fn')) {
                 $ret['fn'] = $node->textContent;
             }
             if (attribute_contains($class, 'photo')) {
                 $ret['photo'] = $node->getAttribute('src');
             }
             if (attribute_contains($class, 'key')) {
                 $ret['key'] = $node->textContent;
             }
         }
     }
     return $ret;
 }
Esempio n. 3
0
/**
 * Fetch a page and extract basic hCard fields from it.
 *
 * Responses whose Content-Type header announces an Atom or RSS feed are not
 * parsed. When several photo/avatar elements are present, the one with the
 * largest "width" attribute wins.
 *
 * @param string $url Address of the page to fetch.
 *
 * @return array Any of the keys 'fn', 'photo' and 'nick' that could be found;
 *               an empty array when the page cannot be fetched or parsed, or
 *               when it is a feed.
 */
function scrape_vcard($url)
{
    $ret = array();
    logger('scrape_vcard: url=' . $url);
    $response = z_fetch_url($url);
    if (!$response['success']) {
        return $ret;
    }
    $s = $response['body'];
    if (!$s) {
        return $ret;
    }
    $lines = explode("\n", $response['header']);
    foreach ($lines as $line) {
        // don't try and run feeds through the html5 parser
        if (stristr($line, 'content-type:') && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
            // Was "return ret;" - a bare constant, not the result array.
            return $ret;
        }
    }
    // Initialise so the check below is well-defined when parsing throws.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_vcard: parse error: ' . $e);
    }
    if (!$dom) {
        return $ret;
    }
    // Pull out hCard profile elements
    $largest_photo = 0;
    $items = $dom->getElementsByTagName('*');
    foreach ($items as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vcard')) {
            continue;
        }
        $level2 = $item->getElementsByTagName('*');
        foreach ($level2 as $node) {
            $class = $node->getAttribute('class');
            if (attribute_contains($class, 'fn')) {
                $ret['fn'] = $node->textContent;
            }
            if (attribute_contains($class, 'photo') || attribute_contains($class, 'avatar')) {
                $size = intval($node->getAttribute('width'));
                // Accept the first photo unconditionally, then only wider ones.
                if ($size > $largest_photo || !$largest_photo) {
                    $ret['photo'] = $node->getAttribute('src');
                    $largest_photo = $size;
                }
            }
            if (attribute_contains($class, 'nickname') || attribute_contains($class, 'uid')) {
                $ret['nick'] = $node->textContent;
            }
        }
    }
    return $ret;
}
Esempio n. 4
0
 /**
  * Discover the descriptor (XRD) links for a URI via LRDD.
  *
  * Lookup order:
  *  1. A URI containing '@' is handed straight to webfinger().
  *  2. The host's /.well-known/host-meta document is fetched. Under host
  *     priority (the default) its lrdd template/href, or failing that a
  *     Link header of the host-meta response, is followed.
  *  3. Otherwise the resource itself is fetched and searched for a
  *     <link rel="lrdd"> element, then for a Link HTTP header.
  *  4. As a last resort the links found in host-meta are returned.
  *
  * @param string $uri Address to resolve.
  *
  * @return array Whatever webfinger() / fetch_xrd_links() return for the
  *               discovered location, the raw host-meta link entries, or an
  *               empty array when nothing could be found.
  */
 function lrdd($uri)
 {
     $a = get_app();
     // default priority is host priority, host-meta first
     $priority = 'host';
     // All we have is an email address. Resource-priority is irrelevant
     // because our URI isn't directly resolvable.
     if (strstr($uri, '@')) {
         return webfinger($uri);
     }
     // get the host meta file
     $host = @parse_url($uri);
     // Without a host component there is no host-meta document to ask for.
     if (!$host || !isset($host['host'])) {
         return array();
     }
     $url = (x($host, 'scheme') ? $host['scheme'] : 'http') . '://';
     $url .= $host['host'] . '/.well-known/host-meta';
     logger('lrdd: constructed url: ' . $url);
     $xml = fetch_url($url);
     $headers = $a->get_curl_headers();
     if (!$xml) {
         return array();
     }
     logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);
     $h = parse_xml_string($xml);
     if (!$h) {
         return array();
     }
     $arr = convert_xml_element_to_array($h);
     if (isset($arr['xrd']['property'])) {
         // Read the same key that was just tested (was misspelled 'crd').
         $property = $arr['xrd']['property'];
         // A single entry is not wrapped in a list; normalise to a list.
         $properties = isset($property[0]) ? $property : array($property);
         foreach ($properties as $prop) {
             // NOTE(review): this casts '@attributes' to a string; if the
             // converter yields an array there, the comparison can never
             // match - confirm against convert_xml_element_to_array().
             if ((string) $prop['@attributes'] === 'http://lrdd.net/priority/resource') {
                 $priority = 'resource';
             }
         }
     }
     // save the links in case we need them
     $links = array();
     if (isset($arr['xrd']['link'])) {
         $link = $arr['xrd']['link'];
         $links = isset($link[0]) ? $link : array($link);
     }
     // do we have a template or href?
     foreach ($links as $link) {
         if ($link['@attributes']['rel'] && attribute_contains($link['@attributes']['rel'], 'lrdd')) {
             if (x($link['@attributes'], 'template')) {
                 $tpl = $link['@attributes']['template'];
             } elseif (x($link['@attributes'], 'href')) {
                 $href = $link['@attributes']['href'];
             }
         }
     }
     // Only a template that actually carries the placeholder is usable.
     if (!isset($tpl) || !strpos($tpl, '{uri}')) {
         $tpl = '';
     }
     if ($priority === 'host') {
         if (strlen($tpl)) {
             $pxrd = str_replace('{uri}', urlencode($uri), $tpl);
         } elseif (isset($href)) {
             $pxrd = $href;
         }
         if (isset($pxrd)) {
             logger('lrdd: (host priority) pxrd: ' . $pxrd);
             return fetch_xrd_links($pxrd);
         }
         // Fall back to a Link header sent along with host-meta.
         $lines = explode("\n", $headers);
         foreach ($lines as $line) {
             if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
                 return fetch_xrd_links($matches[1]);
             }
         }
     }
     // priority 'resource'
     $html = fetch_url($uri);
     $headers = $a->get_curl_headers();
     logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
     // don't try and parse raw xml as html
     if (!strstr($html, '<?xml')) {
         require_once 'library/HTML5/Parser.php';
         // Initialise so the check below is well-defined when parsing throws.
         $dom = null;
         try {
             $dom = HTML5_Parser::parse($html);
         } catch (DOMException $e) {
             logger('lrdd: parse error: ' . $e);
         }
         if ($dom) {
             $items = $dom->getElementsByTagName('link');
             foreach ($items as $item) {
                 $x = $item->getAttribute('rel');
                 if ($x == "lrdd") {
                     $pagelink = $item->getAttribute('href');
                     break;
                 }
             }
         }
     }
     if (isset($pagelink)) {
         return fetch_xrd_links($pagelink);
     }
     // next look in HTTP headers
     $lines = explode("\n", $headers);
     foreach ($lines as $line) {
         // TODO alter the following regex to support multiple relations (space separated)
         if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
             $pagelink = $matches[1];
             break;
         }
         // don't try and run feeds through the html5 parser
         if (stristr($line, 'content-type:') && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
             return array();
         }
         if (stristr($html, '<rss') || stristr($html, '<feed')) {
             return array();
         }
     }
     if (isset($pagelink)) {
         return fetch_xrd_links($pagelink);
     }
     // If we haven't found any links, return the host xrd links (which we have already fetched).
     // $links is always an array at this point, possibly empty.
     return $links;
 }
Esempio n. 5
0
 /**
  * Fetch a profile page and scrape its DFRN metadata, links and hCard data.
  *
  * Three passes are made over the parsed document (<meta>, <link>, then all
  * elements). Each pass stops after a fixed node budget, and the first and
  * third also stop once every field they look for has been seen.
  *
  * @param string $url       Profile address; "tab=profile" is appended to the
  *                          query string when not already present.
  * @param int    $max_nodes Node budget per pass; values below 100 are raised
  *                          to 100.
  *
  * @return array Scraped fields plus a '_timings' entry holding the fetch
  *               and scrape durations in milliseconds. Empty when the page
  *               cannot be fetched or parsed.
  */
 function scrape_dfrn($url, $max_nodes = 3500)
 {
     // Lower bound for the per-pass node budget.
     $minNodes = 100;
     // Timeout handed to fetch_url(); it will affect batch processing.
     $timeout = 10;

     // Try and cheat our way into faster profiles.
     if (strpos($url, 'tab=profile') === false) {
         $separator = strpos($url, '?') > 0 ? '&' : '?';
         $url .= $separator . 'tab=profile';
     }

     $scrape_start = microtime(true);
     $ret = array();
     $s = fetch_url($url, $timeout);
     $scrape_fetch_end = microtime(true);
     if (!$s) {
         return $ret;
     }
     $dom = HTML5_Parser::parse($s);
     if (!$dom) {
         return $ret;
     }

     $node_budget = max(intval($max_nodes), $minNodes);

     // Pass 1: <meta> tags.
     $nodes_left = $node_budget;
     $targets = array('hide', 'comm', 'tags');
     $targets_left = count($targets);
     foreach ($dom->getElementsByTagName('meta') as $meta) {
         switch ($meta->getAttribute('name')) {
             case 'dfrn-global-visibility':
                 $content = strtolower(trim($meta->getAttribute('content')));
                 if ($content != 'true') {
                     $ret['hide'] = 1;
                 }
                 if ($content === 'false') {
                     $ret['explicit-hide'] = 1;
                 }
                 $targets_left = pop_scrape_target($targets, 'hide');
                 break;
             case 'friendika.community':
             case 'friendica.community':
                 $content = strtolower(trim($meta->getAttribute('content')));
                 if ($content == 'true') {
                     $ret['comm'] = 1;
                 }
                 $targets_left = pop_scrape_target($targets, 'comm');
                 break;
             case 'keywords':
                 $content = str_replace(',', ' ', strtolower(trim($meta->getAttribute('content'))));
                 if (strlen($content)) {
                     $ret['tags'] = $content;
                 }
                 $targets_left = pop_scrape_target($targets, 'tags');
                 break;
         }
         $nodes_left--;
         if ($nodes_left <= 0 || $targets_left <= 0) {
             break;
         }
     }

     // Pass 2: <link rel="dfrn-..."> elements.
     $nodes_left = $node_budget;
     foreach ($dom->getElementsByTagName('link') as $link) {
         $rel = $link->getAttribute('rel');
         if (strpos($rel, 'dfrn-') === 0) {
             $ret[$rel] = $link->getAttribute('href');
         }
         $nodes_left--;
         if ($nodes_left <= 0) {
             break;
         }
     }

     // Pass 3: hCard profile elements.
     // Class marker => result key, in the order the checks are applied.
     $vcard_fields = array(
         'fn' => 'fn',
         'title' => 'pdesc',
         'photo' => 'photo',
         'key' => 'key',
         'locality' => 'locality',
         'region' => 'region',
         'postal-code' => 'postal-code',
         'country-name' => 'country-name',
         'x-gender' => 'gender',
     );
     $nodes_left = $node_budget;
     $targets = array('fn', 'pdesc', 'photo', 'key', 'locality', 'region', 'postal-code', 'country-name', 'gender', 'marital');
     $targets_left = count($targets);
     foreach ($dom->getElementsByTagName('*') as $element) {
         $element_class = $element->getAttribute('class');
         if (attribute_contains($element_class, 'vcard')) {
             foreach ($element->getElementsByTagName('*') as $child) {
                 $child_class = $child->getAttribute('class');
                 foreach ($vcard_fields as $marker => $field) {
                     if (!attribute_contains($child_class, $marker)) {
                         continue;
                     }
                     // The photo lives in the src attribute; everything else is text.
                     if ($field === 'photo') {
                         $ret[$field] = $child->getAttribute('src');
                     } else {
                         $ret[$field] = $child->textContent;
                     }
                     $targets_left = pop_scrape_target($targets, $field);
                 }
             }
         }
         // The marital status is looked up on any element, not only inside a vcard.
         if (attribute_contains($element_class, 'marital-text')) {
             $ret['marital'] = $element->textContent;
             $targets_left = pop_scrape_target($targets, 'marital');
         }
         $nodes_left--;
         if ($nodes_left <= 0 || $targets_left <= 0) {
             break;
         }
     }

     $scrape_end = microtime(true);
     $ret['_timings'] = array(
         'fetch' => round(($scrape_fetch_end - $scrape_start) * 1000),
         'scrape' => round(($scrape_end - $scrape_fetch_end) * 1000),
     );
     return $ret;
 }
 /**
  * attribute_contains() must report no match for "class2" when the attribute
  * value is made up of punctuation and other special characters.
  */
 public function testAttributeContainsSpecialChars()
 {
     $this->assertFalse(
         attribute_contains("--... %\$ä() /(=?}", "class2")
     );
 }