/**
 * Extract event fields from an HTML fragment marked up with "vevent" classes.
 *
 * The fragment is wrapped in <html><body>…</body></html> and parsed. Every
 * element whose class matches 'vevent' (per attribute_contains()) has its
 * descendants inspected:
 *   - class 'dtstart' with a title attribute -> $ret['start']
 *   - class 'dtend' with a title attribute   -> $ret['finish']
 *   - class 'description'                    -> $ret['desc'] (text content)
 *   - class 'location'                       -> $ret['location'] (text content)
 * $ret['adjust'] is set to true when the start value contains no 'Z'.
 *
 * 'desc' and 'location' are run through HTMLPurifier and html2bbcode() when
 * they contain '<' or '>'; 'start' and 'finish' are passed through
 * datetime_convert('UTC', 'UTC', …).
 *
 * @param string $h HTML fragment to inspect
 * @return array possibly empty array with any of the keys
 *               'start', 'finish', 'adjust', 'desc', 'location'
 */
function parse_event($h) {
    require_once 'include/Scrape.php';
    require_once 'library/HTMLPurifier.auto.php';
    // NOTE(review): this path has no ".php" extension, unlike the other
    // includes here — confirm the file name on disk before changing it.
    require_once 'include/html2bbcode';

    $h = '<html><body>' . $h . '</body></html>';
    $ret = array();

    // Initialise so that a caught parse exception leaves a defined (falsy)
    // value for the check below instead of an undefined variable.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($h);
    } catch (DOMException $e) {
        logger('parse_event: parse error: ' . $e);
    }

    if (!$dom) {
        return $ret;
    }

    foreach ($dom->getElementsByTagName('*') as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vevent')) {
            continue;
        }

        foreach ($item->getElementsByTagName('*') as $node) {
            $class = $node->getAttribute('class');
            $title = $node->getAttribute('title');

            if (attribute_contains($class, 'dtstart') && $title) {
                $ret['start'] = $title;
                // strpos() returns 0 for a match at the first character, so
                // compare against false explicitly rather than negating it.
                if (strpos($ret['start'], 'Z') === false) {
                    $ret['adjust'] = true;
                }
            }
            if (attribute_contains($class, 'dtend') && $title) {
                $ret['finish'] = $title;
            }
            if (attribute_contains($class, 'description')) {
                $ret['desc'] = $node->textContent;
            }
            if (attribute_contains($class, 'location')) {
                $ret['location'] = $node->textContent;
            }
        }
    }

    // sanitise: only fields that still contain angle brackets are purified
    // and converted to bbcode
    foreach (array('desc', 'location') as $field) {
        if (x($ret, $field)
            && (strpos($ret[$field], '<') !== false || strpos($ret[$field], '>') !== false)) {
            $config = HTMLPurifier_Config::createDefault();
            $config->set('Cache.DefinitionImpl', null);
            $purifier = new HTMLPurifier($config);
            $ret[$field] = html2bbcode($purifier->purify($ret[$field]));
        }
    }

    if (x($ret, 'start')) {
        $ret['start'] = datetime_convert('UTC', 'UTC', $ret['start']);
    }
    if (x($ret, 'finish')) {
        $ret['finish'] = datetime_convert('UTC', 'UTC', $ret['finish']);
    }

    return $ret;
}
/**
 * Fetch a page and collect its "dfrn-*" link relations and hCard fields.
 *
 * Every <link> whose rel attribute starts with "dfrn-" is stored under that
 * rel value with its href. Inside each element whose class matches 'vcard',
 * descendants with class 'fn', 'photo' and 'key' populate $ret['fn'] (text),
 * $ret['photo'] (src attribute) and $ret['key'] (text).
 *
 * @param string $url address of the page to fetch
 * @return array collected values; empty when the fetch or the parse yields nothing
 */
function scrape_dfrn($url) {
    $ret = array();

    $body = fetch_url($url);
    if (!$body) {
        return $ret;
    }

    $dom = HTML5_Parser::parse($body);
    if (!$dom) {
        return $ret;
    }

    // get DFRN link elements
    foreach ($dom->getElementsByTagName('link') as $link) {
        $rel = $link->getAttribute('rel');
        if (substr($rel, 0, 5) === 'dfrn-') {
            $ret[$rel] = $link->getAttribute('href');
        }
    }

    // Pull out hCard profile elements
    foreach ($dom->getElementsByTagName('*') as $container) {
        if (!attribute_contains($container->getAttribute('class'), 'vcard')) {
            continue;
        }

        foreach ($container->getElementsByTagName('*') as $child) {
            $class = $child->getAttribute('class');

            if (attribute_contains($class, 'fn')) {
                $ret['fn'] = $child->textContent;
            }
            if (attribute_contains($class, 'photo')) {
                $ret['photo'] = $child->getAttribute('src');
            }
            if (attribute_contains($class, 'key')) {
                $ret['key'] = $child->textContent;
            }
        }
    }

    return $ret;
}
/**
 * Fetch a page and extract basic hCard identity fields from it.
 *
 * Responses whose headers announce an Atom or RSS content type are not
 * parsed. Inside each element whose class matches 'vcard', descendants
 * provide:
 *   - class 'fn'                -> $ret['fn'] (text content)
 *   - class 'photo' / 'avatar'  -> $ret['photo'] (src attribute); an image
 *     replaces the current choice when its width attribute is larger, or
 *     while the largest width seen so far is still 0
 *   - class 'nickname' / 'uid'  -> $ret['nick'] (text content)
 *
 * @param string $url address of the page to fetch
 * @return array possibly empty array with any of the keys 'fn', 'photo', 'nick'
 */
function scrape_vcard($url) {
    // NOTE(review): $a is not used below; the call is kept in case get_app()
    // does more than return a value — confirm and remove if it does not.
    $a = get_app();
    $ret = array();

    logger('scrape_vcard: url=' . $url);

    $x = z_fetch_url($url);
    if (!$x['success']) {
        return $ret;
    }

    $s = $x['body'];
    if (!$s) {
        return $ret;
    }

    $headers = $x['header'];
    foreach (explode("\n", $headers) as $line) {
        // don't try and run feeds through the html5 parser
        if (stristr($line, 'content-type:')
            && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
            // Return the (empty) result array; the bare word `ret` used here
            // previously was an undefined constant, not the variable.
            return $ret;
        }
    }

    // Initialise so that a caught parse exception leaves a defined (falsy)
    // value for the check below instead of an undefined variable.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_vcard: parse error: ' . $e);
    }

    if (!$dom) {
        return $ret;
    }

    // Pull out hCard profile elements
    $largest_photo = 0;

    foreach ($dom->getElementsByTagName('*') as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vcard')) {
            continue;
        }

        foreach ($item->getElementsByTagName('*') as $child) {
            $class = $child->getAttribute('class');

            if (attribute_contains($class, 'fn')) {
                $ret['fn'] = $child->textContent;
            }

            if (attribute_contains($class, 'photo') || attribute_contains($class, 'avatar')) {
                $size = intval($child->getAttribute('width'));
                if ($size > $largest_photo || !$largest_photo) {
                    $ret['photo'] = $child->getAttribute('src');
                    $largest_photo = $size;
                }
            }

            if (attribute_contains($class, 'nickname') || attribute_contains($class, 'uid')) {
                $ret['nick'] = $child->textContent;
            }
        }
    }

    return $ret;
}
/**
 * Discover XRD links for a URI.
 *
 * Steps, in order:
 *   1. A URI containing '@' is handed straight to webfinger().
 *   2. The host's /.well-known/host-meta document is fetched and parsed; its
 *      properties may switch the priority from 'host' to 'resource', and its
 *      links are remembered.
 *   3. With host priority, an 'lrdd' link template/href from host-meta, or
 *      failing that an HTTP "Link:" header with rel="lrdd", is followed via
 *      fetch_xrd_links().
 *   4. Otherwise the resource itself is fetched and a <link rel="lrdd"> in
 *      the HTML, then a "Link:" header, is followed. Feeds (Atom/RSS) yield
 *      an empty array.
 *   5. If nothing was found, the links saved from host-meta are returned.
 *
 * @param string $uri resource identifier (URL or address containing '@')
 * @return array result of webfinger()/fetch_xrd_links(), the host-meta
 *               links, or an empty array
 */
function lrdd($uri) {
    $a = get_app();

    // default priority is host priority, host-meta first
    $priority = 'host';

    // All we have is an email address. Resource-priority is irrelevant
    // because our URI isn't directly resolvable.
    if (strstr($uri, '@')) {
        return webfinger($uri);
    }

    // get the host meta file
    $host = @parse_url($uri);
    // Without a host component no host-meta URL can be built.
    if (!$host || !isset($host['host'])) {
        return array();
    }
    $url = (x($host, 'scheme') ? $host['scheme'] : 'http') . '://';
    $url .= $host['host'] . '/.well-known/host-meta';

    logger('lrdd: constructed url: ' . $url);

    $xml = fetch_url($url);
    $headers = $a->get_curl_headers();
    if (!$xml) {
        return array();
    }

    logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);

    $h = parse_xml_string($xml);
    if (!$h) {
        return array();
    }

    $arr = convert_xml_element_to_array($h);

    if (isset($arr['xrd']['property'])) {
        // Read the same 'xrd' key that was just tested (this used to read
        // the non-existent 'crd' key, so the priority could never change).
        $property = $arr['xrd']['property'];
        $properties = isset($property[0]) ? $property : array($property);

        foreach ($properties as $prop) {
            $attrs = isset($prop['@attributes']) ? $prop['@attributes'] : '';
            if (is_array($attrs)) {
                // Links below expose '@attributes' as an array keyed by
                // attribute name; casting such an array to string would give
                // "Array". NOTE(review): assumes the priority URI sits in the
                // property's 'type' attribute, as in XRD — confirm against
                // convert_xml_element_to_array() output.
                $attrs = isset($attrs['type']) ? $attrs['type'] : '';
            }
            if ((string) $attrs === 'http://lrdd.net/priority/resource') {
                $priority = 'resource';
            }
        }
    }

    // save the links in case we need them
    $links = array();
    if (isset($arr['xrd']['link'])) {
        $link = $arr['xrd']['link'];
        // a single link is not wrapped in a list; normalise to a list
        $links = isset($link[0]) ? $link : array($link);
    }

    // do we have a template or href?
    foreach ($links as $link) {
        if (!empty($link['@attributes']['rel'])
            && attribute_contains($link['@attributes']['rel'], 'lrdd')) {
            if (x($link['@attributes'], 'template')) {
                $tpl = $link['@attributes']['template'];
            } elseif (x($link['@attributes'], 'href')) {
                $href = $link['@attributes']['href'];
            }
        }
    }

    if (!isset($tpl) || !strpos($tpl, '{uri}')) {
        $tpl = '';
    }

    if ($priority === 'host') {
        if (strlen($tpl)) {
            $pxrd = str_replace('{uri}', urlencode($uri), $tpl);
        } elseif (isset($href)) {
            $pxrd = $href;
        }

        if (isset($pxrd)) {
            logger('lrdd: (host priority) pxrd: ' . $pxrd);
            $links = fetch_xrd_links($pxrd);
            return $links;
        }

        // fall back to a Link: header from the host-meta response
        foreach (explode("\n", $headers) as $line) {
            if (stristr($line, 'link:')
                && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
                return fetch_xrd_links($matches[1]);
            }
        }
    }

    // priority 'resource'
    $html = fetch_url($uri);
    $headers = $a->get_curl_headers();
    logger('lrdd: headers=' . $headers, LOGGER_DEBUG);

    // don't try and parse raw xml as html
    if (!strstr($html, '<?xml')) {
        require_once 'library/HTML5/Parser.php';

        // Initialise so that a caught parse exception leaves a defined
        // (falsy) value for the check below.
        $dom = null;
        try {
            $dom = HTML5_Parser::parse($html);
        } catch (DOMException $e) {
            logger('lrdd: parse error: ' . $e);
        }

        if ($dom) {
            foreach ($dom->getElementsByTagName('link') as $item) {
                if ($item->getAttribute('rel') === 'lrdd') {
                    $pagelink = $item->getAttribute('href');
                    break;
                }
            }
        }
    }

    if (isset($pagelink)) {
        return fetch_xrd_links($pagelink);
    }

    // next look in HTTP headers
    foreach (explode("\n", $headers) as $line) {
        // TODO alter the following regex to support multiple relations (space separated)
        if (stristr($line, 'link:')
            && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
            $pagelink = $matches[1];
            break;
        }

        // don't try and run feeds through the html5 parser
        if (stristr($line, 'content-type:')
            && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
            return array();
        }
        if (stristr($html, '<rss') || stristr($html, '<feed')) {
            return array();
        }
    }

    if (isset($pagelink)) {
        return fetch_xrd_links($pagelink);
    }

    // If we haven't found any links, return the host xrd links (which we
    // have already fetched); $links is always an array at this point.
    return $links;
}
/**
 * Fetch a profile page and scrape DFRN metadata, link relations and hCard
 * fields from it, bounded by a per-pass node budget.
 *
 * Three passes are made over the parsed document: <meta> elements, <link>
 * elements, and all elements (for hCard data). Each pass stops after
 * max($max_nodes, 100) top-level iterations; the first and third pass also
 * stop once pop_scrape_target() reports no targets left.
 *
 * @param string $url       profile address; "tab=profile" is appended as a
 *                          query parameter when not already present
 * @param int    $max_nodes upper bound on nodes visited per pass (at least 100 is used)
 * @return array scraped values plus '_timings' => array('fetch' => ms, 'scrape' => ms);
 *               an empty array when the fetch or the parse yields nothing
 */
function scrape_dfrn($url, $max_nodes = 3500) {
    $minNodes = 100; // Lets do at least 100 nodes per type.
    $timeout = 10;   // Timeout will affect batch processing.

    // Try and cheat our way into faster profiles.
    if (strpos($url, 'tab=profile') === false) {
        $separator = (strpos($url, '?') > 0) ? '&' : '?';
        $url .= $separator . 'tab=profile';
    }

    $scrape_start = microtime(true);
    $ret = array();

    $page = fetch_url($url, $timeout);
    $scrape_fetch_end = microtime(true);
    if (!$page) {
        return $ret;
    }

    $dom = HTML5_Parser::parse($page);
    if (!$dom) {
        return $ret;
    }

    // the same budget applies to each of the three passes
    $node_budget = max(intval($max_nodes), $minNodes);

    // --- pass 1: <meta> elements ---
    $nodes_left = $node_budget;
    $targets = array('hide', 'comm', 'tags');
    $targets_left = count($targets);

    foreach ($dom->getElementsByTagName('meta') as $meta) {
        switch ($meta->getAttribute('name')) {
            case 'dfrn-global-visibility':
                $content = strtolower(trim($meta->getAttribute('content')));
                if ($content != 'true') {
                    $ret['hide'] = 1;
                }
                if ($content === 'false') {
                    $ret['explicit-hide'] = 1;
                }
                $targets_left = pop_scrape_target($targets, 'hide');
                break;

            case 'friendika.community':
            case 'friendica.community':
                $content = strtolower(trim($meta->getAttribute('content')));
                if ($content == 'true') {
                    $ret['comm'] = 1;
                }
                $targets_left = pop_scrape_target($targets, 'comm');
                break;

            case 'keywords':
                $content = str_replace(',', ' ', strtolower(trim($meta->getAttribute('content'))));
                if (strlen($content)) {
                    $ret['tags'] = $content;
                }
                $targets_left = pop_scrape_target($targets, 'tags');
                break;
        }

        $nodes_left--;
        if ($nodes_left <= 0 || $targets_left <= 0) {
            break;
        }
    }

    // --- pass 2: <link rel="dfrn-*"> elements ---
    $nodes_left = $node_budget;

    foreach ($dom->getElementsByTagName('link') as $link) {
        $rel = $link->getAttribute('rel');
        if (substr($rel, 0, 5) == "dfrn-") {
            $ret[$rel] = $link->getAttribute('href');
        }

        $nodes_left--;
        if ($nodes_left <= 0) {
            break;
        }
    }

    // --- pass 3: hCard profile elements ---
    $nodes_left = $node_budget;
    $targets = array('fn', 'pdesc', 'photo', 'key', 'locality',
        'region', 'postal-code', 'country-name', 'gender', 'marital');
    $targets_left = count($targets);

    // class needle, result/target key, whether the value comes from the src
    // attribute (true) or the text content (false); checked in this order
    $hcard_fields = array(
        array('fn', 'fn', false),
        array('title', 'pdesc', false),
        array('photo', 'photo', true),
        array('key', 'key', false),
        array('locality', 'locality', false),
        array('region', 'region', false),
        array('postal-code', 'postal-code', false),
        array('country-name', 'country-name', false),
        array('x-gender', 'gender', false),
    );

    foreach ($dom->getElementsByTagName('*') as $item) {
        if (attribute_contains($item->getAttribute('class'), 'vcard')) {
            foreach ($item->getElementsByTagName('*') as $child) {
                $class = $child->getAttribute('class');

                foreach ($hcard_fields as $field) {
                    list($needle, $key, $from_src) = $field;
                    if (attribute_contains($class, $needle)) {
                        $ret[$key] = $from_src ? $child->getAttribute('src') : $child->textContent;
                        $targets_left = pop_scrape_target($targets, $key);
                    }
                }
            }
        }

        // the marital status is looked for on the top-level node itself,
        // not only inside a vcard container
        if (attribute_contains($item->getAttribute('class'), 'marital-text')) {
            $ret['marital'] = $item->textContent;
            $targets_left = pop_scrape_target($targets, 'marital');
        }

        $nodes_left--;
        if ($nodes_left <= 0 || $targets_left <= 0) {
            break;
        }
    }

    $scrape_end = microtime(true);

    // both durations are reported in whole milliseconds
    $ret['_timings'] = array(
        'fetch' => round(($scrape_fetch_end - $scrape_start) * 1000),
        'scrape' => round(($scrape_end - $scrape_fetch_end) * 1000),
    );

    return $ret;
}
/**
 * An attribute value made up of punctuation and non-ASCII characters must
 * not be reported as containing an unrelated class name.
 */
public function testAttributeContainsSpecialChars()
{
    $specialCharsAttr = "--... %\$ä() /(=?}";

    $found = attribute_contains($specialCharsAttr, "class2");

    $this->assertFalse($found);
}