/**
 * Extract hCalendar ("vevent") details from an HTML fragment.
 *
 * The fragment is wrapped in <html><body> and parsed with HTML5_Parser. For
 * every element whose class contains "vevent", its descendants are scanned
 * for the classes dtstart / dtend (value taken from the title attribute),
 * description and location (value taken from the text content).
 *
 * @param string $h HTML fragment to scan.
 *
 * @return array Possibly empty array with any of the keys 'start', 'finish',
 *               'adjust', 'desc' and 'location'.
 */
function parse_event($h)
{
    require_once 'include/Scrape.php';
    require_once 'library/HTMLPurifier.auto.php';
    // The original path had no ".php" extension, unlike every other include here.
    require_once 'include/html2bbcode.php';

    $h = '<html><body>' . $h . '</body></html>';
    $ret = array();

    // Pre-set so that a caught parse exception leaves a defined, falsy value.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($h);
    } catch (DOMException $e) {
        logger('parse_event: parse error: ' . $e);
    }

    if (!$dom) {
        return $ret;
    }

    foreach ($dom->getElementsByTagName('*') as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vevent')) {
            continue;
        }

        foreach ($item->getElementsByTagName('*') as $x) {
            $class = $x->getAttribute('class');
            $title = $x->getAttribute('title');

            if (attribute_contains($class, 'dtstart') && $title) {
                $ret['start'] = $title;
                // No "Z" in the timestamp: flag the event via 'adjust'.
                if (strpos($ret['start'], 'Z') === false) {
                    $ret['adjust'] = true;
                }
            }
            if (attribute_contains($class, 'dtend') && $title) {
                $ret['finish'] = $title;
            }
            if (attribute_contains($class, 'description')) {
                $ret['desc'] = $x->textContent;
            }
            if (attribute_contains($class, 'location')) {
                $ret['location'] = $x->textContent;
            }
        }
    }

    // sanitise: free-text fields that still contain angle brackets are purified
    // and converted to BBCode.
    foreach (array('desc', 'location') as $field) {
        if (x($ret, $field) && (strpos($ret[$field], '<') !== false || strpos($ret[$field], '>') !== false)) {
            $config = HTMLPurifier_Config::createDefault();
            $config->set('Cache.DefinitionImpl', null);
            $purifier = new HTMLPurifier($config);
            $ret[$field] = html2bbcode($purifier->purify($ret[$field]));
        }
    }

    foreach (array('start', 'finish') as $field) {
        if (x($ret, $field)) {
            $ret[$field] = datetime_convert('UTC', 'UTC', $ret[$field]);
        }
    }

    return $ret;
}
/**
 * Echo an HTML link (plus a short text excerpt) for the page named by $_GET['url'].
 *
 * The page is fetched and parsed; the link text is the first <title> element and
 * the excerpt is the first <p> whose text is at least 100 bytes long, cut to 250
 * bytes. Every exit path echoes its result and then calls killme().
 *
 * NOTE(review): the URL, title and excerpt are written into HTML without
 * escaping. Confirm that the consumer sanitises this output.
 *
 * @param object $a Application object (not used by this implementation).
 *
 * @return void
 */
function parse_url_content(&$a)
{
    $url = isset($_GET['url']) ? trim($_GET['url']) : '';
    $template = "<a href=\"%s\" >%s</a>%s";

    if ($url) {
        $s = fetch_url($url);
    } else {
        echo '';
        killme();
    }

    if (!$s) {
        echo sprintf($template, $url, $url, '');
        killme();
    }

    $dom = HTML5_Parser::parse($s);
    if (!$dom) {
        // Previously this returned an undefined variable; answer with the bare
        // link instead, exactly like the "nothing fetched" case above.
        echo sprintf($template, $url, $url, '');
        killme();
    }

    $title = '';
    foreach ($dom->getElementsByTagName('title') as $item) {
        $title = $item->textContent;
        break;
    }

    $text = '';
    foreach ($dom->getElementsByTagName('p') as $item) {
        // Work on a candidate so that a rejected (too short) paragraph can never
        // leak into the output when no paragraph qualifies.
        $candidate = strip_tags($item->textContent);
        if (strlen($candidate) < 100) {
            continue;
        }
        $text = substr($candidate, 0, 250) . '...';
        break;
    }

    if (strlen($text)) {
        $text = '<br />' . $text;
    }

    // Fall back to the URL so the link never has empty text.
    echo sprintf($template, $url, $title ? $title : $url, $text);
    killme();
}
/**
 * Collect the "dfrn-*" <meta> entries of a remote page.
 *
 * @param string $url Address of the page to fetch.
 *
 * @return array Map of meta name => content for every <meta> element whose name
 *               starts with "dfrn-"; empty when the page could not be fetched
 *               or parsed.
 */
function scrape_meta($url)
{
    $found = array();

    $body = fetch_url($url);
    if (!$body) {
        return $found;
    }

    $document = HTML5_Parser::parse($body);
    if (!$document) {
        return $found;
    }

    // get DFRN link elements
    foreach ($document->getElementsByTagName('meta') as $meta) {
        $name = $meta->getAttribute('name');
        if (strncmp($name, 'dfrn-', 5) === 0) {
            $found[$name] = $meta->getAttribute('content');
        }
    }

    return $found;
}
/**
 * Parse an HTML fragment with HTML5lib and store the result on this object.
 *
 * The fragment is wrapped in a minimal UTF-8 document before parsing. While the
 * parser runs, PHP errors covered by the current error_reporting() level are
 * converted to exceptions, so any such error marks the content as invalid.
 *
 * @param string $content HTML fragment to parse.
 *
 * @return void Calls setDocument() on success, setInvalid() otherwise.
 */
public function setContent($content)
{
    require_once HTML5LIB_PATH . '/HTML5/Parser.php';

    // Convert any errors to exceptions
    set_error_handler(function ($no, $str) {
        throw new Exception("HTML Parse Error: " . $str);
    }, error_reporting());

    // Use HTML5lib to parse the HTML fragment
    try {
        $document = HTML5_Parser::parse('<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>' . "<body>{$content}</body></html>");
    } catch (Exception $e) {
        $document = false;
    } finally {
        // Restore the previous handler on every exit path, including throwables
        // that are not caught above, so the temporary handler can never leak.
        restore_error_handler();
    }

    // If we couldn't parse the HTML, set the error state
    if ($document) {
        $this->setDocument($document);
    } else {
        $this->setInvalid();
    }
}
/**
 * Echo an HTML "bookmark" snippet (link, optional image, excerpt, tags) for a URL.
 *
 * Inputs are read from $_GET: 'binurl' (hex encoded) or 'url', plus optional
 * 'title', 'description' and 'tags' (CSV). The 'parse_link' hook may supply the
 * whole answer. Otherwise, when title and description were both supplied, the
 * snippet is built without fetching; failing that, the page is fetched, purified
 * and parsed to find a title, a paragraph excerpt and, as a last resort,
 * ":description" / ":image" <meta> properties from the page head.
 *
 * Every exit path echoes its result and then calls killme().
 *
 * NOTE(review): the URL and the image address are written into HTML attributes
 * without escaping. Confirm that the consumer sanitises this output.
 *
 * @param object $a Application object (not used by this implementation).
 *
 * @return void
 */
function parse_url_content(&$a)
{
    // Everything that is read further down gets a defined starting value.
    $text = null;
    $title = '';
    $image = '';
    $str_tags = '';
    $dom = null;
    $domhead = null;

    if (x($_GET, 'binurl')) {
        $url = trim(hex2bin($_GET['binurl']));
    } else {
        $url = isset($_GET['url']) ? trim($_GET['url']) : '';
    }

    if (!empty($_GET['title'])) {
        $title = strip_tags(trim($_GET['title']));
    }

    if (!empty($_GET['description'])) {
        $text = strip_tags(trim($_GET['description']));
    }

    if (!empty($_GET['tags'])) {
        $arr_tags = str_getcsv($_GET['tags']);
        if (count($arr_tags)) {
            array_walk($arr_tags, 'arr_add_hashes');
            $str_tags = '<br />' . implode(' ', $arr_tags) . '<br />';
        }
    }

    logger('parse_url: ' . $url);

    $template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

    // Give plugins the chance to provide the complete answer.
    $arr = array('url' => $url, 'text' => '');
    call_hooks('parse_link', $arr);

    if (strlen($arr['text'])) {
        echo $arr['text'];
        killme();
    }

    // The caller supplied everything: no need to fetch the page.
    if ($url && $title && $text) {
        $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
        $title = str_replace(array("\r", "\n"), array('', ''), $title);
        $result = sprintf($template, $url, $title ? $title : $url, $text) . $str_tags;
        logger('parse_url (unparsed): returns: ' . $result);
        echo $result;
        killme();
    }

    if ($url) {
        $s = fetch_url($url);
    } else {
        echo '';
        killme();
    }

    if (!$s) {
        echo sprintf($template, $url, $url, '') . $str_tags;
        killme();
    }

    // Parse the <head> separately and before purification; its <meta> elements
    // are the last-resort source for the excerpt and image below.
    $matches = '';
    $c = preg_match('/\\<head(.*?)\\>(.*?)\\<\\/head\\>/ism', $s, $matches);
    if ($c) {
        try {
            $domhead = HTML5_Parser::parse($matches[2]);
        } catch (DOMException $e) {
            logger('parse_url: parse error: ' . $e);
        }
        if ($domhead) {
            logger('parsed header');
        }
    }

    // Cheap title extraction from the raw markup (at most 64 bytes).
    if (!$title) {
        if (strpos($s, '<title>')) {
            $title = substr($s, strpos($s, '<title>') + 7, 64);
            if (strpos($title, '<') !== false) {
                $title = strip_tags(substr($title, 0, strpos($title, '<')));
            }
        }
    }

    $config = HTMLPurifier_Config::createDefault();
    $config->set('Cache.DefinitionImpl', null);
    $purifier = new HTMLPurifier($config);
    $s = $purifier->purify($s);

    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('parse_url: parse error: ' . $e);
    }

    if (!$dom) {
        echo sprintf($template, $url, $url, '') . $str_tags;
        killme();
    }

    foreach ($dom->getElementsByTagName('title') as $item) {
        $title = trim($item->textContent);
        break;
    }

    // Returns the first usable paragraph (no "<script", at least 100 bytes) cut
    // to 250 bytes, or '' when none qualifies. Rejected candidates never leak.
    $pick_paragraph = function ($paragraphs) {
        foreach ($paragraphs as $paragraph) {
            $candidate = $paragraph->textContent;
            if (stristr($candidate, '<script')) {
                continue;
            }
            $candidate = strip_tags($candidate);
            if (strlen($candidate) < 100) {
                continue;
            }
            return substr($candidate, 0, 250) . '...';
        }
        return '';
    };

    if (!$text) {
        // Prefer paragraphs inside containers classed "article" / "content".
        foreach ($dom->getElementsByTagName('div') as $div) {
            $class = $div->getAttribute('class');
            if ($class && (stristr($class, 'article') || stristr($class, 'content'))) {
                $text = $pick_paragraph($div->getElementsByTagName('p'));
            }
            if ($text) {
                break;
            }
        }

        if (!$text) {
            $text = $pick_paragraph($dom->getElementsByTagName('p'));
        }
    }

    // Last resort: meta properties from the head, if a head was parsed at all
    // (calling a method on a missing head used to be a fatal error).
    if (!$text && $domhead) {
        logger('parsing meta');
        foreach ($domhead->getElementsByTagName('meta') as $item) {
            $property = $item->getAttribute('property');
            if (!$property) {
                continue;
            }

            if (stristr($property, ':description')) {
                $text = $item->getAttribute('content');
                if (stristr($text, '<script')) {
                    $text = '';
                    continue;
                }
                $text = strip_tags($text);
                $text = substr($text, 0, 250) . '...';
            }

            if (stristr($property, ':image')) {
                $image_url = $item->getAttribute('content');
                // The script check must look at the image value, not at $text.
                if (stristr($image_url, '<script')) {
                    $image = '';
                    continue;
                }
                $image_url = strip_tags($image_url);

                // Only emit markup for an image that was fetched and validated;
                // otherwise leave $image empty rather than echoing a bare URL.
                $image = '';
                $i = fetch_url($image_url);
                if ($i) {
                    require_once 'include/Photo.php';
                    $ph = new Photo($i);
                    if ($ph->is_valid()) {
                        if ($ph->getWidth() > 300 || $ph->getHeight() > 300) {
                            $ph->scaleImage(300);
                            $new_width = $ph->getWidth();
                            $new_height = $ph->getHeight();
                            $image = '<br /><br /><img height="' . $new_height . '" width="' . $new_width . '" src="' . $image_url . '" alt="photo" />';
                        } else {
                            $image = '<br /><br /><img src="' . $image_url . '" alt="photo" />';
                        }
                    }
                }
            }
        }
    }

    if (strlen($text)) {
        $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
    }

    if ($image) {
        $text = $image . '<br />' . $text;
    }

    $title = str_replace(array("\r", "\n"), array('', ''), $title);

    $result = sprintf($template, $url, $title ? $title : $url, $text) . $str_tags;

    logger('parse_url: returns: ' . $result);

    echo $result;
    killme();
}
/**
 * Load HTML in a DOMDocument.
 * Apply Pre filters
 * Cleanup HTML using Tidy (or not).
 *
 * Steps, in order: remember the original HTML, derive the domain regexp from the
 * URL host, force the mbstring encodings to UTF-8, run the configured
 * pre-filters, optionally tidy the markup, convert it to HTML entities and
 * finally build $this->dom, with html5lib when selected and successful and with
 * DOMDocument otherwise.
 *
 * @todo This should be called in init() instead of from __construct
 */
private function loadHtml()
{
    $this->original_html = $this->html;

    $this->logger->debug('Parsing URL: ' . $this->url);

    if ($this->url) {
        // parse_url() yields null/false when there is no host; cast so that
        // preg_replace() always receives a string.
        $host = (string) parse_url($this->url, PHP_URL_HOST);
        $this->domainRegExp = '/' . strtr(preg_replace('/www\\d*\\./', '', $host), array('.' => '\\.')) . '/';
    }

    mb_internal_encoding('UTF-8');
    mb_http_output('UTF-8');
    mb_regex_encoding('UTF-8');

    // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
    if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
        foreach ($this->pre_filters as $search => $replace) {
            $this->html = preg_replace($search, $replace, $this->html);
        }
        unset($search, $replace);
    }

    if (trim($this->html) === '') {
        $this->html = '<html></html>';
    }

    /*
     * Use tidy (if it exists).
     * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
     * Although sometimes it makes matters worse, which is why there is an option to disable it.
     */
    if ($this->useTidy) {
        $this->logger->debug('Tidying document');

        $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
        if (tidy_clean_repair($tidy)) {
            $this->tidied = true;
            $this->html = $tidy->value;
            $this->html = preg_replace('/[\\r\\n]+/is', "\n", $this->html);
        }
        unset($tidy);
    }

    $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');

    if (!($this->parser === 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html)))) {
        // Silence libxml while loading, then put the caller's setting back
        // instead of unconditionally switching internal errors off.
        $previousLibxmlMode = libxml_use_internal_errors(true);

        $this->dom = new \DOMDocument();
        $this->dom->preserveWhiteSpace = false;

        if (PHP_VERSION_ID >= 50400) {
            $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
        } else {
            $this->dom->loadHTML($this->html);
        }

        // Drop whatever was buffered by this load before restoring the mode.
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlMode);
    }

    $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
}
/**
 * Build a Readability instance around a parsed copy of the given HTML.
 *
 * Before parsing, the markup is rewritten with the 'replaceBrs' and
 * 'replaceFonts' patterns and converted to HTML entities; blank input becomes
 * an empty <html> document.
 *
 * @param string $html   UTF-8 encoded string
 * @param string $url    (optional) URL associated with HTML (used for footnotes)
 * @param string $parser which parser to use for turning raw HTML into a
 *                       DOMDocument (either 'libxml' or 'html5lib')
 */
function __construct($html, $url = null, $parser = 'libxml')
{
    $this->url = $url;

    /* Turn all double br's into p's */
    $markup = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    $markup = preg_replace($this->regexps['replaceFonts'], '<$1span>', $markup);
    $markup = mb_convert_encoding($markup, 'HTML-ENTITIES', "UTF-8");

    if (trim($markup) == '') {
        $markup = '<html></html>';
    }

    // html5lib is used only on request; libxml takes over when it was not
    // requested or when it returned nothing usable.
    $usedHtml5lib = false;
    if ($parser == 'html5lib') {
        $this->dom = HTML5_Parser::parse($markup);
        $usedHtml5lib = (bool) $this->dom;
    }

    if (!$usedHtml5lib) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($markup);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
/**
 * Resolve the XRD links for a URI using LRDD discovery.
 *
 * Addresses containing "@" are handed to webfinger(). Otherwise the host's
 * /.well-known/host-meta document is fetched and, depending on the priority it
 * declares ("host" by default, "resource" when announced), the LRDD location is
 * taken from the host-meta links, from HTTP "Link:" headers, or from a
 * <link rel="lrdd"> element in the resource itself. When nothing more specific
 * is found, the links of the host-meta document are returned.
 *
 * @param string $uri Address or URL to resolve.
 *
 * @return array Whatever webfinger() / fetch_xrd_links() return, the host-meta
 *               links, or an empty array when discovery is not possible.
 */
function lrdd($uri)
{
    $a = get_app();

    // default priority is host priority, host-meta first
    $priority = 'host';

    // All we have is an email address. Resource-priority is irrelevant
    // because our URI isn't directly resolvable.
    if (strstr($uri, '@')) {
        return webfinger($uri);
    }

    // get the host meta file
    $host = @parse_url($uri);
    if (!$host || empty($host['host'])) {
        // Without a host there is no host-meta document to ask for.
        return array();
    }

    $url = (x($host, 'scheme') ? $host['scheme'] : 'http') . '://';
    $url .= $host['host'] . '/.well-known/host-meta';

    logger('lrdd: constructed url: ' . $url);

    $xml = fetch_url($url);
    $headers = $a->get_curl_headers();

    if (!$xml) {
        return array();
    }

    logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);

    $h = parse_xml_string($xml);
    if (!$h) {
        return array();
    }

    $arr = convert_xml_element_to_array($h);

    if (isset($arr['xrd']['property'])) {
        // Was read from the misspelt key 'crd', so it was never found.
        $property = $arr['xrd']['property'];
        $properties = isset($property[0]) ? $property : array($property);

        foreach ($properties as $prop) {
            // NOTE(review): the shape of '@attributes' is not visible here; both a
            // plain string and an attribute array are accepted - confirm against
            // convert_xml_element_to_array().
            $attributes = (is_array($prop) && isset($prop['@attributes'])) ? $prop['@attributes'] : '';
            $values = is_array($attributes) ? $attributes : array((string) $attributes);
            if (in_array('http://lrdd.net/priority/resource', $values, true)) {
                $priority = 'resource';
            }
        }
    }

    // save the links in case we need them
    $links = array();

    if (isset($arr['xrd']['link'])) {
        $link = $arr['xrd']['link'];
        $links = isset($link[0]) ? $link : array($link);
    }

    // do we have a template or href?
    foreach ($links as $link) {
        if ($link['@attributes']['rel'] && attribute_contains($link['@attributes']['rel'], 'lrdd')) {
            if (x($link['@attributes'], 'template')) {
                $tpl = $link['@attributes']['template'];
            } elseif (x($link['@attributes'], 'href')) {
                $href = $link['@attributes']['href'];
            }
        }
    }

    // A template is only usable when it carries the {uri} placeholder
    // (and, as before, not at the very first position).
    if (!isset($tpl) || !strpos($tpl, '{uri}')) {
        $tpl = '';
    }

    if ($priority === 'host') {
        if (strlen($tpl)) {
            $pxrd = str_replace('{uri}', urlencode($uri), $tpl);
        } elseif (isset($href)) {
            $pxrd = $href;
        }

        if (isset($pxrd)) {
            logger('lrdd: (host priority) pxrd: ' . $pxrd);
            $links = fetch_xrd_links($pxrd);
            return $links;
        }

        // Look for an LRDD "Link:" header in the host-meta response.
        foreach (explode("\n", $headers) as $line) {
            if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
                return fetch_xrd_links($matches[1]);
            }
        }
    }

    // priority 'resource'
    $html = fetch_url($uri);
    $headers = $a->get_curl_headers();
    logger('lrdd: headers=' . $headers, LOGGER_DEBUG);

    // don't try and parse raw xml as html
    if (!strstr($html, '<?xml')) {
        require_once 'library/HTML5/Parser.php';

        // Pre-set so that a caught parse exception leaves a defined, falsy value.
        $dom = null;
        try {
            $dom = HTML5_Parser::parse($html);
        } catch (DOMException $e) {
            logger('lrdd: parse error: ' . $e);
        }

        if ($dom) {
            foreach ($dom->getElementsByTagName('link') as $item) {
                if ($item->getAttribute('rel') == "lrdd") {
                    $pagelink = $item->getAttribute('href');
                    break;
                }
            }
        }
    }

    if (isset($pagelink)) {
        return fetch_xrd_links($pagelink);
    }

    // next look in HTTP headers
    foreach (explode("\n", $headers) as $line) {
        // TODO alter the following regex to support multiple relations (space separated)
        if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
            $pagelink = $matches[1];
            break;
        }

        // don't try and run feeds through the html5 parser
        if (stristr($line, 'content-type:') && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
            return array();
        }
        if (stristr($html, '<rss') || stristr($html, '<feed')) {
            return array();
        }
    }

    if (isset($pagelink)) {
        return fetch_xrd_links($pagelink);
    }

    // If we haven't found any links, return the host xrd links (which we have already fetched)
    return $links;
}
/**
 * Parsing a minimal document must produce a DOMDocument.
 */
public function testParse()
{
    $this->assertIsA(
        HTML5_Parser::parse('<html><body></body></html>'),
        'DOMDocument'
    );
}
/**
 * Extract hCard ("vcard") profile data from a remote page.
 *
 * Responses announced as Atom or RSS feeds are not parsed. Within every element
 * whose class contains "vcard", the descendants classed fn, photo/avatar and
 * nickname/uid are collected; for photos the one with the largest width
 * attribute wins.
 *
 * @param string $url Address of the profile page.
 *
 * @return array Possibly empty array with any of the keys 'fn', 'photo', 'nick'.
 */
function scrape_vcard($url)
{
    $a = get_app();

    $ret = array();

    logger('scrape_vcard: url=' . $url);

    $s = fetch_url($url);
    if (!$s) {
        return $ret;
    }

    $headers = $a->get_curl_headers();
    foreach (explode("\n", $headers) as $line) {
        // don't try and run feeds through the html5 parser
        if (stristr($line, 'content-type:') && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
            // Was "return ret;" - a bare constant, not the result array.
            return $ret;
        }
    }

    // Pre-set so that a caught parse exception leaves a defined, falsy value.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_vcard: parse error: ' . $e);
    }

    if (!$dom) {
        return $ret;
    }

    // Pull out hCard profile elements
    $largest_photo = 0;

    foreach ($dom->getElementsByTagName('*') as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vcard')) {
            continue;
        }

        foreach ($item->getElementsByTagName('*') as $x) {
            $class = $x->getAttribute('class');

            if (attribute_contains($class, 'fn')) {
                $ret['fn'] = $x->textContent;
            }

            if (attribute_contains($class, 'photo') || attribute_contains($class, 'avatar')) {
                $size = intval($x->getAttribute('width'));
                // Also accept the candidate while no sized photo has been seen yet.
                if ($size > $largest_photo || !$largest_photo) {
                    $ret['photo'] = $x->getAttribute('src');
                    $largest_photo = $size;
                }
            }

            if (attribute_contains($class, 'nickname') || attribute_contains($class, 'uid')) {
                $ret['nick'] = $x->textContent;
            }
        }
    }

    return $ret;
}
<?php

require_once dirname(__FILE__) . '/../library/HTML5/Parser.php';

// Input comes from the file named by the first command line argument, or from
// standard input when no argument was given.
$argv = $_SERVER['argv'];
$file = isset($argv[1]) ? $argv[1] : 'php://stdin';

$result = HTML5_Parser::parse(file_get_contents($file));
// nop
/**
 * Find the Atom / RSS feed addresses advertised by a URL.
 *
 * If the response itself is a feed (judged by its content-type header or by a
 * "<feed" / "<rss" marker in the body) the URL is returned as the feed.
 * Otherwise the page is parsed and the first <link rel="alternate"> of each feed
 * type is returned; addresses without "://" are prefixed with the <base href>
 * or, failing that, with the URL up to its last "/".
 *
 * @param string $url Address to inspect.
 *
 * @return array Possibly empty array with the keys 'feed_atom' and/or 'feed_rss'.
 */
function scrape_feed($url)
{
    $a = get_app();

    $ret = array();

    $s = fetch_url($url);
    if (!$s) {
        return $ret;
    }

    $headers = $a->get_curl_headers();
    logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);

    foreach (explode("\n", $headers) as $line) {
        if (!stristr($line, 'content-type:')) {
            continue;
        }
        if (stristr($line, 'application/atom+xml') || stristr($s, '<feed')) {
            $ret['feed_atom'] = $url;
            return $ret;
        }
        if (stristr($line, 'application/rss+xml') || stristr($s, '<rss')) {
            $ret['feed_rss'] = $url;
            return $ret;
        }
    }

    // Pre-set so that a caught parse exception leaves a defined, falsy value.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_feed: parse error: ' . $e);
    }

    if (!$dom) {
        return $ret;
    }

    // Defined up front: most pages carry no <base> element at all.
    $basename = '';
    foreach ($dom->getElementsByTagName('base') as $head0) {
        $basename = $head0->getAttribute('href');
        break;
    }
    if (!$basename) {
        $basename = substr($url, 0, strrpos($url, '/')) . '/';
    }

    // get Atom/RSS link elements, take the first one of either.
    foreach ($dom->getElementsByTagName('link') as $item) {
        if ($item->getAttribute('rel') !== 'alternate') {
            continue;
        }
        $type = $item->getAttribute('type');
        if ($type === 'application/atom+xml' && !x($ret, 'feed_atom')) {
            $ret['feed_atom'] = $item->getAttribute('href');
        }
        if ($type === 'application/rss+xml' && !x($ret, 'feed_rss')) {
            $ret['feed_rss'] = $item->getAttribute('href');
        }
    }

    // Drupal and perhaps others only provide relative URL's. Turn them into absolute.
    foreach (array('feed_atom', 'feed_rss') as $key) {
        if (x($ret, $key) && !strstr($ret[$key], '://')) {
            $ret[$key] = $basename . $ret[$key];
        }
    }

    return $ret;
}
/**
 * Shorten an HTML fragment.
 *
 * All tags except <img> are stripped, the remainder is parsed, and every node
 * that walk() reports for the given limit is removed from the document before
 * it is serialised again.
 *
 * @param string $html   HTML fragment to shorten.
 * @param int    $maxLen Limit handed to walk().
 *
 * @return string Serialised HTML of the shortened document.
 */
public static function trim($html, $maxLen = 25)
{
    $stripped = strip_tags($html, '<img/>');

    require_once get_template_directory() . '/core/inc/HTML5/Parser.php';

    $dom = HTML5_Parser::parse($stripped);
    $walker = new static();
    $overLimit = $walker->walk($dom, $maxLen);

    // remove any nodes that passed our limit
    foreach ($overLimit as $node) {
        $node->parentNode->removeChild($node);
    }

    // remove wrapper tags added by DD (doctype, html...)
    if (version_compare(PHP_VERSION, '5.3.6', '>=')) {
        return $dom->saveHTML($dom->getElementsByTagName('body')->item(0));
    }

    // Older PHP: saveHTML() takes no node argument, so unwrap by hand.
    $dom->removeChild($dom->firstChild);
    $dom->replaceChild($dom->firstChild->firstChild->firstChild, $dom->firstChild);

    return $dom->saveHTML();
}
/**
 * Main shortcode function
 *
 * Resolves the attachments selected by the shortcode attributes (by media tag,
 * by explicit ID list, or as children of a post), prepares the per-attachment
 * template parameters and renders the gallery through the chosen template.
 * Depending on 'output_type' the result is HTML, a JSON string, or an array of
 * objects / arrays.
 *
 * The template file is included in this function's scope, once up front to
 * collect its overrides and once per attachment to render it, so local
 * variables here are visible to templates.
 *
 * @since 0.1
 *
 * @param mixed $content Shortcode attributes when the function is called
 *                       directly (anything wp_parse_args() accepts), else false.
 * @param mixed $attr    Shortcode attributes when invoked as a shortcode.
 *
 * @return mixed Gallery markup or data, a cached copy of it, the excerpt
 *               replacement text, or an HTML comment when nothing was found.
 */
function file_gallery_shortcode($content = false, $attr = false)
{
    global $file_gallery, $wpdb, $post;

    require_once 'html5lib/Parser.php';

    // if the function is called directly, not via shortcode
    if (false !== $content && false === $attr) {
        $attr = wp_parse_args($content);
    }

    if (!isset($file_gallery->gallery_id)) {
        $file_gallery->gallery_id = 1;
    } else {
        $file_gallery->gallery_id++;
    }

    $options = get_option('file_gallery');

    // The cache lookup runs before the defaults are applied, so an absent
    // 'output_type' has to be read as the default ('html') here. $transient stays
    // null whenever this request must not be cached.
    $transient = null;

    if (isset($options['cache']) && true == $options['cache']) {
        $requested_output_type = (is_array($attr) && isset($attr['output_type'])) ? $attr['output_type'] : 'html';

        if ('html' == $requested_output_type || isset($options['cache_non_html_output']) && true == $options['cache_non_html_output']) {
            $transient = 'filegallery_' . md5($post->ID . "_" . serialize($attr));
            $cache = get_transient($transient);

            if ($cache) {
                return $cache;
            }
        }
    }

    // if option to show galleries in excerpts is set to false...
    // ...replace [gallery] with user selected text
    if (!is_singular() && (!isset($options['in_excerpt']) || true != $options['in_excerpt'])) {
        return $options['in_excerpt_replace_content'];
    }

    $default_templates = unserialize(FILE_GALLERY_DEFAULT_TEMPLATES);

    // We're trusting author input, so let's at least make sure it looks like a valid orderby statement
    if (isset($attr['orderby'])) {
        $attr['orderby'] = sanitize_sql_orderby($attr['orderby']);

        if (!$attr['orderby']) {
            unset($attr['orderby']);
        }
    }

    $defaults = array(
        'order' => 'ASC',
        'orderby' => '',
        'id' => $post->ID,
        'columns' => 3,
        'size' => 'thumbnail',
        'link' => 'attachment',
        'include' => '',
        'ids' => '',
        'exclude' => '',
        'template' => 'default',
        'linkclass' => '',
        'imageclass' => '',
        'galleryclass' => '',
        'rel' => 1,
        'tags' => '',
        'tags_from' => 'current',
        'output_type' => 'html',
        'output_params' => 1,
        'attachment_ids' => '',
        'mimetype' => '',
        'limit' => -1,
        'offset' => -1,
        'paginate' => 0,
        'link_size' => 'full',
        'include_meta' => false,
    );

    if (floatval(get_bloginfo('version')) >= 3.5) {
        $defaults['link'] = 'post';
    }

    // extract the defaults...
    extract(shortcode_atts($defaults, $attr));

    if (!in_array($template, $default_templates)) {
        $template_file = FILE_GALLERY_THEME_TEMPLATES_ABSPATH . '/' . $template . '/gallery.php';

        if (!is_readable($template_file)) {
            $template_file = FILE_GALLERY_CONTENT_TEMPLATES_ABSPATH . '/' . $template . '/gallery.php';
        }
    } else {
        if ('default' == $template) {
            $template_file = FILE_GALLERY_DEFAULT_TEMPLATE_ABSPATH . '/gallery.php';
            $template = FILE_GALLERY_DEFAULT_TEMPLATE_NAME;
        } else {
            $template_file = FILE_GALLERY_ABSPATH . '/templates/' . $template . '/gallery.php';
        }
    }

    // check if template exists and replace with default if it does not
    if (!is_readable($template_file)) {
        $template_file = FILE_GALLERY_ABSPATH . '/templates/default/gallery.php';
        $template = 'default';
    }

    // get overriding variables from the template file
    $overriding = true;
    ob_start();
    include $template_file;
    ob_end_clean();
    $overriding = false;

    if (is_array($file_gallery->overrides) && !empty($file_gallery->overrides)) {
        extract($file_gallery->overrides);
        $file_gallery->overrides = NULL;
    }

    $limit = (int) $limit;
    $offset = (int) $offset;
    $page = (int) get_query_var('page');

    if ('false' === $rel || is_numeric($rel) && 0 === (int) $rel) {
        $_rel = false;
    } elseif (1 === $rel) {
        $_rel = true;
    } else {
        $_rel = $rel;
    }

    if ('false' === $output_params || is_numeric($output_params) && 0 === (int) $output_params) {
        $output_params = false;
    } else {
        $output_params = true;
    }

    if ('false' === $paginate || is_numeric($paginate) && 0 === (int) $paginate || 0 > $limit) {
        $paginate = false;
        $found_rows = '';
    } else {
        $paginate = true;
        $found_rows = 'SQL_CALC_FOUND_ROWS';

        if (0 === $page) {
            $page = 1;
        }

        if (is_singular() && 1 < $page) {
            $offset = $limit * ($page - 1);
        }
    }

    $file_gallery->debug_add('pagination', compact('paginate', 'page'));

    // Merge the three ways of naming attachments into one ID list. The list ends
    // up inside raw SQL below, so every entry is forced to an integer and empty
    // or non-numeric entries are dropped.
    $_attachment_ids = explode(',', trim($attachment_ids, ','));
    $_include = explode(',', trim($include, ','));
    $_ids = explode(',', trim($ids, ','));

    $attachment_ids = array_merge($_attachment_ids, $_include, $_ids);
    $attachment_ids = array_unique(array_filter(array_map('intval', $attachment_ids)));
    $attachment_ids = implode(',', $attachment_ids);

    if (!isset($linkto)) {
        $linkto = $link;
    }

    $sql_mimetype = '';

    if ('' != $mimetype) {
        $mimetype = file_gallery_get_mime_type($mimetype);
        $sql_mimetype = wp_post_mime_type_where($mimetype);
    }

    $approved_attachment_post_statuses = apply_filters('file_gallery_approved_attachment_post_statuses', array('inherit'));
    $ignored_attachment_post_statuses = apply_filters('file_gallery_ignored_attachment_post_statuses', array('trash', 'private', 'pending', 'future'));

    if (!empty($approved_attachment_post_statuses)) {
        $post_statuses = " AND (post_status IN ('" . implode("', '", $approved_attachment_post_statuses) . "') ) ";
    } elseif (!empty($ignored_attachment_post_statuses)) {
        $post_statuses = " AND (post_status NOT IN ('" . implode("', '", $ignored_attachment_post_statuses) . "') ) ";
    } else {
        $post_statuses = "";
    }

    $file_gallery_query = new stdClass();

    // start with tags because they negate everything else
    if ('' != $tags) {
        if ('' == $orderby || 'file_gallery' == $orderby) {
            $orderby = "menu_order ID";
        }

        $query = array(
            'post_status' => implode(',', $approved_attachment_post_statuses),
            'post_type' => 'attachment',
            'order' => $order,
            'orderby' => $orderby,
            'posts_per_page' => $limit,
            'post_mime_type' => $mimetype,
            FILE_GALLERY_MEDIA_TAG_NAME => $tags,
        );

        if ('current' == $tags_from) {
            $query['post_parent'] = $id;
        }

        if (!empty($exclude)) {
            $query['post__not_in'] = explode(',', preg_replace('/[^0-9,]+/', '', $exclude));
        }

        if (0 < $offset) {
            $query['offset'] = $offset;
        }

        $file_gallery_query = new WP_Query($query);
        $attachments = $file_gallery_query->posts;

        unset($query);
    } elseif ('' != $attachment_ids) {
        $attachment_ids = explode(',', $attachment_ids);
        $sql_limit = count($attachment_ids);

        if ('rand' == $orderby) {
            shuffle($attachment_ids);
        }

        $attachment_ids = implode(',', $attachment_ids);

        if ('' == $orderby || 'rand' == $orderby || $orderby == 'post__in') {
            // Keep the rows in the order of the (possibly shuffled) ID list.
            $orderby = sprintf("FIELD(ID, %s)", $attachment_ids);
            $order = '';
        } elseif ('title' == $orderby) {
            $orderby = "post_title";
        }

        $query = sprintf("SELECT " . $found_rows . " * FROM {$wpdb->posts} \r\n\t\t\t WHERE ID IN (%s) \r\n\t\t\t AND post_type = 'attachment' \r\n\t\t\t" . $post_statuses . " ", $attachment_ids);

        $query .= $sql_mimetype;
        $query .= sprintf(" ORDER BY %s %s ", $orderby, $order);

        if (true !== $paginate) {
            $limit = $sql_limit;
        }
    } else {
        if ('' == $orderby) {
            $orderby = "menu_order ID";
        }

        $query = array(
            'post_parent' => $id,
            'post_status' => implode(',', $approved_attachment_post_statuses),
            'post_type' => 'attachment',
            'order' => $order,
            'orderby' => $orderby,
            'posts_per_page' => $limit,
            'post_mime_type' => $mimetype,
        );

        if (!empty($exclude)) {
            $query['post__not_in'] = explode(',', preg_replace('/[^0-9,]+/', '', $exclude));
        }

        if (0 < $offset) {
            $query['offset'] = $offset;
        }

        $file_gallery_query = new WP_Query($query);
        $attachments = $file_gallery_query->posts;

        unset($query);
    }

    // Only the explicit-ID branch leaves a raw SQL string behind in $query.
    if (isset($query)) {
        if (0 < $limit) {
            $query .= " LIMIT " . $limit;
        }

        if (0 < $offset) {
            $query .= " OFFSET " . $offset;
        }

        $attachments = $wpdb->get_results($query);

        if ('' != $found_rows) {
            $file_gallery_query->found_posts = $wpdb->get_var("SELECT FOUND_ROWS()");
            // A limit of 0 adds no LIMIT clause, i.e. everything is on one page;
            // it must not be used as a divisor.
            $file_gallery_query->max_num_pages = 0 < $limit ? ceil($file_gallery_query->found_posts / $limit) : 1;
        }
    }

    $file_gallery->debug_add('attachments_query', compact('file_gallery_query'));

    if (empty($attachments)) {
        return '<!-- "File Gallery" plugin says: - No attachments found for the following shortcode arguments: "' . json_encode($attr) . '" -->';
    }

    // feed
    if (is_feed()) {
        $output = "\n";

        foreach ($attachments as $attachment) {
            $output .= wp_get_attachment_link($attachment->ID, $size, true) . "\n";
        }

        return $output;
    }

    // $i counts the rendered gallery items and drives the column classes.
    $i = 0;
    $unique_ids = array();
    $gallery_items = '';

    if ('object' == $output_type || 'array' == $output_type) {
        $gallery_items = array();
    }

    $autoqueueclasses = array();

    if (defined('FILE_GALLERY_LIGHTBOX_CLASSES')) {
        $autoqueueclasses = maybe_unserialize(FILE_GALLERY_LIGHTBOX_CLASSES);
    } else {
        $autoqueueclasses = explode(',', $options['auto_enqueued_scripts']);
    }

    $file_gallery_this_template_counter = 1;

    // create output
    foreach ($attachments as $attachment) {
        $param = array(
            'image_class' => $imageclass,
            'link_class' => $linkclass,
            'rel' => $_rel,
            'title' => '',
            'caption' => '',
            'description' => '',
            'thumb_alt' => '',
        );

        $attachment_file = get_attached_file($attachment->ID);
        $attachment_is_image = file_gallery_file_is_displayable_image($attachment_file);
        $startcol = '';
        $endcol = '';
        $x = '';

        if ($output_params) {
            // Link classes that belong to an auto-enqueued lightbox script.
            $plcai = array_intersect($autoqueueclasses, explode(' ', trim($linkclass)));

            if (!empty($plcai)) {
                if ($attachment_is_image) {
                    if (true === $param['rel']) {
                        // array_intersect() preserves keys, so the first match is
                        // not necessarily stored under index 0.
                        $param['rel'] = reset($plcai) . '[' . $file_gallery->gallery_id . ']';
                    } elseif (!is_bool($param['rel'])) {
                        if (false !== strpos($_rel, '$GID$')) {
                            $param['rel'] = str_replace('$GID$', $file_gallery->gallery_id, $_rel);
                        } else {
                            $param['rel'] = $_rel . '[' . $file_gallery->gallery_id . ']';
                        }
                    }

                    $filter_args = array(
                        'gallery_id' => $file_gallery->gallery_id,
                        'linkrel' => $param['rel'],
                        'linkclass' => $param['link_class'],
                        'imageclass' => $param['image_class'],
                    );

                    if ($param['rel']) {
                        $param['rel'] = apply_filters('file_gallery_lightbox_linkrel', $param['rel'], 'linkrel', $filter_args);
                    }

                    $param['link_class'] = apply_filters('file_gallery_lightbox_linkclass', $param['link_class'], 'linkclass', $filter_args);
                    $param['image_class'] = apply_filters('file_gallery_lightbox_imageclass', $param['image_class'], 'imageclass', $filter_args);
                } else {
                    $param['link_class'] = str_replace(trim(implode(' ', $plcai)), '', trim($linkclass));
                }
            }

            // if rel is still true or false
            if (is_bool($param['rel'])) {
                $param['rel'] = '';
            }

            switch ($linkto) {
                case 'parent_post':
                    $param['link'] = get_permalink($wpdb->get_var("SELECT post_parent FROM {$wpdb->posts} WHERE ID = '" . (int) $attachment->ID . "'"));
                    break;
                case 'file':
                    $param['link'] = wp_get_attachment_url($attachment->ID);
                    break;
                case 'attachment':
                case 'post':
                    $param['link'] = get_attachment_link($attachment->ID);
                    break;
                case 'none':
                    $param['link'] = '';
                    break;
                default:
                    // external url
                    $param['link'] = urldecode($linkto);
                    break;
            }

            $param['title'] = $attachment->post_title;
            $param['caption'] = $attachment->post_excerpt;
            $param['description'] = $attachment->post_content;

            if ($attachment_is_image) {
                $thumb_src = wp_get_attachment_image_src($attachment->ID, $size);
                $param['thumb_link'] = $thumb_src[0];
                $param['thumb_width'] = 0 == $thumb_src[1] ? file_gallery_get_image_size($param['thumb_link']) : $thumb_src[1];
                $param['thumb_height'] = 0 == $thumb_src[2] ? file_gallery_get_image_size($param['thumb_link'], true) : $thumb_src[2];

                if ('' != $param['link'] && 'full' != $link_size && in_array($link_size, file_gallery_get_intermediate_image_sizes())) {
                    $full_src = wp_get_attachment_image_src($attachment->ID, $link_size);
                    $param['link'] = $full_src[0];
                }
            } else {
                $param['thumb_link'] = wp_mime_type_icon($attachment->ID);
                $param['thumb_link'] = apply_filters('file_gallery_non_image_thumb_link', $param['thumb_link'], $attachment->post_mime_type, $attachment->ID);
                $param['thumb_width'] = '46';
                $param['thumb_height'] = '60';
            }

            if ($thumb_alt = get_post_meta($attachment->ID, '_wp_attachment_image_alt', true)) {
                $param['thumb_alt'] = $thumb_alt;
            }

            $param['attachment_id'] = $attachment->ID;
        }

        /**
         * Make sure that all attributes added/filtered via
         * 'wp_get_attachment_link' filter are included here as well
         */
        $dom_document = HTML5_Parser::parse(wp_get_attachment_link($attachment->ID));
        $fg_link_anchor = $dom_document ? $dom_document->getElementsByTagName("a")->item(0) : null;

        // The generated markup may contain no <a> at all; skip the merge then.
        // A dedicated loop variable is used so the item counter $i stays intact.
        if ($fg_link_anchor) {
            $wp_attachment_link_attributes = $fg_link_anchor->attributes;

            foreach ($wp_attachment_link_attributes as $fg_link_attribute) {
                if ($fg_link_attribute->name === 'title') {
                    $param['title'] = $fg_link_attribute->value;
                } elseif ($fg_link_attribute->name === 'class') {
                    $param['link_class'] .= ' ' . $fg_link_attribute->value;
                } elseif ($fg_link_attribute->name === 'rel') {
                    $param['rel'] .= ' ' . $fg_link_attribute->value;
                }
            }
        }

        $param = array_map('trim', $param);

        if ($include_meta) {
            $meta = get_post_custom($attachment->ID);
        }

        if ('object' == $output_type) {
            if ($output_params) {
                $attachment->params = (object) $param;
            }

            if ($include_meta) {
                $attachment->meta = (object) $meta;
            }

            $gallery_items[] = $attachment;
        } elseif ('array' == $output_type || 'json' == $output_type) {
            if ($output_params) {
                $attachment->params = $param;
            }

            if ($include_meta) {
                $attachment->meta = $meta;
            }

            $gallery_items[] = get_object_vars($attachment);
        } else {
            if ($columns > 0) {
                if (0 === $i || 0 === $i % $columns) {
                    $startcol = ' gallery-startcol';
                } elseif (($i + 1) % $columns == 0) {
                    // add the column break class
                    $endcol = ' gallery-endcol';
                }
            }

            // parse template
            ob_start();
            extract($param);
            include $template_file;
            $x = ob_get_contents();
            ob_end_clean();

            $file_gallery_this_template_counter++;

            // Close the row after its last column. Parentheses are required
            // because % binds tighter than +.
            if ($columns > 0 && ($i + 1) % $columns == 0) {
                $x .= $cleartag;
            }

            $gallery_items .= $x;
            $i++;
        }
    }

    // handle data types
    if ('object' == $output_type || 'array' == $output_type) {
        $output = $gallery_items;
    } elseif ('json' == $output_type) {
        $output = json_encode($gallery_items);
    } else {
        $stc = '';
        $cols = '';
        $pagination_html = '';

        if (0 < (int) $columns) {
            $cols = ' columns_' . $columns;
        }

        if (isset($starttag_class) && '' != $starttag_class) {
            $stc = ' ' . $starttag_class;
        }

        $trans_append = "\n<!-- file gallery output cached on " . date('Y.m.d @ H:i:s', time()) . "-->\n";

        if (is_singular() && isset($file_gallery_query->max_num_pages) && 1 < $file_gallery_query->max_num_pages) {
            $pagination_html = file_gallery_do_pagination($file_gallery_query->max_num_pages, $page);
        }

        $gallery_class = apply_filters('file_gallery_galleryclass', 'gallery ' . str_replace(' ', '-', $template) . $cols . $stc . ' ' . $galleryclass);

        $output = '<' . $starttag . ' id="gallery-' . $file_gallery->gallery_id . '" class="' . $gallery_class . '">' . "\n" . $gallery_items . "\n" . $pagination_html . "\n</" . $starttag . '>';
    }

    // Store the result only when a cache key was established at the top.
    if (null !== $transient && isset($options['cache']) && true == $options['cache']) {
        if ('html' == $output_type) {
            set_transient($transient, $output . $trans_append, $options['cache_time']);
        } elseif (isset($options['cache_non_html_output']) && true == $options['cache_non_html_output']) {
            set_transient($transient, $output, $options['cache_time']);
        }
    }

    return apply_filters('file_gallery_output', $output, $post->ID, $file_gallery->gallery_id);
}
/**
 * Locate the Atom and/or RSS feed exposed by a URL.
 *
 * The URL is fetched with z_fetch_url(). When the response itself looks like a
 * feed (judged from its Content-Type header line plus body markers) the URL is
 * reported as the feed. Otherwise the body is parsed as HTML and the first
 * <link rel="alternate"> element of each feed type is used; relative hrefs are
 * made absolute against the document's <base href> or, failing that, the
 * scheme and host of $url.
 *
 * @param string $url address to probe
 * @return array may contain 'feed_atom' and/or 'feed_rss'; empty when the
 *               fetch fails, no body is returned, or the HTML cannot be parsed
 */
function scrape_feed($url) {
    $a = get_app();
    $ret = array();
    $level = 0;

    $x = z_fetch_url($url, false, $level, array('novalidate' => true));
    if (!$x['success']) {
        return $ret;
    }

    $headers = $x['header'];
    $code = $x['return_code'];
    $s = $x['body'];

    logger('scrape_feed: returns: ' . $code . ' headers=' . $headers, LOGGER_DEBUG);

    if (!$s) {
        logger('scrape_feed: no data returned for ' . $url);
        return $ret;
    }

    // explode() always yields at least one element, so no emptiness check is needed.
    foreach (explode("\n", $headers) as $line) {
        if (!stristr($line, 'content-type:')) {
            continue;
        }
        if (stristr($line, 'application/atom+xml') || stristr($s, '<feed')) {
            $ret['feed_atom'] = $url;
            return $ret;
        }
        if (stristr($line, 'application/rss+xml') || stristr($s, '<rss')) {
            $ret['feed_rss'] = $url;
            return $ret;
        }
    }

    // perhaps an RSS version 1 feed with a generic or incorrect content-type?
    if (stristr($s, '</item>')) {
        $ret['feed_rss'] = $url;
        return $ret;
    }

    // Initialise first: if the parser throws, $dom would otherwise be undefined below.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_feed: parse error: ' . $e);
    }

    if (!$dom) {
        logger('scrape_feed: failed to parse.');
        return $ret;
    }

    // Initialise first: pages without a <base> element never assign it in the loop.
    $basename = '';
    $bases = $dom->getElementsByTagName('base');
    if ($bases) {
        foreach ($bases as $base) {
            $basename = $base->getAttribute('href');
            break;
        }
    }

    if (!$basename) {
        // Fall back to "scheme://host/" taken from the requested URL.
        $basename = implode('/', array_slice(explode('/', $url), 0, 3)) . '/';
    }

    // get Atom/RSS link elements, take the first one of either.
    $items = $dom->getElementsByTagName('link');
    if ($items) {
        foreach ($items as $item) {
            if ($item->getAttribute('rel') !== 'alternate') {
                continue;
            }
            $type = $item->getAttribute('type');
            if ($type === 'application/atom+xml' && !x($ret, 'feed_atom')) {
                $ret['feed_atom'] = $item->getAttribute('href');
            }
            if ($type === 'application/rss+xml' && !x($ret, 'feed_rss')) {
                $ret['feed_rss'] = $item->getAttribute('href');
            }
        }
    }

    // Drupal and perhaps others only provide relative URLs. Turn them into
    // absolute ones, joining with exactly one slash so that "/feed" does not
    // become "http://host//feed" and a slash-less base does not fuse with the path.
    foreach (array('feed_atom', 'feed_rss') as $key) {
        if (x($ret, $key) && !strstr($ret[$key], '://')) {
            $ret[$key] = rtrim($basename, '/') . '/' . ltrim($ret[$key], '/');
        }
    }

    return $ret;
}
/**
 * Scrape DFRN-related data from a profile page.
 *
 * Fetches $url (with "tab=profile" appended when absent), parses it, and
 * collects three kinds of data: selected <meta> values, every
 * <link rel="dfrn-..."> href, and hCard fields found under elements whose class
 * contains "vcard". Each of the three passes inspects at most
 * max($max_nodes, 100) nodes.
 *
 * @param string $url       profile URL
 * @param int    $max_nodes per-pass node budget (a floor of 100 is applied)
 * @return array scraped values plus '_timings' (fetch/scrape in milliseconds);
 *               an empty array when nothing was fetched or parsing failed
 */
function scrape_dfrn($url, $max_nodes = 3500) {
    // Floor for the per-pass node budget.
    $minNodes = 100;
    // Fetch timeout handed to fetch_url().
    $timeout = 10;

    // Ask for the profile tab directly unless the URL already does so.
    if (strpos($url, 'tab=profile') === false) {
        if (strpos($url, '?') > 0) {
            $separator = '&';
        } else {
            $separator = '?';
        }
        $url .= $separator . 'tab=profile';
    }

    $scrape_start = microtime(true);
    $ret = array();
    $s = fetch_url($url, $timeout);
    $scrape_fetch_end = microtime(true);

    if (!$s) {
        return $ret;
    }

    $dom = HTML5_Parser::parse($s);
    if (!$dom) {
        return $ret;
    }

    // $max_nodes is never modified, so the budget is the same for all passes.
    $node_budget = max(intval($max_nodes), $minNodes);

    // Pass 1: <meta> elements.
    // NOTE(review): pop_scrape_target() is defined elsewhere; presumably it
    // removes the named entry from $targets and returns how many remain.
    $targets = array('hide', 'comm', 'tags');
    $targets_left = count($targets);
    $nodes_left = $node_budget;
    foreach ($dom->getElementsByTagName('meta') as $meta) {
        switch ($meta->getAttribute('name')) {
            case 'dfrn-global-visibility':
                $visibility = strtolower(trim($meta->getAttribute('content')));
                if ($visibility != 'true') {
                    $ret['hide'] = 1;
                }
                if ($visibility === 'false') {
                    $ret['explicit-hide'] = 1;
                }
                $targets_left = pop_scrape_target($targets, 'hide');
                break;

            case 'friendika.community':
            case 'friendica.community':
                $community = strtolower(trim($meta->getAttribute('content')));
                if ($community == 'true') {
                    $ret['comm'] = 1;
                }
                $targets_left = pop_scrape_target($targets, 'comm');
                break;

            case 'keywords':
                $keywords = str_replace(',', ' ', strtolower(trim($meta->getAttribute('content'))));
                if (strlen($keywords)) {
                    $ret['tags'] = $keywords;
                }
                $targets_left = pop_scrape_target($targets, 'tags');
                break;
        }

        $nodes_left--;
        if ($nodes_left <= 0 || $targets_left <= 0) {
            break;
        }
    }

    // Pass 2: <link> elements whose rel starts with "dfrn-".
    $nodes_left = $node_budget;
    foreach ($dom->getElementsByTagName('link') as $link) {
        $rel = $link->getAttribute('rel');
        if (substr($rel, 0, 5) == "dfrn-") {
            $ret[$rel] = $link->getAttribute('href');
        }

        $nodes_left--;
        if ($nodes_left <= 0) {
            break;
        }
    }

    // Pass 3: hCard profile elements.
    // Maps the class token to look for onto the result key it fills; the order
    // mirrors the order in which the classes are tested.
    $hcard_fields = array(
        'fn'           => 'fn',
        'title'        => 'pdesc',
        'photo'        => 'photo',
        'key'          => 'key',
        'locality'     => 'locality',
        'region'       => 'region',
        'postal-code'  => 'postal-code',
        'country-name' => 'country-name',
        'x-gender'     => 'gender',
    );

    $nodes_left = $node_budget;
    $targets = array('fn', 'pdesc', 'photo', 'key', 'locality', 'region', 'postal-code', 'country-name', 'gender', 'marital');
    $targets_left = count($targets);
    foreach ($dom->getElementsByTagName('*') as $node) {
        $node_class = $node->getAttribute('class');

        if (attribute_contains($node_class, 'vcard')) {
            foreach ($node->getElementsByTagName('*') as $child) {
                $child_class = $child->getAttribute('class');
                foreach ($hcard_fields as $class_token => $field) {
                    if (!attribute_contains($child_class, $class_token)) {
                        continue;
                    }
                    // The photo comes from the src attribute; everything else is text.
                    if ($field === 'photo') {
                        $ret[$field] = $child->getAttribute('src');
                    } else {
                        $ret[$field] = $child->textContent;
                    }
                    $targets_left = pop_scrape_target($targets, $field);
                }
            }
        }

        // Marital status is looked up on any element, not only inside a vcard.
        if (attribute_contains($node_class, 'marital-text')) {
            $ret['marital'] = $node->textContent;
            $targets_left = pop_scrape_target($targets, 'marital');
        }

        $nodes_left--;
        if ($nodes_left <= 0 || $targets_left <= 0) {
            break;
        }
    }

    $scrape_end = microtime(true);
    $ret['_timings'] = array(
        'fetch'  => round(($scrape_fetch_end - $scrape_start) * 1000),
        'scrape' => round(($scrape_end - $scrape_fetch_end) * 1000),
    );

    return $ret;
}
/**
 * Create instance of Readability
 *
 * Stores the URL, derives a same-domain regular expression from its host,
 * applies the configured pre-filters to the raw HTML, optionally runs the
 * markup through tidy, and finally parses it into $this->dom.
 *
 * @param string  $html     UTF-8 encoded string
 * @param string  $url      (optional) URL associated with HTML (for footnotes)
 * @param string  $parser   (optional) Which parser to use for turning raw HTML into a DOMDocument
 * @param boolean $use_tidy (optional) Use tidy
 */
function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) {
    $this->url = $url;
    $this->debugText = 'Parsing URL: ' . $url . "\n";

    if ($url) {
        // PHP_URL_HOST yields null/false instead of raising a warning when the
        // URL has no host or is malformed; the cast keeps the old '' fallback.
        $host = (string) parse_url($url, PHP_URL_HOST);
        $host = preg_replace('/www\\d*\\./', '', $host);
        // preg_quote() escapes every regex metacharacter, not only the dot
        // (e.g. the brackets of an IPv6 literal).
        $this->domainRegExp = '/' . preg_quote($host, '/') . '/';
    }

    mb_internal_encoding("UTF-8");
    mb_http_output("UTF-8");
    mb_regex_encoding("UTF-8");

    $this->imageCache = new ImageCaching();

    // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
    if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
        try {
            foreach ($this->pre_filters as $search => $replace) {
                $filtered = preg_replace($search, $replace, $html);
                if ($filtered === null) {
                    // preg_replace() signals failure by returning null rather
                    // than throwing; keep the unfiltered markup instead of
                    // discarding the whole document.
                    $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $search . "\n";
                    continue;
                }
                $html = $filtered;
            }
        } catch (Exception $e) {
            // Only reached when an error handler turns warnings into exceptions.
            $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $e->getMessage();
        }
    }

    if (trim($html) === '') {
        $html = '<html></html>';
    }

    /**
     * Use tidy (if it exists).
     * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
     * Although sometimes it makes matters worse, which is why there is an option to disable it.
     */
    if ($use_tidy && function_exists('tidy_parse_string')) {
        $this->debugText .= 'Tidying document' . "\n";
        $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
        // tidy_parse_string() may return false; only repair a real tidy object.
        if ($tidy && tidy_clean_repair($tidy)) {
            $this->tidied = true;
            $html = $tidy->value;
            // Drop attributes from the <html> tag and collapse runs of line breaks.
            $html = preg_replace('/<html[^>]+>/i', '<html>', $html);
            $html = preg_replace('/[\\r\\n]+/is', "\n", $html);
        }
        unset($tidy);
    }

    // Non-ASCII characters become entities before the markup is parsed.
    // NOTE(review): the 'HTML-ENTITIES' mbstring encoding is deprecated in
    // recent PHP releases - confirm the supported PHP range before replacing it.
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");

    if ($parser == 'html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
        // all good
    } else {
        // Either libxml was requested or html5lib produced nothing: fall back to DOMDocument.
        libxml_use_internal_errors(true);
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
/**
 * Replace WordPress media elements in an article with figure markup.
 *
 * Every <img>, <audio>, <video> and <object> element is looked up via
 * cc_wordpress_figure() using the attachment id and size encoded in its class
 * attribute (e.g. "wp-image-18 size-medium"). When that call returns markup,
 * the element is replaced by the first <figure> found in it.
 *
 * @param string $article article HTML
 * @return string the serialized content of the resulting <body>
 */
function cc_wordpress_article_filter($article) {
    require_once 'lib/html5lib/Parser.php';

    // sorry, but parseFragment() returns a DomNodeList, which is as inflexible as it gets
    $dom = HTML5_Parser::parse($article);

    $tagnames = array('img', 'audio', 'video', 'object');
    foreach ($tagnames as $tagname) {
        // getElementsByTagName() returns a live list; replacing nodes while
        // iterating it skips elements, so work on a snapshot instead.
        $elements = iterator_to_array($dom->getElementsByTagName($tagname));

        foreach ($elements as $element) {
            if (!$element->parentNode) {
                continue;
            }

            $class = $element->getAttribute('class');

            // relevant class name example: wp-image-18
            $id = preg_match('/wp-(image|audio|video|object)-([0-9]*)/', $class, $matches)
                ? $matches[2]
                : null;

            // relevant class name example: size-medium
            // \S+ stops at the next class token; a greedy ".*" would swallow
            // every class that follows the size.
            $size = preg_match('/size-(\S+)/', $class, $matches)
                ? $matches[1]
                : null;

            // TODO: make cc_wordpress_figure() take and return a DOM fragment
            $figure_html = cc_wordpress_figure($id, $size, false);

            // only replace node if we actually got something
            if (!$figure_html) {
                continue;
            }

            $fragment = HTML5_Parser::parseFragment($figure_html);
            $container = $fragment ? $fragment->item(0) : null;
            if (!($container instanceof DOMElement)) {
                continue;
            }
            $figure = $container->getElementsByTagName('figure')->item(0);
            if (!$figure) {
                continue;
            }

            // a document context change is needed before appending the node
            $figure = $dom->importNode($figure, true);
            $element->parentNode->replaceChild($figure, $element);
        }
    }

    // hackish but reliable way to serialize the DOM
    // TODO: fix this mess
    $XML = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    $XML = str_replace('<body>', '', $XML);
    $XML = str_replace('</body>', '', $XML);

    // work around a bug regarding <style> elements including CSS '>' selectors:
    // the XML serializer escapes '>' to '&gt;', so undo that.
    // NOTE(review): the previous search string was a literal '>', which made
    // this call a no-op - confirm '&gt;' is the intended search string.
    $XML = str_replace('&gt;', '>', $XML);

    // work around the IE bug that some elements are serialized with a null namespace
    $XML = str_replace('embedNode.value = helperNode.innerHTML;', 'embedNode.value = helperNode.innerHTML.replace(/<:/g,"<").replace(/<.:/g,"</");', $XML);

    return $XML;
}