Example #1
0
/**
 * Extract event fields from an HTML fragment marked up with "vevent" classes.
 *
 * Every element whose class contains "vevent" is scanned; its descendants
 * supply the values below. Later matches overwrite earlier ones.
 *
 * @param string $h HTML fragment; it is wrapped in <html><body> before parsing
 * @return array Possibly empty array with any of the keys:
 *               'start' / 'finish' (title attribute of the dtstart / dtend element,
 *               passed through datetime_convert()), 'adjust' (true when the start
 *               value contains no "Z"), 'desc' and 'location' (text content,
 *               purified and converted to bbcode when it contains "<" or ">")
 */
function parse_event($h)
{
    require_once 'include/Scrape.php';
    require_once 'library/HTMLPurifier.auto.php';
    // NOTE(review): unlike the includes above, this path has no ".php" extension — confirm the file name.
    require_once 'include/html2bbcode';
    $h = '<html><body>' . $h . '</body></html>';
    $ret = array();
    // Pre-initialise so a parse exception leaves a defined, falsy value for the guard below.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($h);
    } catch (DOMException $e) {
        logger('parse_event: parse error: ' . $e);
    }
    if (!$dom) {
        return $ret;
    }
    foreach ($dom->getElementsByTagName('*') as $item) {
        if (!attribute_contains($item->getAttribute('class'), 'vevent')) {
            continue;
        }
        foreach ($item->getElementsByTagName('*') as $x) {
            $class = $x->getAttribute('class');
            $title = $x->getAttribute('title');
            if (attribute_contains($class, 'dtstart') && $title) {
                $ret['start'] = $title;
                // Strict comparison: strpos() returns 0 (falsy) for a match at the first character.
                if (strpos($title, 'Z') === false) {
                    $ret['adjust'] = true;
                }
            }
            if (attribute_contains($class, 'dtend') && $title) {
                $ret['finish'] = $title;
            }
            if (attribute_contains($class, 'description')) {
                $ret['desc'] = $x->textContent;
            }
            if (attribute_contains($class, 'location')) {
                $ret['location'] = $x->textContent;
            }
        }
    }
    // sanitise: only texts that contain angle brackets are run through the purifier
    foreach (array('desc', 'location') as $field) {
        if (x($ret, $field) && (strpos($ret[$field], '<') !== false || strpos($ret[$field], '>') !== false)) {
            $config = HTMLPurifier_Config::createDefault();
            $config->set('Cache.DefinitionImpl', null);
            $purifier = new HTMLPurifier($config);
            $ret[$field] = html2bbcode($purifier->purify($ret[$field]));
        }
    }
    foreach (array('start', 'finish') as $field) {
        if (x($ret, $field)) {
            $ret[$field] = datetime_convert('UTC', 'UTC', $ret[$field]);
        }
    }
    return $ret;
}
Example #2
0
/**
 * Parse a post as an HTML fragment, run process() on the resulting document
 * and return the document serialised as HTML.
 *
 * @param string $postText  Raw post text handed to HTML5_Parser::parseFragment()
 * @param string $name      Forwarded to the parser as its last argument
 * @param bool   $noSmilies Stored in the global $postNoSmilies before parsing
 * @return string Serialised HTML, or "" when the fragment produced no nodes
 */
function cleanUpPost($postText, $name = "", $noSmilies = false)
{
    global $filter_tags, $bbcode, $postNoSmilies;
    $postNoSmilies = $noSmilies;
    require_once 'HTML5/Parser.php';
    $nodes = HTML5_Parser::parseFragment($postText, null, null, $filter_tags, $bbcode, $name);
    // An empty fragment yields no first node; check before dereferencing it
    // instead of reading ->ownerDocument on null.
    $first = $nodes ? $nodes->item(0) : null;
    // The DOM tree is empty. Ignore it.
    if (!$first || !$first->ownerDocument) {
        return "";
    }
    $document = $first->ownerDocument;
    process($document);
    return $document->saveHTML();
}
Example #3
0
/**
 * Request handler: fetch the page named by $_GET['url'] and echo a link to it,
 * using the page <title> as link text and the first paragraph of at least
 * 100 bytes (cut to 250 bytes plus "...") as a teaser.
 *
 * When the page cannot be fetched or parsed, or has no title, the URL itself
 * is used as the link text. All values placed into the markup are HTML-escaped
 * because they come from the request or from a remote page.
 *
 * @param mixed $a Unused by this function; kept for the caller's signature
 * @return void Output is echoed, followed by killme()
 */
function parse_url_content(&$a)
{
    $url = isset($_GET['url']) ? trim($_GET['url']) : '';
    $template = "<a href=\"%s\" >%s</a>%s";
    if (!$url) {
        echo '';
        killme();
    }
    // ENT_SUBSTITUTE keeps htmlspecialchars() from returning '' when the byte-wise
    // truncation below cuts a multi-byte character in half.
    $flags = defined('ENT_SUBSTITUTE') ? (ENT_QUOTES | ENT_SUBSTITUTE) : ENT_QUOTES;
    $safe_url = htmlspecialchars($url, $flags, 'UTF-8');
    $s = fetch_url($url);
    if (!$s) {
        echo sprintf($template, $safe_url, $safe_url, '');
        killme();
    }
    $dom = HTML5_Parser::parse($s);
    if (!$dom) {
        // Same fallback as a failed fetch: a bare link to the URL.
        echo sprintf($template, $safe_url, $safe_url, '');
        killme();
        return;
    }
    $title = '';
    foreach ($dom->getElementsByTagName('title') as $item) {
        $title = $item->textContent;
        break;
    }
    if (!strlen($title)) {
        $title = $url;
    }
    $text = '';
    foreach ($dom->getElementsByTagName('p') as $item) {
        $candidate = strip_tags($item->textContent);
        // Skip short paragraphs without letting them leak into the output.
        if (strlen($candidate) < 100) {
            continue;
        }
        $text = substr($candidate, 0, 250) . '...';
        break;
    }
    if (strlen($text)) {
        $text = '<br />' . htmlspecialchars($text, $flags, 'UTF-8');
    }
    echo sprintf($template, $safe_url, htmlspecialchars($title, $flags, 'UTF-8'), $text);
    killme();
}
Example #4
0
 /**
  * Fetch a page and collect its <meta> elements whose name starts with "dfrn-".
  *
  * @param string $url Page to fetch
  * @return array Map of meta name => content attribute; empty when the page
  *               could not be fetched or parsed
  */
 function scrape_meta($url)
 {
     $found = array();
     $body = fetch_url($url);
     if (!$body) {
         return $found;
     }
     $document = HTML5_Parser::parse($body);
     if (!$document) {
         return $found;
     }
     // get DFRN link elements
     foreach ($document->getElementsByTagName('meta') as $meta) {
         $name = $meta->getAttribute('name');
         if (strncmp($name, 'dfrn-', 5) === 0) {
             $found[$name] = $meta->getAttribute('content');
         }
     }
     return $found;
 }
 /**
  * Parse an HTML fragment with HTML5lib and record the outcome on this object.
  *
  * The fragment is wrapped in a full document with a UTF-8 content-type meta tag.
  * A successful parse is passed to setDocument(); any PHP error or exception
  * raised while parsing results in setInvalid() instead.
  *
  * @param string $content HTML fragment placed inside <body>
  * @return void
  */
 public function setContent($content)
 {
     require_once HTML5LIB_PATH . '/HTML5/Parser.php';
     // Promote PHP errors to exceptions so the catch below sees them.
     $toException = function ($no, $str) {
         throw new Exception("HTML Parse Error: " . $str);
     };
     set_error_handler($toException, error_reporting());
     try {
         // Built inside the try block: interpolating $content can itself raise an error.
         $markup = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>' . "<body>{$content}</body></html>";
         $document = HTML5_Parser::parse($markup);
     } catch (Exception $e) {
         $document = false;
     }
     // Put the previously installed error handler back.
     restore_error_handler();
     if (!$document) {
         $this->setInvalid();
         return;
     }
     $this->setDocument($document);
 }
Example #6
0
 /**
  * Create instance of Readability.
  *
  * The HTML is pre-processed with the 'replaceBrs' and 'replaceFonts' patterns,
  * converted to HTML entities and parsed into $this->dom. html5lib is used only
  * when requested and when it returns a truthy document; otherwise libxml's
  * DOMDocument::loadHTML() is used.
  *
  * @param string $html   UTF-8 encoded string
  * @param string $url    (optional) URL associated with HTML (used for footnotes)
  * @param string $parser which parser to use for turning raw HTML into a DOMDocument
  *                       (either 'libxml' or 'html5lib')
  */
 function __construct($html, $url = null, $parser = 'libxml')
 {
     $this->url = $url;
     /* Turn all double br's into p's */
     $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
     /* Matches of the 'replaceFonts' pattern are rewritten as span tags */
     $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     if (trim($html) == '') {
         $html = '<html></html>';
     }
     $wantsHtml5lib = ($parser == 'html5lib');
     if ($wantsHtml5lib) {
         $this->dom = HTML5_Parser::parse($html);
     }
     // Fall back to libxml when html5lib was not requested or produced nothing.
     if (!$wantsHtml5lib || !$this->dom) {
         $this->dom = new DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         @$this->dom->loadHTML($html);
     }
     $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 }
Example #7
0
 /**
  * Discover LRDD links for a URI.
  *
  * URIs containing "@" are delegated to webfinger(). Otherwise the host's
  * /.well-known/host-meta is fetched and inspected. With host priority (the
  * default) an lrdd template/href from host-meta, or an lrdd Link: header of
  * the host-meta response, is followed. Failing that, the resource itself is
  * fetched and its <link rel="lrdd"> element or Link: header is followed.
  *
  * @param string $uri Address or URL to resolve
  * @return array Result of webfinger() / fetch_xrd_links() when a descriptor is
  *               found, otherwise the link entries read from host-meta (possibly
  *               empty). An empty array is returned when host-meta cannot be
  *               fetched or parsed, or when the resource looks like a feed.
  */
 function lrdd($uri)
 {
     $a = get_app();
     // default priority is host priority, host-meta first
     $priority = 'host';
     // All we have is an email address. Resource-priority is irrelevant
     // because our URI isn't directly resolvable.
     if (strstr($uri, '@')) {
         return webfinger($uri);
     }
     // get the host meta file
     $host = @parse_url($uri);
     // Without a host component there is no host-meta location to construct.
     if (!$host || !isset($host['host'])) {
         return array();
     }
     $url = (x($host, 'scheme') ? $host['scheme'] : 'http') . '://';
     $url .= $host['host'] . '/.well-known/host-meta';
     logger('lrdd: constructed url: ' . $url);
     $xml = fetch_url($url);
     $headers = $a->get_curl_headers();
     if (!$xml) {
         return array();
     }
     logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);
     $h = parse_xml_string($xml);
     if (!$h) {
         return array();
     }
     $arr = convert_xml_element_to_array($h);
     if (isset($arr['xrd']['property'])) {
         // Read the same 'xrd' key that was just tested.
         $property = $arr['xrd']['property'];
         $properties = isset($property[0]) ? $property : array($property);
         foreach ($properties as $prop) {
             $attrs = (is_array($prop) && isset($prop['@attributes'])) ? $prop['@attributes'] : null;
             // NOTE(review): the link handling below reads '@attributes' as an array of
             // attribute values, so both a plain string and an array holding the URI are
             // accepted here — confirm which attribute carries the priority URI.
             if ((is_string($attrs) && $attrs === 'http://lrdd.net/priority/resource')
                 || (is_array($attrs) && in_array('http://lrdd.net/priority/resource', $attrs, true))) {
                 $priority = 'resource';
             }
         }
     }
     // save the links in case we need them
     $links = array();
     if (isset($arr['xrd']['link'])) {
         $link = $arr['xrd']['link'];
         $links = isset($link[0]) ? $link : array($link);
     }
     // do we have a template or href?
     foreach ($links as $link) {
         if (!empty($link['@attributes']['rel']) && attribute_contains($link['@attributes']['rel'], 'lrdd')) {
             if (x($link['@attributes'], 'template')) {
                 $tpl = $link['@attributes']['template'];
             } elseif (x($link['@attributes'], 'href')) {
                 $href = $link['@attributes']['href'];
             }
         }
     }
     // A placeholder at position 0 is also rejected here, since strpos() returns 0 there.
     if (!isset($tpl) || !strpos($tpl, '{uri}')) {
         $tpl = '';
     }
     if ($priority === 'host') {
         if (strlen($tpl)) {
             $pxrd = str_replace('{uri}', urlencode($uri), $tpl);
         } elseif (isset($href)) {
             $pxrd = $href;
         }
         if (isset($pxrd)) {
             logger('lrdd: (host priority) pxrd: ' . $pxrd);
             return fetch_xrd_links($pxrd);
         }
         foreach (explode("\n", $headers) as $line) {
             if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
                 return fetch_xrd_links($matches[1]);
             }
         }
     }
     // priority 'resource'
     $html = fetch_url($uri);
     $headers = $a->get_curl_headers();
     logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
     // don't try and parse raw xml as html
     if (!strstr($html, '<?xml')) {
         require_once 'library/HTML5/Parser.php';
         // Stays null when the parser throws, so the check below is well defined.
         $dom = null;
         try {
             $dom = HTML5_Parser::parse($html);
         } catch (DOMException $e) {
             logger('lrdd: parse error: ' . $e);
         }
         if ($dom) {
             foreach ($dom->getElementsByTagName('link') as $item) {
                 if ($item->getAttribute('rel') === 'lrdd') {
                     $pagelink = $item->getAttribute('href');
                     break;
                 }
             }
         }
     }
     if (isset($pagelink)) {
         return fetch_xrd_links($pagelink);
     }
     // next look in HTTP headers
     // The body check does not depend on the header line, so compute it once.
     $body_is_feed = stristr($html, '<rss') || stristr($html, '<feed');
     foreach (explode("\n", $headers) as $line) {
         // TODO alter the following regex to support multiple relations (space separated)
         if (stristr($line, 'link:') && preg_match('/<([^>].*)>.*rel\\=[\'\\"]lrdd[\'\\"]/', $line, $matches)) {
             $pagelink = $matches[1];
             break;
         }
         // don't try and run feeds through the html5 parser
         if (stristr($line, 'content-type:') && (stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml'))) {
             return array();
         }
         if ($body_is_feed) {
             return array();
         }
     }
     if (isset($pagelink)) {
         return fetch_xrd_links($pagelink);
     }
     // If we haven't found any links, return the host xrd links (which we have already fetched)
     return $links;
 }
Example #8
0
 /**
  * parseFragment() should return a DOMNodeList for a small inline fragment.
  */
 public function testParseFragment()
 {
     $this->assertIsA(
         HTML5_Parser::parseFragment('<b>asdf</b> foo'),
         'DOMNodeList'
     );
 }
Example #9
0
 /**
  * Scrape a profile page for DFRN meta tags, "dfrn-*" links and hCard fields.
  *
  * Three scans are made over the parsed page (meta elements, link elements,
  * all elements). Each scan stops after max($max_nodes, 100) nodes; the meta
  * and hCard scans also stop once pop_scrape_target() reports no targets left.
  *
  * @param string $url       Profile URL; "tab=profile" is appended when missing
  * @param int    $max_nodes Upper bound on nodes inspected per scan
  * @return array Scraped values plus '_timings' => array('fetch' => ms, 'scrape' => ms);
  *               an empty array when the page could not be fetched or parsed
  */
 function scrape_dfrn($url, $max_nodes = 3500)
 {
     //Lets do at least 100 nodes per type.
     $minNodes = 100;
     //Timeout will affect batch processing.
     $timeout = 10;
     //Try and cheat our way into faster profiles.
     if (strpos($url, 'tab=profile') === false) {
         $separator = strpos($url, '?') > 0 ? '&' : '?';
         $url .= $separator . 'tab=profile';
     }
     $scrape_start = microtime(true);
     $ret = array();
     $s = fetch_url($url, $timeout);
     $scrape_fetch_end = microtime(true);
     if (!$s) {
         return $ret;
     }
     $dom = HTML5_Parser::parse($s);
     if (!$dom) {
         return $ret;
     }
     // The same node budget applies to each of the three scans.
     $node_budget = max(intval($max_nodes), $minNodes);

     // Scan 1: meta elements
     $nodes_left = $node_budget;
     $targets = array('hide', 'comm', 'tags');
     $targets_left = count($targets);
     foreach ($dom->getElementsByTagName('meta') as $meta) {
         switch ($meta->getAttribute('name')) {
             case 'dfrn-global-visibility':
                 $value = strtolower(trim($meta->getAttribute('content')));
                 if ($value != 'true') {
                     $ret['hide'] = 1;
                 }
                 if ($value === 'false') {
                     $ret['explicit-hide'] = 1;
                 }
                 $targets_left = pop_scrape_target($targets, 'hide');
                 break;
             case 'friendika.community':
             case 'friendica.community':
                 $value = strtolower(trim($meta->getAttribute('content')));
                 if ($value == 'true') {
                     $ret['comm'] = 1;
                 }
                 $targets_left = pop_scrape_target($targets, 'comm');
                 break;
             case 'keywords':
                 $value = str_replace(',', ' ', strtolower(trim($meta->getAttribute('content'))));
                 if (strlen($value)) {
                     $ret['tags'] = $value;
                 }
                 $targets_left = pop_scrape_target($targets, 'tags');
                 break;
         }
         $nodes_left--;
         if ($nodes_left <= 0 || $targets_left <= 0) {
             break;
         }
     }

     // Scan 2: link elements whose rel starts with "dfrn-"
     $nodes_left = $node_budget;
     foreach ($dom->getElementsByTagName('link') as $link) {
         $rel = $link->getAttribute('rel');
         if (strncmp($rel, 'dfrn-', 5) === 0) {
             $ret[$rel] = $link->getAttribute('href');
         }
         $nodes_left--;
         if ($nodes_left <= 0) {
             break;
         }
     }

     // Scan 3: Pull out hCard profile elements
     $nodes_left = $node_budget;
     $targets = array('fn', 'pdesc', 'photo', 'key', 'locality', 'region', 'postal-code', 'country-name', 'gender', 'marital');
     $targets_left = count($targets);
     // class name => array(result key, attribute to read or null for the text content),
     // listed in the order the checks are applied.
     $hcard_fields = array(
         'fn' => array('fn', null),
         'title' => array('pdesc', null),
         'photo' => array('photo', 'src'),
         'key' => array('key', null),
         'locality' => array('locality', null),
         'region' => array('region', null),
         'postal-code' => array('postal-code', null),
         'country-name' => array('country-name', null),
         'x-gender' => array('gender', null),
     );
     foreach ($dom->getElementsByTagName('*') as $item) {
         if (attribute_contains($item->getAttribute('class'), 'vcard')) {
             foreach ($item->getElementsByTagName('*') as $x) {
                 foreach ($hcard_fields as $class_name => $spec) {
                     if (!attribute_contains($x->getAttribute('class'), $class_name)) {
                         continue;
                     }
                     list($key, $attribute) = $spec;
                     $ret[$key] = ($attribute === null) ? $x->textContent : $x->getAttribute($attribute);
                     $targets_left = pop_scrape_target($targets, $key);
                 }
             }
         }
         if (attribute_contains($item->getAttribute('class'), 'marital-text')) {
             $ret['marital'] = $item->textContent;
             $targets_left = pop_scrape_target($targets, 'marital');
         }
         $nodes_left--;
         if ($nodes_left <= 0 || $targets_left <= 0) {
             break;
         }
     }
     $scrape_end = microtime(true);
     $ret['_timings'] = array(
         'fetch' => round(($scrape_fetch_end - $scrape_start) * 1000),
         'scrape' => round(($scrape_end - $scrape_fetch_end) * 1000),
     );
     return $ret;
 }
Example #10
0
/**
 * Work out the Atom/RSS feed URLs for a given URL.
 *
 * If the response itself looks like a feed (by content-type header or body
 * markers) the URL is returned as the feed. Otherwise the page is parsed as
 * HTML and the first <link rel="alternate"> of each feed type is used;
 * hrefs without "://" are prefixed with the <base href> or, failing that,
 * with the scheme and host of $url.
 *
 * @param string $url Address to probe
 * @return array Possibly empty array with the keys 'feed_atom' and/or 'feed_rss'
 */
function scrape_feed($url)
{
    $ret = array();
    $level = 0;
    $response = z_fetch_url($url, false, $level, array('novalidate' => true));
    if (!$response['success']) {
        return $ret;
    }
    $headers = $response['header'];
    $code = $response['return_code'];
    $s = $response['body'];
    logger('scrape_feed: returns: ' . $code . ' headers=' . $headers, LOGGER_DEBUG);
    if (!$s) {
        logger('scrape_feed: no data returned for ' . $url);
        return $ret;
    }
    foreach (explode("\n", $headers) as $line) {
        if (!stristr($line, 'content-type:')) {
            continue;
        }
        if (stristr($line, 'application/atom+xml') || stristr($s, '<feed')) {
            $ret['feed_atom'] = $url;
            return $ret;
        }
        if (stristr($line, 'application/rss+xml') || stristr($s, '<rss')) {
            $ret['feed_rss'] = $url;
            return $ret;
        }
    }
    // perhaps an RSS version 1 feed with a generic or incorrect content-type?
    if (stristr($s, '</item>')) {
        $ret['feed_rss'] = $url;
        return $ret;
    }
    // Stays null when the parser throws, so the check below is well defined.
    $dom = null;
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('scrape_feed: parse error: ' . $e);
    }
    if (!$dom) {
        logger('scrape_feed: failed to parse.');
        return $ret;
    }
    // Default to '' so pages without a <base> element fall through to the URL-derived base.
    $basename = '';
    foreach ($dom->getElementsByTagName('base') as $base) {
        $basename = $base->getAttribute('href');
        break;
    }
    if (!$basename) {
        // scheme + "//" + host of the requested URL
        $basename = implode('/', array_slice(explode('/', $url), 0, 3)) . '/';
    }
    // get Atom/RSS link elements, take the first one of either.
    $feed_types = array('feed_atom' => 'application/atom+xml', 'feed_rss' => 'application/rss+xml');
    foreach ($dom->getElementsByTagName('link') as $item) {
        if ($item->getAttribute('rel') !== 'alternate') {
            continue;
        }
        foreach ($feed_types as $key => $mime) {
            if ($item->getAttribute('type') === $mime && !x($ret, $key)) {
                $ret[$key] = $item->getAttribute('href');
            }
        }
    }
    // Drupal and perhaps others only provide relative URL's. Turn them into absolute.
    foreach (array_keys($feed_types) as $key) {
        if (x($ret, $key) && !strstr($ret[$key], '://')) {
            $ret[$key] = $basename . $ret[$key];
        }
    }
    return $ret;
}
Example #11
0
<?php

require_once dirname(__FILE__) . '/../library/HTML5/Parser.php';
// Parse the file named by the first command-line argument, or stdin when none is given.
$argv = $_SERVER['argv'];
$file = isset($argv[1]) ? $argv[1] : 'php://stdin';
$contents = file_get_contents($file);
if ($contents === false) {
    // Report an unreadable input instead of silently handing false to the parser.
    file_put_contents('php://stderr', "Unable to read input: {$file}\n");
    exit(1);
}
// The parse result is not used further in this script.
$result = HTML5_Parser::parse($contents);
// nop
Example #12
0
 /**
  * Work out the Atom/RSS feed URLs for a given URL.
  *
  * If the response itself looks like a feed (content-type header or body
  * markers) the URL is returned as the feed. Otherwise the page is parsed as
  * HTML and the first <link rel="alternate"> of each feed type is used;
  * hrefs without "://" are prefixed with the <base href> or, failing that,
  * with the directory part of $url.
  *
  * @param string $url Address to probe
  * @return array Possibly empty array with the keys 'feed_atom' and/or 'feed_rss'
  */
 function scrape_feed($url)
 {
     $a = get_app();
     $ret = array();
     $s = fetch_url($url);
     if (!$s) {
         return $ret;
     }
     $headers = $a->get_curl_headers();
     logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
     foreach (explode("\n", $headers) as $line) {
         if (!stristr($line, 'content-type:')) {
             continue;
         }
         if (stristr($line, 'application/atom+xml') || stristr($s, '<feed')) {
             $ret['feed_atom'] = $url;
             return $ret;
         }
         if (stristr($line, 'application/rss+xml') || stristr($s, '<rss')) {
             $ret['feed_rss'] = $url;
             return $ret;
         }
     }
     // Stays null when the parser throws, so the check below is well defined.
     $dom = null;
     try {
         $dom = HTML5_Parser::parse($s);
     } catch (DOMException $e) {
         logger('scrape_feed: parse error: ' . $e);
     }
     if (!$dom) {
         return $ret;
     }
     // Default to '' so pages without a <base> element fall through to the URL-derived base.
     $basename = '';
     foreach ($dom->getElementsByTagName('base') as $base) {
         $basename = $base->getAttribute('href');
         break;
     }
     if (!$basename) {
         $last_slash = strrpos($url, '/');
         $scheme_at = strpos($url, '://');
         if ($scheme_at !== false && $last_slash <= $scheme_at + 2) {
             // The only slashes belong to "://" (e.g. "http://example.com"): cutting at
             // the last slash would leave just "http://", so use the URL itself as the base.
             $basename = $url . '/';
         } else {
             $basename = substr($url, 0, (int) $last_slash) . '/';
         }
     }
     // get Atom/RSS link elements, take the first one of either.
     $feed_types = array('feed_atom' => 'application/atom+xml', 'feed_rss' => 'application/rss+xml');
     foreach ($dom->getElementsByTagName('link') as $item) {
         if ($item->getAttribute('rel') !== 'alternate') {
             continue;
         }
         foreach ($feed_types as $key => $mime) {
             if ($item->getAttribute('type') === $mime && !x($ret, $key)) {
                 $ret[$key] = $item->getAttribute('href');
             }
         }
     }
     // Drupal and perhaps others only provide relative URL's. Turn them into absolute.
     foreach (array_keys($feed_types) as $key) {
         if (x($ret, $key) && !strstr($ret[$key], '://')) {
             $ret[$key] = $basename . $ret[$key];
         }
     }
     return $ret;
 }
Example #13
0
 /**
  * Strip all tags except images from an HTML string, trim it via walk() and
  * return the remaining markup.
  *
  * @param string $html   HTML to shorten
  * @param int    $maxLen Limit handed to walk() (NOTE(review): unit is defined by walk() — confirm)
  * @return string Serialised HTML; on PHP >= 5.3.6 only the <body> element is serialised
  */
 public static function trim($html, $maxLen = 25)
 {
     // strip_tags() ignores self-closing entries in its allow-list, so the
     // non-self-closing form is required for images to survive.
     $html = strip_tags($html, '<img>');
     require_once get_template_directory() . '/core/inc/HTML5/Parser.php';
     $dom = HTML5_Parser::parse($html);
     $walker = new static();
     $toRemove = $walker->walk($dom, $maxLen);
     // remove any nodes that passed our limit
     foreach ($toRemove as $child) {
         $child->parentNode->removeChild($child);
     }
     // remove wrapper tags added by DD (doctype, html...)
     if (version_compare(PHP_VERSION, '5.3.6') < 0) {
         // saveHTML() takes no node argument before 5.3.6, so unwrap the document by hand.
         $dom->removeChild($dom->firstChild);
         $dom->replaceChild($dom->firstChild->firstChild->firstChild, $dom->firstChild);
         return $dom->saveHTML();
     }
     return $dom->saveHTML($dom->getElementsByTagName('body')->item(0));
 }
/**
 * Main shortcode function
 *
 * @since 0.1
 */
function file_gallery_shortcode($content = false, $attr = false)
{
    global $file_gallery, $wpdb, $post;
    require_once 'html5lib/Parser.php';
    // if the function is called directly, not via shortcode
    if (false !== $content && false === $attr) {
        $attr = wp_parse_args($content);
    }
    if (!isset($file_gallery->gallery_id)) {
        $file_gallery->gallery_id = 1;
    } else {
        $file_gallery->gallery_id++;
    }
    $options = get_option('file_gallery');
    if (isset($options['cache']) && true == $options['cache']) {
        if ('html' == $attr['output_type'] || isset($options['cache_non_html_output']) && true == $options['cache_non_html_output']) {
            $transient = 'filegallery_' . md5($post->ID . "_" . serialize($attr));
            $cache = get_transient($transient);
            if ($cache) {
                return $cache;
            }
        }
    }
    // if option to show galleries in excerpts is set to false...
    // ...replace [gallery] with user selected text
    if (!is_singular() && (!isset($options['in_excerpt']) || true != $options['in_excerpt'])) {
        return $options['in_excerpt_replace_content'];
    }
    $default_templates = unserialize(FILE_GALLERY_DEFAULT_TEMPLATES);
    // We're trusting author input, so let's at least make sure it looks like a valid orderby statement
    if (isset($attr['orderby'])) {
        $attr['orderby'] = sanitize_sql_orderby($attr['orderby']);
        if (!$attr['orderby']) {
            unset($attr['orderby']);
        }
    }
    $defaults = array('order' => 'ASC', 'orderby' => '', 'id' => $post->ID, 'columns' => 3, 'size' => 'thumbnail', 'link' => 'attachment', 'include' => '', 'ids' => '', 'exclude' => '', 'template' => 'default', 'linkclass' => '', 'imageclass' => '', 'galleryclass' => '', 'rel' => 1, 'tags' => '', 'tags_from' => 'current', 'output_type' => 'html', 'output_params' => 1, 'attachment_ids' => '', 'mimetype' => '', 'limit' => -1, 'offset' => -1, 'paginate' => 0, 'link_size' => 'full', 'include_meta' => false);
    if (floatval(get_bloginfo('version')) >= 3.5) {
        $defaults['link'] = 'post';
    }
    // extract the defaults...
    extract(shortcode_atts($defaults, $attr));
    if (!in_array($template, $default_templates)) {
        $template_file = FILE_GALLERY_THEME_TEMPLATES_ABSPATH . '/' . $template . '/gallery.php';
        if (!is_readable($template_file)) {
            $template_file = FILE_GALLERY_CONTENT_TEMPLATES_ABSPATH . '/' . $template . '/gallery.php';
        }
    } else {
        if ('default' == $template) {
            $template_file = FILE_GALLERY_DEFAULT_TEMPLATE_ABSPATH . '/gallery.php';
            $template = FILE_GALLERY_DEFAULT_TEMPLATE_NAME;
        } else {
            $template_file = FILE_GALLERY_ABSPATH . '/templates/' . $template . '/gallery.php';
        }
    }
    // check if template exists and replace with default if it does not
    if (!is_readable($template_file)) {
        $template_file = FILE_GALLERY_ABSPATH . '/templates/default/gallery.php';
        $template = 'default';
    }
    // get overriding variables from the template file
    $overriding = true;
    ob_start();
    include $template_file;
    ob_end_clean();
    $overriding = false;
    if (is_array($file_gallery->overrides) && !empty($file_gallery->overrides)) {
        extract($file_gallery->overrides);
        $file_gallery->overrides = NULL;
    }
    $limit = (int) $limit;
    $offset = (int) $offset;
    $page = (int) get_query_var('page');
    // if( $captions === 'false' || $captions == '0' ) {
    // 	$captions = false;
    // }
    if ('false' === $rel || is_numeric($rel) && 0 === (int) $rel) {
        $_rel = false;
    } elseif (1 === $rel) {
        $_rel = true;
    } else {
        $_rel = $rel;
    }
    if ('false' === $output_params || is_numeric($output_params) && 0 === (int) $output_params) {
        $output_params = false;
    } else {
        $output_params = true;
    }
    if ('false' === $paginate || is_numeric($paginate) && 0 === (int) $paginate || 0 > $limit) {
        $paginate = false;
        $found_rows = '';
    } else {
        $paginate = true;
        $found_rows = 'SQL_CALC_FOUND_ROWS';
        if (0 === $page) {
            $page = 1;
        }
        if (is_singular() && 1 < $page) {
            $offset = $limit * ($page - 1);
        }
    }
    $file_gallery->debug_add('pagination', compact('paginate', 'page'));
    /**/
    $_attachment_ids = explode(',', trim($attachment_ids, ','));
    $_include = explode(',', trim($include, ','));
    $_ids = explode(',', trim($ids, ','));
    $attachment_ids = array_merge($_attachment_ids, $_include, $_ids);
    $attachment_ids = array_unique($attachment_ids);
    $attachment_ids = implode(',', $attachment_ids);
    $attachment_ids = trim($attachment_ids, ',');
    $attachment_ids = trim($attachment_ids);
    /**/
    if (!isset($linkto)) {
        $linkto = $link;
    }
    $sql_mimetype = '';
    if ('' != $mimetype) {
        $mimetype = file_gallery_get_mime_type($mimetype);
        $sql_mimetype = wp_post_mime_type_where($mimetype);
    }
    $approved_attachment_post_statuses = apply_filters('file_gallery_approved_attachment_post_statuses', array('inherit'));
    $ignored_attachment_post_statuses = apply_filters('file_gallery_ignored_attachment_post_statuses', array('trash', 'private', 'pending', 'future'));
    if (!empty($approved_attachment_post_statuses)) {
        $post_statuses = " AND (post_status IN ('" . implode("', '", $approved_attachment_post_statuses) . "') ) ";
    } elseif (!empty($ignored_attachment_post_statuses)) {
        $post_statuses = " AND (post_status NOT IN ('" . implode("', '", $ignored_attachment_post_statuses) . "') ) ";
    } else {
        $post_statuses = "";
    }
    $file_gallery_query = new stdClass();
    // start with tags because they negate everything else
    if ('' != $tags) {
        if ('' == $orderby || 'file_gallery' == $orderby) {
            $orderby = "menu_order ID";
        }
        $query = array('post_status' => implode(',', $approved_attachment_post_statuses), 'post_type' => 'attachment', 'order' => $order, 'orderby' => $orderby, 'posts_per_page' => $limit, 'post_mime_type' => $mimetype, FILE_GALLERY_MEDIA_TAG_NAME => $tags);
        if ('current' == $tags_from) {
            $query['post_parent'] = $id;
        }
        if (!empty($exclude)) {
            $query['post__not_in'] = explode(',', preg_replace('/[^0-9,]+/', '', $exclude));
        }
        if (0 < $offset) {
            $query['offset'] = $offset;
        }
        $file_gallery_query = new WP_Query($query);
        $attachments = $file_gallery_query->posts;
        unset($query);
    } elseif ('' != $attachment_ids) {
        $attachment_ids = explode(',', $attachment_ids);
        $sql_limit = count($attachment_ids);
        if ('rand' == $orderby) {
            shuffle($attachment_ids);
        }
        $attachment_ids = implode(',', $attachment_ids);
        if ('' == $orderby || 'rand' == $orderby || $orderby == 'post__in') {
            $orderby = sprintf("FIELD(ID, %s)", $attachment_ids);
            $order = '';
        } elseif ('title' == $orderby) {
            $orderby = "post_title";
        }
        $query = sprintf("SELECT " . $found_rows . " * FROM {$wpdb->posts} \r\n\t\t\t WHERE ID IN (%s) \r\n\t\t\t AND post_type = 'attachment' \r\n\t\t\t" . $post_statuses . " ", $attachment_ids);
        $query .= $sql_mimetype;
        $query .= sprintf(" ORDER BY %s %s ", $orderby, $order);
        if (true !== $paginate) {
            $limit = $sql_limit;
        }
    } else {
        if ('' == $orderby) {
            $orderby = "menu_order ID";
        }
        $query = array('post_parent' => $id, 'post_status' => implode(',', $approved_attachment_post_statuses), 'post_type' => 'attachment', 'order' => $order, 'orderby' => $orderby, 'posts_per_page' => $limit, 'post_mime_type' => $mimetype);
        if (!empty($exclude)) {
            $query['post__not_in'] = explode(',', preg_replace('/[^0-9,]+/', '', $exclude));
        }
        if (0 < $offset) {
            $query['offset'] = $offset;
        }
        $file_gallery_query = new WP_Query($query);
        $attachments = $file_gallery_query->posts;
        unset($query);
    }
    if (isset($query)) {
        if (0 < $limit) {
            $query .= " LIMIT " . $limit;
        }
        if (0 < $offset) {
            $query .= " OFFSET " . $offset;
        }
        $attachments = $wpdb->get_results($query);
        if ('' != $found_rows) {
            $file_gallery_query->found_posts = $wpdb->get_var("SELECT FOUND_ROWS()");
            $file_gallery_query->max_num_pages = ceil($file_gallery_query->found_posts / $limit);
        }
    }
    $file_gallery->debug_add('attachments_query', compact('file_gallery_query'));
    if (empty($attachments)) {
        return '<!-- "File Gallery" plugin says: - No attachments found for the following shortcode arguments: "' . json_encode($attr) . '" -->';
    }
    // feed
    if (is_feed()) {
        $output = "\n";
        foreach ($attachments as $attachment) {
            $output .= wp_get_attachment_link($attachment->ID, $size, true) . "\n";
        }
        return $output;
    }
    $i = 0;
    $unique_ids = array();
    $gallery_items = '';
    if ('object' == $output_type || 'array' == $output_type) {
        $gallery_items = array();
    }
    $autoqueueclasses = array();
    if (defined('FILE_GALLERY_LIGHTBOX_CLASSES')) {
        $autoqueueclasses = maybe_unserialize(FILE_GALLERY_LIGHTBOX_CLASSES);
    } else {
        $autoqueueclasses = explode(',', $options['auto_enqueued_scripts']);
    }
    $file_gallery_this_template_counter = 1;
    // create output
    foreach ($attachments as $attachment) {
        $param = array('image_class' => $imageclass, 'link_class' => $linkclass, 'rel' => $_rel, 'title' => '', 'caption' => '', 'description' => '', 'thumb_alt' => '');
        $attachment_file = get_attached_file($attachment->ID);
        $attachment_is_image = file_gallery_file_is_displayable_image($attachment_file);
        $startcol = '';
        $endcol = '';
        $x = '';
        if ($output_params) {
            $plcai = array_intersect($autoqueueclasses, explode(' ', trim($linkclass)));
            if (!empty($plcai)) {
                if ($attachment_is_image) {
                    if (true === $param['rel']) {
                        $param['rel'] = $plcai[0] . '[' . $file_gallery->gallery_id . ']';
                    } elseif (!is_bool($param['rel'])) {
                        if (false !== strpos($_rel, '$GID$')) {
                            $param['rel'] = str_replace('$GID$', $file_gallery->gallery_id, $_rel);
                        } else {
                            $param['rel'] = $_rel . '[' . $file_gallery->gallery_id . ']';
                        }
                    }
                    $filter_args = array('gallery_id' => $file_gallery->gallery_id, 'linkrel' => $param['rel'], 'linkclass' => $param['link_class'], 'imageclass' => $param['image_class']);
                    if ($param['rel']) {
                        $param['rel'] = apply_filters('file_gallery_lightbox_linkrel', $param['rel'], 'linkrel', $filter_args);
                    }
                    $param['link_class'] = apply_filters('file_gallery_lightbox_linkclass', $param['link_class'], 'linkclass', $filter_args);
                    $param['image_class'] = apply_filters('file_gallery_lightbox_imageclass', $param['image_class'], 'imageclass', $filter_args);
                } else {
                    $param['link_class'] = str_replace(trim(implode(' ', $plcai)), '', trim($linkclass));
                }
            }
            // if rel is still true or false
            if (is_bool($param['rel'])) {
                $param['rel'] = '';
            }
            switch ($linkto) {
                case 'parent_post':
                    $param['link'] = get_permalink($wpdb->get_var("SELECT post_parent FROM {$wpdb->posts} WHERE ID = '" . $attachment->ID . "'"));
                    break;
                case 'file':
                    $param['link'] = wp_get_attachment_url($attachment->ID);
                    break;
                case 'attachment':
                case 'post':
                    $param['link'] = get_attachment_link($attachment->ID);
                    break;
                case 'none':
                    $param['link'] = '';
                    break;
                default:
                    // external url
                    $param['link'] = urldecode($linkto);
                    break;
            }
            $param['title'] = $attachment->post_title;
            // $param['caption'] = $captions !== false ? $attachment->post_excerpt : '';
            $param['caption'] = $attachment->post_excerpt;
            $param['description'] = $attachment->post_content;
            if ($attachment_is_image) {
                $thumb_src = wp_get_attachment_image_src($attachment->ID, $size);
                $param['thumb_link'] = $thumb_src[0];
                $param['thumb_width'] = 0 == $thumb_src[1] ? file_gallery_get_image_size($param['thumb_link']) : $thumb_src[1];
                $param['thumb_height'] = 0 == $thumb_src[2] ? file_gallery_get_image_size($param['thumb_link'], true) : $thumb_src[2];
                if ('' != $param['link'] && 'full' != $link_size && in_array($link_size, file_gallery_get_intermediate_image_sizes())) {
                    $full_src = wp_get_attachment_image_src($attachment->ID, $link_size);
                    $param['link'] = $full_src[0];
                }
            } else {
                $param['thumb_link'] = wp_mime_type_icon($attachment->ID);
                $param['thumb_link'] = apply_filters('file_gallery_non_image_thumb_link', $param['thumb_link'], $attachment->post_mime_type, $attachment->ID);
                $param['thumb_width'] = '46';
                $param['thumb_height'] = '60';
            }
            if ($thumb_alt = get_post_meta($attachment->ID, '_wp_attachment_image_alt', true)) {
                $param['thumb_alt'] = $thumb_alt;
            }
            $param['attachment_id'] = $attachment->ID;
        }
        /**
         * Make sure that all attributes added/filtered via
         * 'wp_get_attachment_link' filter are included here as well
         */
        /**
        			$dom_document = new DOMDocument();
        			@$dom_document->loadHTML(wp_get_attachment_link($attachment->ID)); //
        			$wp_attachment_link_attributes = $dom_document->getElementsByTagName('a')->item(0)->attributes;
        		**/
        /**
        		$wp_attachment_link = new SimpleXMLElement(wp_get_attachment_link($attachment->ID));
        		$wp_attachment_link_attributes = $wp_attachment_link->attributes();
        
        		foreach( $wp_attachment_link_attributes as $key => $val )
        		{
        			if( $key === 'title' ) {
        				$param['title'] = $val;
        			}
        			else if( $key === 'class' ) {
        				$param['link_class'] .= ' ' . $val;
        			}
        			else if( $key === 'rel' ) {
        				$param['rel'] .= ' ' . $val;
        			}
        		}
        		**/
        $dom_document = HTML5_Parser::parse(wp_get_attachment_link($attachment->ID));
        $wp_attachment_link_attributes = $dom_document->getElementsByTagName("a")->item(0)->attributes;
        $length = $wp_attachment_link_attributes->length;
        for ($i = 0; $i < $length; ++$i) {
            $name = $wp_attachment_link_attributes->item($i)->name;
            $value = $wp_attachment_link_attributes->item($i)->value;
            if ($name === 'title') {
                $param['title'] = $value;
            } else {
                if ($name === 'class') {
                    $param['link_class'] .= ' ' . $value;
                } else {
                    if ($name === 'rel') {
                        $param['rel'] .= ' ' . $value;
                    }
                }
            }
        }
        $param = array_map('trim', $param);
        if ($include_meta) {
            $meta = get_post_custom($attachment->ID);
        }
        if ('object' == $output_type) {
            if ($output_params) {
                $attachment->params = (object) $param;
            }
            if ($include_meta) {
                $attachment->meta = (object) $meta;
            }
            $gallery_items[] = $attachment;
        } elseif ('array' == $output_type || 'json' == $output_type) {
            if ($output_params) {
                $attachment->params = $param;
            }
            if ($include_meta) {
                $attachment->meta = $meta;
            }
            $gallery_items[] = get_object_vars($attachment);
        } else {
            if ($columns > 0) {
                if (0 === $i || 0 === $i % $columns) {
                    $startcol = ' gallery-startcol';
                } elseif (($i + 1) % $columns == 0) {
                    // add the column break class
                    $endcol = ' gallery-endcol';
                }
            }
            // parse template
            ob_start();
            extract($param);
            include $template_file;
            $x = ob_get_contents();
            ob_end_clean();
            $file_gallery_this_template_counter++;
            if ($columns > 0 && $i + 1 % $columns == 0) {
                $x .= $cleartag;
            }
            $gallery_items .= $x;
            $i++;
        }
    }
    // handle data types
    if ('object' == $output_type || 'array' == $output_type) {
        $output = $gallery_items;
    } elseif ('json' == $output_type) {
        $output = json_encode($gallery_items);
    } else {
        $stc = '';
        $cols = '';
        $pagination_html = '';
        if (0 < (int) $columns) {
            $cols = ' columns_' . $columns;
        }
        if (isset($starttag_class) && '' != $starttag_class) {
            $stc = ' ' . $starttag_class;
        }
        $trans_append = "\n<!-- file gallery output cached on " . date('Y.m.d @ H:i:s', time()) . "-->\n";
        if (is_singular() && isset($file_gallery_query->max_num_pages) && 1 < $file_gallery_query->max_num_pages) {
            $pagination_html = file_gallery_do_pagination($file_gallery_query->max_num_pages, $page);
        }
        $gallery_class = apply_filters('file_gallery_galleryclass', 'gallery ' . str_replace(' ', '-', $template) . $cols . $stc . ' ' . $galleryclass);
        $output = '<' . $starttag . ' id="gallery-' . $file_gallery->gallery_id . '" class="' . $gallery_class . '">' . "\n" . $gallery_items . "\n" . $pagination_html . "\n</" . $starttag . '>';
    }
    if (isset($options['cache']) && true == $options['cache']) {
        if ('html' == $output_type) {
            set_transient($transient, $output . $trans_append, $options['cache_time']);
        } elseif (isset($options['cache_non_html_output']) && true == $options['cache_non_html_output']) {
            set_transient($transient, $output, $options['cache_time']);
        }
    }
    return apply_filters('file_gallery_output', $output, $post->ID, $file_gallery->gallery_id);
}
Example #15
0
 /**
  * Turn $this->html into a DOMDocument stored in $this->dom.
  *
  * Steps, in order: keep a copy of the untouched markup in
  * $this->original_html, derive $this->domainRegExp from the URL host,
  * switch the mbstring functions to UTF-8, apply the regexp pre-filters,
  * optionally clean up with Tidy, convert to HTML entities and finally parse
  * with html5lib or, when that is not selected or yields nothing, libxml.
  *
  * @todo This should be called in init() instead of from __construct
  */
 private function loadHtml()
 {
     $this->original_html = $this->html;
     $this->logger->debug('Parsing URL: ' . $this->url);
     if ($this->url) {
         // parse_url() gives null/false when the URL has no usable host;
         // cast so the string functions below always receive a string.
         $host = (string) parse_url($this->url, PHP_URL_HOST);
         $this->domainRegExp = '/' . strtr(preg_replace('/www\\d*\\./', '', $host), array('.' => '\\.')) . '/';
     }
     mb_internal_encoding('UTF-8');
     mb_http_output('UTF-8');
     mb_regex_encoding('UTF-8');
     // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
     if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
         foreach ($this->pre_filters as $search => $replace) {
             $filtered = preg_replace($search, $replace, $this->html);
             if (null === $filtered) {
                 // preg_replace() returns null on error; keep the markup we
                 // had instead of throwing the whole document away.
                 $this->logger->debug('Pre-filter failed, skipping: ' . $search);
                 continue;
             }
             $this->html = $filtered;
         }
         unset($search, $replace);
     }
     if (trim($this->html) === '') {
         $this->html = '<html></html>';
     }
     /*
      * Use tidy (if it exists).
      * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
      * Although sometimes it makes matters worse, which is why there is an option to disable it.
      */
     if ($this->useTidy) {
         $this->logger->debug('Tidying document');
         $tidy = tidy_parse_string($this->html, $this->tidy_config, 'UTF8');
         // tidy_parse_string() may return false; only repair a real document.
         if (false !== $tidy && tidy_clean_repair($tidy)) {
             $this->tidied = true;
             $this->html = $tidy->value;
             $this->html = preg_replace('/[\\r\\n]+/is', "\n", $this->html);
         }
         unset($tidy);
     }
     // NOTE(review): 'HTML-ENTITIES' as an mbstring encoding is deprecated as of PHP 8.2.
     $this->html = mb_convert_encoding($this->html, 'HTML-ENTITIES', 'UTF-8');
     $parsedByHtml5lib = $this->parser === 'html5lib' && ($this->dom = \HTML5_Parser::parse($this->html));
     if (!$parsedByHtml5lib) {
         // Remember the caller's libxml error mode so it can be put back
         // afterwards instead of being forced to "off".
         $previousErrorMode = libxml_use_internal_errors(true);
         $this->dom = new \DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         if (PHP_VERSION_ID >= 50400) {
             // The options argument of loadHTML() only exists from PHP 5.4 on.
             $this->dom->loadHTML($this->html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
         } else {
             $this->dom->loadHTML($this->html);
         }
         libxml_use_internal_errors($previousErrorMode);
     }
     $this->dom->registerNodeClass('DOMElement', 'Readability\\JSLikeHTMLElement');
 }
Example #16
0
/**
 * Build an HTML bookmark snippet for a URL and echo it.
 *
 * Input is read from $_GET: 'binurl' (hex-encoded) or 'url', plus the
 * optional 'title', 'description' and 'tags' (CSV). When title and
 * description are both supplied the snippet is emitted without fetching the
 * page; otherwise the page is fetched and a title, a text excerpt and (from
 * the <head> meta properties) an image are extracted from it.
 *
 * Every exit path echoes its result and then calls killme(); nothing is
 * returned. The URL and the image source are HTML-escaped before being
 * written into attributes because both come from outside this code.
 *
 * @param mixed $a Passed by reference; not read or modified here.
 */
function parse_url_content(&$a)
{
    // Everything below is only assigned conditionally, so give it a defined start value.
    $text = null;
    $title = '';
    $image = '';
    $dom = null;
    $domhead = null;
    $str_tags = '';
    if (x($_GET, 'binurl')) {
        $url = trim(hex2bin($_GET['binurl']));
    } else {
        $url = isset($_GET['url']) ? trim($_GET['url']) : '';
    }
    if (!empty($_GET['title'])) {
        $title = strip_tags(trim($_GET['title']));
    }
    if (!empty($_GET['description'])) {
        $text = strip_tags(trim($_GET['description']));
    }
    if (!empty($_GET['tags'])) {
        $arr_tags = str_getcsv($_GET['tags']);
        if (count($arr_tags)) {
            array_walk($arr_tags, 'arr_add_hashes');
            $str_tags = '<br />' . implode(' ', $arr_tags) . '<br />';
        }
    }
    logger('parse_url: ' . $url);
    $template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";
    // The URL is request input that ends up in an href attribute and, when no
    // title is known, as the link text.
    $safe_url = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    $arr = array('url' => $url, 'text' => '');
    call_hooks('parse_link', $arr);
    if (strlen($arr['text'])) {
        echo $arr['text'];
        killme();
    }
    if ($url && $title && $text) {
        $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
        $title = str_replace(array("\r", "\n"), array('', ''), $title);
        $result = sprintf($template, $safe_url, $title ? $title : $safe_url, $text) . $str_tags;
        logger('parse_url (unparsed): returns: ' . $result);
        echo $result;
        killme();
    }
    if ($url) {
        $s = fetch_url($url);
    } else {
        echo '';
        killme();
    }
    //	logger('parse_url: data: ' . $s, LOGGER_DATA);
    if (!$s) {
        echo sprintf($template, $safe_url, $safe_url, '') . $str_tags;
        killme();
    }
    $matches = array();
    if (preg_match('/\\<head(.*?)\\>(.*?)\\<\\/head\\>/ism', $s, $matches)) {
        //		logger('parse_url: header: ' . $matches[2], LOGGER_DATA);
        try {
            $domhead = HTML5_Parser::parse($matches[2]);
        } catch (DOMException $e) {
            logger('parse_url: parse error: ' . $e);
        }
        if ($domhead) {
            logger('parsed header');
        }
    }
    if (!$title) {
        // Compare against false: a match at offset 0 is still a match.
        $title_pos = strpos($s, '<title>');
        if (false !== $title_pos) {
            $title = substr($s, $title_pos + 7, 64);
            $tag_pos = strpos($title, '<');
            if (false !== $tag_pos) {
                $title = strip_tags(substr($title, 0, $tag_pos));
            }
        }
    }
    $config = HTMLPurifier_Config::createDefault();
    $config->set('Cache.DefinitionImpl', null);
    $purifier = new HTMLPurifier($config);
    $s = $purifier->purify($s);
    //	logger('purify_output: ' . $s);
    try {
        $dom = HTML5_Parser::parse($s);
    } catch (DOMException $e) {
        logger('parse_url: parse error: ' . $e);
    }
    if (!$dom) {
        echo sprintf($template, $safe_url, $safe_url, '') . $str_tags;
        killme();
    }
    foreach ($dom->getElementsByTagName('title') as $item) {
        $title = trim($item->textContent);
        break;
    }
    if (!$text) {
        // Prefer paragraphs inside a div whose class mentions "article" or "content" ...
        foreach ($dom->getElementsByTagName('div') as $div) {
            $class = $div->getAttribute('class');
            if ($class && (stristr($class, 'article') || stristr($class, 'content'))) {
                $text = parse_url_content_excerpt($div->getElementsByTagName('p'));
            }
            if ($text) {
                break;
            }
        }
        // ... and fall back to any paragraph of the document.
        if (!$text) {
            $text = parse_url_content_excerpt($dom->getElementsByTagName('p'));
        }
    }
    // The meta fallback needs the parsed <head>; skip it when there is none.
    if (!$text && $domhead) {
        logger('parsing meta');
        foreach ($domhead->getElementsByTagName('meta') as $item) {
            $property = $item->getAttribute('property');
            if (!$property) {
                continue;
            }
            if (stristr($property, ':description')) {
                $text = $item->getAttribute('content');
                if (stristr($text, '<script')) {
                    $text = '';
                    continue;
                }
                $text = strip_tags($text);
                $text = substr($text, 0, 250) . '...';
            }
            if (stristr($property, ':image')) {
                $image = $item->getAttribute('content');
                // Check the image value itself, not the description text.
                if (stristr($image, '<script')) {
                    $image = '';
                    continue;
                }
                $image = strip_tags($image);
                $i = fetch_url($image);
                if (!$i) {
                    // Nothing fetched: do not let the bare URL leak into the output.
                    $image = '';
                    continue;
                }
                require_once 'include/Photo.php';
                $ph = new Photo($i);
                if (!$ph->is_valid()) {
                    $image = '';
                    continue;
                }
                // Remote-supplied value going into a src attribute.
                $src = htmlspecialchars($image, ENT_QUOTES, 'UTF-8');
                if ($ph->getWidth() > 300 || $ph->getHeight() > 300) {
                    $ph->scaleImage(300);
                    $new_width = $ph->getWidth();
                    $new_height = $ph->getHeight();
                    $image = '<br /><br /><img height="' . $new_height . '" width="' . $new_width . '" src="' . $src . '" alt="photo" />';
                } else {
                    $image = '<br /><br /><img src="' . $src . '" alt="photo" />';
                }
            }
        }
    }
    if (strlen($text)) {
        $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
    }
    if ($image) {
        $text = $image . '<br />' . $text;
    }
    $title = str_replace(array("\r", "\n"), array('', ''), $title);
    $result = sprintf($template, $safe_url, $title ? $title : $safe_url, $text) . $str_tags;
    logger('parse_url: returns: ' . $result);
    echo $result;
    killme();
}

/**
 * Pick the first usable excerpt from a list of paragraph nodes.
 *
 * A paragraph is usable when its text contains no '<script' and, after
 * strip_tags(), is at least 100 bytes long. The excerpt is its first 250
 * bytes followed by '...'.
 *
 * @param DOMNodeList|array $paragraphs Nodes exposing a textContent property.
 * @return string The excerpt, or '' when no paragraph qualifies.
 */
function parse_url_content_excerpt($paragraphs)
{
    foreach ($paragraphs as $paragraph) {
        $candidate = $paragraph->textContent;
        if (stristr($candidate, '<script')) {
            continue;
        }
        $candidate = strip_tags($candidate);
        if (strlen($candidate) < 100) {
            continue;
        }
        return substr($candidate, 0, 250) . '...';
    }
    return '';
}
Example #17
0
 /**
  * Fetch a page and pull hCard profile fields out of it.
  *
  * Responses whose headers announce an Atom or RSS feed are not parsed.
  * Inside every element whose class contains 'vcard', descendants are
  * inspected for the classes 'fn', 'photo'/'avatar' and 'nickname'/'uid'.
  *
  * @param string $url Address of the page to scrape.
  * @return array Map with any of the keys 'fn', 'photo' and 'nick'; empty
  *               when the fetch fails, the response is a feed or the
  *               document cannot be parsed.
  */
 function scrape_vcard($url)
 {
     $a = get_app();
     $ret = array();
     logger('scrape_vcard: url=' . $url);
     $s = fetch_url($url);
     if (!$s) {
         return $ret;
     }
     $headers = $a->get_curl_headers();
     foreach (explode("\n", $headers) as $line) {
         // don't try and run feeds through the html5 parser
         $is_feed = stristr($line, 'application/atom+xml') || stristr($line, 'application/rss+xml');
         if (stristr($line, 'content-type:') && $is_feed) {
             // Was "return ret;", i.e. a bare constant instead of the result array.
             return $ret;
         }
     }
     // Stays null when the parser throws, so the check below is well defined.
     $dom = null;
     try {
         $dom = HTML5_Parser::parse($s);
     } catch (DOMException $e) {
         logger('scrape_vcard: parse error: ' . $e);
     }
     if (!$dom) {
         return $ret;
     }
     // Pull out hCard profile elements
     $largest_photo = 0;
     foreach ($dom->getElementsByTagName('*') as $item) {
         if (!attribute_contains($item->getAttribute('class'), 'vcard')) {
             continue;
         }
         foreach ($item->getElementsByTagName('*') as $x) {
             $class = $x->getAttribute('class');
             if (attribute_contains($class, 'fn')) {
                 $ret['fn'] = $x->textContent;
             }
             if (attribute_contains($class, 'photo') || attribute_contains($class, 'avatar')) {
                 // Keep the widest photo seen; the first one always wins over "none yet".
                 $size = intval($x->getAttribute('width'));
                 if ($size > $largest_photo || !$largest_photo) {
                     $ret['photo'] = $x->getAttribute('src');
                     $largest_photo = $size;
                 }
             }
             if (attribute_contains($class, 'nickname') || attribute_contains($class, 'uid')) {
                 $ret['nick'] = $x->textContent;
             }
         }
     }
     return $ret;
 }
Example #18
0
 /**
  * Create instance of Readability
  *
  * Applies the regexp pre-filters, optionally cleans the markup with Tidy,
  * converts it to HTML entities and parses it into $this->dom, using
  * html5lib when requested and successful and libxml otherwise.
  *
  * @param string UTF-8 encoded string
  * @param string (optional) URL associated with HTML (for footnotes)
  * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument
  * @param boolean (optional) Use tidy
  */
 function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true)
 {
     $this->url = $url;
     $this->debugText = 'Parsing URL: ' . $url . "\n";
     if ($url) {
         // Ask parse_url() for the host directly: indexing its array result
         // raises a notice when the URL has no host. Cast because the
         // component form yields null/false in that case.
         $host = (string) parse_url($url, PHP_URL_HOST);
         $this->domainRegExp = '/' . strtr(preg_replace('/www\\d*\\./', '', $host), array('.' => '\\.')) . '/';
     }
     mb_internal_encoding("UTF-8");
     mb_http_output("UTF-8");
     mb_regex_encoding("UTF-8");
     $this->imageCache = new ImageCaching();
     // HACK: dirty cleanup to replace some stuff; shouldn't use regexps with HTML but well...
     if (!$this->flagIsActive(self::FLAG_DISABLE_PREFILTER)) {
         try {
             foreach ($this->pre_filters as $search => $replace) {
                 $filtered = preg_replace($search, $replace, $html);
                 if (null === $filtered) {
                     // preg_replace() signals failure by returning null, not
                     // by throwing; keep the markup from before this filter.
                     $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $search . "\n";
                     continue;
                 }
                 $html = $filtered;
             }
             unset($search, $replace);
         } catch (Exception $e) {
             $this->debugText .= "Cleaning raw HTML failed. Ignoring: " . $e->getMessage();
         }
     }
     if (trim($html) === '') {
         $html = '<html></html>';
     }
     /**
      * Use tidy (if it exists).
      * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing.
      * Although sometimes it makes matters worse, which is why there is an option to disable it.
      *
      **/
     if ($use_tidy && function_exists('tidy_parse_string')) {
         $this->debugText .= 'Tidying document' . "\n";
         $tidy = tidy_parse_string($html, $this->tidy_config, 'UTF8');
         // tidy_parse_string() may return false; only repair a real document.
         if (false !== $tidy && tidy_clean_repair($tidy)) {
             $this->tidied = true;
             $html = $tidy->value;
             // Drop attributes from the <html> tag, then collapse line-break runs.
             $html = preg_replace('/<html[^>]+>/i', '<html>', $html);
             $html = preg_replace('/[\\r\\n]+/is', "\n", $html);
         }
         unset($tidy);
     }
     $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
     $parsedByHtml5lib = $parser == 'html5lib' && ($this->dom = HTML5_Parser::parse($html));
     if (!$parsedByHtml5lib) {
         // libxml fallback; internal error collection is left switched on, as before.
         libxml_use_internal_errors(true);
         $this->dom = new DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         @$this->dom->loadHTML($html, LIBXML_NOBLANKS | LIBXML_COMPACT | LIBXML_NOERROR);
     }
     $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 }
/**
 * Replace media elements in an article with the markup from cc_wordpress_figure().
 *
 * Every <img>, <audio>, <video> and <object> element is looked at; the
 * attachment id and size are read from its class attribute (for example
 * "wp-image-18" and "size-medium") and handed to cc_wordpress_figure(). When
 * that returns markup containing a <figure>, the element is replaced by it.
 *
 * @param string $article HTML of the article.
 * @return string Serialized content of the document's <body>, with the
 *                <body> tags stripped.
 */
function cc_wordpress_article_filter($article)
{
    require_once 'lib/html5lib/Parser.php';
    // sorry, but parseFragment() returns a DomNodeList, which is as inflexible as it gets
    $dom = HTML5_Parser::parse($article);
    $tagnames = array('img', 'audio', 'video', 'object');
    foreach ($tagnames as $tagname) {
        // getElementsByTagName() returns a live list; replacing nodes while
        // walking it skips elements, so iterate over a snapshot instead.
        $elements = iterator_to_array($dom->getElementsByTagName($tagname));
        foreach ($elements as $element) {
            $class = $element->getAttribute('class');
            // relevant class name example: wp-image-18
            $id = preg_match('/wp-(image|audio|video|object)-([0-9]*)/', $class, $id_match) ? $id_match[2] : null;
            // relevant class name example: size-medium
            // Stop at whitespace so class names after the size are not captured with it.
            $size = preg_match('/size-(\\S+)/', $class, $size_match) ? $size_match[1] : null;
            // TODO: make cc_wordpress_figure() take and return a DOM fragment
            $figure_html = cc_wordpress_figure($id, $size, false);
            // only replace node if we actually got something
            if (!$figure_html) {
                continue;
            }
            $container = HTML5_Parser::parseFragment($figure_html)->item(0);
            $figure = $container instanceof DOMElement ? $container->getElementsByTagName('figure')->item(0) : null;
            if (!$figure || !$element->parentNode) {
                // No <figure> in the returned markup, or nothing to replace it in.
                continue;
            }
            // a document context change is needed before appending the node
            $figure = $dom->importNode($figure, true);
            $element->parentNode->replaceChild($figure, $element);
        }
    }
    // hackish but reliable way to serialize the DOM
    // TODO: fix this mess
    $XML = $dom->saveXML($dom->getElementsByTagName('body')->item(0));
    $XML = str_replace('<body>', '', $XML);
    $XML = str_replace('</body>', '', $XML);
    // work around a bug regarding <style> elements including CSS '>' selectors
    $XML = str_replace('&gt;', '>', $XML);
    // work around the IE bug that some elements are serialized with a null namespace
    $XML = str_replace('embedNode.value = helperNode.innerHTML;', 'embedNode.value = helperNode.innerHTML.replace(/<:/g,"<").replace(/<.:/g,"</");', $XML);
    return $XML;
}