/**
 * Normalise a URL so that its scheme is either "http" or "https".
 *
 * The URL is split with crack_url() and reassembled with glue_url(); both are
 * defined outside this block (presumably parse_url()-style helpers -- confirm
 * against their definitions).
 *
 * @param string $url URL to normalise
 * @return mixed whatever glue_url() returns for the adjusted components
 */
function clean_url($url)
{
    $bits = crack_url($url);

    // A URL without a scheme is treated like any other non-https URL; the
    // isset() guard avoids an undefined-index warning on such input.
    $scheme = isset($bits['scheme']) ? strtolower($bits['scheme']) : '';

    // force http (but allow https too)
    if ($scheme != 'https') {
        $bits['scheme'] = 'http';
    }

    return glue_url($bits);
}
/**
 * Rebuild a URL with extra or overriding query-string parameters.
 *
 * @param string|false $url URL to modify; a falsy value means the current
 *                          request URI ($_SERVER["REQUEST_URI"])
 * @param string|array $qs  parameters to merge in, either as a query string
 *                          ("a=1&b=2") or as an associative array; these win
 *                          over parameters already present in $url
 * @return string the rebuilt URL, or the input URL when glue_url() (defined
 *                outside this block) yields an empty result
 */
function buildurl($url = false, $qs = '')
{
    $url or $url = $_SERVER["REQUEST_URI"];

    $urlA = parse_url($url);
    if (!is_array($urlA)) {
        // parse_url() returns false for seriously malformed URLs
        $urlA = array();
    }

    // Existing parameters; a URL without "?..." simply has none.
    $query1 = array();
    if (isset($urlA['query'])) {
        parse_str($urlA['query'], $query1);
    }

    // Overrides, accepted as an array or as a query string.
    if (is_array($qs)) {
        $query2 = $qs;
    } else {
        $query2 = array();
        parse_str((string) $qs, $query2);
    }

    // array_replace() keeps numeric parameter names intact, whereas
    // array_merge() would renumber them ("1=a" would become "0=a").
    $query = array_replace($query1, $query2);

    $urlA['query'] = http_build_query($query);
    $nurl = glue_url($urlA);

    return $nurl ? $nurl : $url;
}
/**
 * Resolve a (possibly relative) URL reference against a base URL.
 *
 * The "Step N" comments are the original author's labels for the stages of
 * the resolution algorithm. Reassembly is delegated to unparse_url() and
 * glue_url(), both defined outside this block.
 *
 * Behaviour:
 *  - empty $base returns $url; empty $url returns $base;
 *  - a $url that starts with a scheme ("xyz:") is returned untouched;
 *  - "#frag" replaces only the fragment of $base;
 *  - "//host/path" keeps only the scheme of $base;
 *  - "/path" replaces the whole path of $base;
 *  - anything else is merged with the directory part of the base path, with
 *    "." and ".." segments collapsed. For a rooted base path, ".." segments
 *    that would climb above the root are dropped.
 *
 * @param string $base base URL
 * @param string $url  URL reference to resolve
 * @return string resolved URL
 * @author adrian-php at sixfingeredman dot net
 */
function resolve_url($base, $url)
{
    if (!strlen($base)) {
        return $url;
    }
    // Step 2
    if (!strlen($url)) {
        return $base;
    }
    // Step 3
    if (preg_match('!^[a-z]+:!i', $url)) {
        return $url;
    }

    $base = parse_url($base);
    if (!is_array($base)) {
        // parse_url() returns false for seriously malformed URLs
        $base = array();
    }

    if ($url[0] == "#") {
        // Step 2 (fragment)
        $base['fragment'] = substr($url, 1);
        return unparse_url($base);
    }
    unset($base['fragment']);
    unset($base['query']);

    if (substr($url, 0, 2) == "//") {
        // Step 4: network-path reference, only the base scheme is kept
        return unparse_url(array(
            'scheme' => isset($base['scheme']) ? $base['scheme'] : null,
            'path' => substr($url, 2),
        ));
    }

    if ($url[0] == "/") {
        // Step 5
        $base['path'] = $url;
        // Step 7
        return glue_url($base);
    }

    // Step 6
    $base_path = isset($base['path']) ? $base['path'] : '';
    if ($base_path === '' && isset($base['host'])) {
        // "http://host" has an empty path; merging must start from "/" or the
        // result would lose the slash between the host and the path.
        $base_path = '/';
    }
    $rooted = ($base_path !== '' && $base_path[0] == '/');

    $path = explode('/', $base_path);
    $url_path = explode('/', $url);

    // Step 6a: drop file from base
    array_pop($path);

    // Step 6b, 6c, 6e: append url while removing "." and ".." from
    // the directory portion
    $end = array_pop($url_path);
    foreach ($url_path as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            if ($rooted && sizeof($path) <= 1) {
                // Already at the root: never pop the leading empty segment,
                // it is what produces the path's leading "/".
                continue;
            }
            if ($path && $path[sizeof($path) - 1] != '..') {
                array_pop($path);
                continue;
            }
        }
        $path[] = $segment;
    }

    // Step 6d, 6f: remove "." and ".." from file portion
    if ($end === '.') {
        $path[] = '';
    } elseif ($end === '..' && $rooted && sizeof($path) <= 1) {
        // ".." at the root resolves to the root itself ("/")
        $path[] = '';
    } elseif ($end === '..' && $path && $path[sizeof($path) - 1] != '..') {
        $path[sizeof($path) - 1] = '';
    } else {
        $path[] = $end;
    }

    // Step 6h
    $base['path'] = join('/', $path);

    // Step 7
    return glue_url($base);
}
/**
 * Mask the password component of a proxy URL so it can be shown or logged.
 *
 * @param string $http_proxy proxy URL, e.g. "http://user:secret@host:8080"
 * @return mixed the URL rebuilt by glue_url() (defined outside this block)
 *               with the password replaced by "******", or the input
 *               unchanged when it cannot be parsed or has no password
 */
function no_password_proxy_url($http_proxy)
{
    // "@" kept from the original: older PHP versions emit a warning from
    // parse_url() on malformed input, and this helper is best-effort.
    $p = @parse_url($http_proxy);

    // Compare against '' rather than testing truthiness, so that a password
    // of "0" is masked as well.
    if (is_array($p) && isset($p['pass']) && $p['pass'] !== '') {
        $p['pass'] = '******';
        $http_proxy = glue_url($p);
    }

    return $http_proxy;
}
/**
 * Get PukiWiki Page Source via http using cmd=source
 *
 * The page name is taken from the "page" query parameter of $url, or from
 * the whole (raw-url-decoded) query string when there is no such parameter.
 * The query is then rewritten to cmd=source&page=PAGE and fetched with
 * http_get_contents() using $GLOBALS['USERNAME'] / $GLOBALS['USERPASS'].
 * glue_str(), glue_url() and http_get_contents() are defined outside this
 * block.
 *
 * @access public
 * @param string $url Page URL
 * @return mixed PukiWiki Page Source. FALSE if HTTP GET failed.
 * @uses PKWKSourceHandler
 * @uses PEAR XML/XML_HTMLSax.php
 */
function &pkwk_get_source($url)
{
    // pkwk source (cmd=source&page=PAGE)
    $parsed = parse_url($url);
    if (!is_array($parsed)) {
        // parse_url() returns false for seriously malformed URLs
        $parsed = array();
    }

    // A URL without "?..." has no query component at all.
    $raw_query = isset($parsed['query']) ? $parsed['query'] : '';

    $queries = array();
    parse_str($raw_query, $queries);
    // rawurldecode
    $page = isset($queries['page']) ? $queries['page'] : rawurldecode($raw_query);

    $parsed['query'] = glue_str(array('cmd' => 'source', 'page' => $page));
    $url = glue_url($parsed);

    $html = http_get_contents($url, $GLOBALS['USERNAME'], $GLOBALS['USERPASS']);
    if ($html === FALSE) {
        // This function returns by reference, so the failure value has to be
        // held in a variable; returning the literal raises a notice.
        $failed = FALSE;
        return $failed;
    }

    require_once 'XML/XML_HTMLSax.php';
    $parser = new XML_HTMLSax();
    $handler = new PKWKSourceHandler();
    $parser->set_object($handler);
    $parser->set_element_handler('openHandler', 'closeHandler');
    $parser->set_data_handler('dataHandler');
    $parser->parse($html);

    return $handler->source;
}
/**
 * Build a cleaned-up article from a WordPress post via its single-post feed.
 *
 * The post's RSS 2 feed is requested (feed=rss2&withoutcomments=1). When the
 * feed has no <content:encoded> element, or the feed or its content cannot be
 * parsed, the work is handed to fallback_parser(). Otherwise the content is
 * stripped of tracking and sharing elements and of every attribute except
 * href and src, and wrapped in an article structure.
 *
 * get_source_width_curl(), fallback_parser(), glue_url() and str_get_html()
 * are defined outside this block.
 *
 * @param string $url         URL of the post
 * @param string $page_source page source, passed through to fallback_parser()
 * @return mixed HTML for the article, or whatever fallback_parser() returns
 */
function parse_wordpress($url, $page_source)
{
    // feed url
    $url_parsed = parse_url($url);
    if (!is_array($url_parsed)) {
        // parse_url() returns false for seriously malformed URLs
        $url_parsed = array();
    }
    // Both branches ask for the RSS 2 feed: the content check below looks
    // for <content:encoded>.
    if (isset($url_parsed['query'])) {
        $url_parsed['query'] .= '&feed=rss2&withoutcomments=1';
    } else {
        $url_parsed['query'] = 'feed=rss2&withoutcomments=1';
    }
    $single_post_feed_url = glue_url($url_parsed);

    // get feed source
    $single_post_feed = get_source_width_curl($single_post_feed_url);
    $single_post_feed_source = isset($single_post_feed["content"])
        ? (string) $single_post_feed["content"]
        : '';

    // fallback parser if not full content in single post feed
    if (!stristr($single_post_feed_source, "<content:encoded>")) {
        return fallback_parser($url, $page_source);
    }

    // content
    $xml_parsed = simplexml_load_string($single_post_feed_source, null, LIBXML_NOCDATA);
    if ($xml_parsed === false) {
        // feed is not well-formed XML
        return fallback_parser($url, $page_source);
    }
    $link = $xml_parsed->channel->item->link;
    $h1 = $xml_parsed->channel->item->title;
    $title = $xml_parsed->channel->title;
    $content_decoded = $xml_parsed->channel->item->children('content', true)->encoded;

    // parse content
    $content = str_get_html((string) $content_decoded[0]);
    if (!$content) {
        // content could not be parsed into a DOM
        return fallback_parser($url, $page_source);
    }

    // remove facebook iframes
    foreach ($content->find("iframe") as $iframe) {
        if (strstr((string) $iframe->src, 'facebook.com')) {
            $iframe->outertext = '';
        }
    }

    // remove flattr-links and wordpress.com feeds
    foreach ($content->find("a") as $a) {
        $href = (string) $a->href;
        if (strstr($href, 'flattrss_redirect') || strstr($href, 'feeds.wordpress.com')) {
            $a->outertext = '';
        }
    }

    // loop through elements and do stuff
    // A named function declared inside a function is (re)declared every time
    // the enclosing function runs; the guard stops a second call to
    // parse_wordpress() from dying with "Cannot redeclare do_stuff()".
    if (!function_exists('do_stuff')) {
        function do_stuff($element)
        {
            // remove all attributes, except href and src
            foreach ($element->attr as $name => $attr) {
                if ($name != 'src' && $name != 'href') {
                    $element->removeAttribute($name);
                }
            }

            // these tags are kept as they are, all others are converted to p
            $kept_tags = array(
                'img', 'span', 'a', 'li', 'ul', 'ol', 'object', 'embed',
                'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                'i', 'em', 'b', 'strong', 'blockquote',
            );
            if (!in_array($element->tag, $kept_tags, true)) {
                $element->tag = 'p';
            }

            // remove p:s without innertext
            if ($element->tag == 'p' && $element->innertext == null) {
                $element->outertext = '';
            }

            // remove a:s without innertext
            if ($element->tag == 'a' && $element->innertext == null) {
                $element->outertext = '';
            }

            // remove wordpress.com stats
            if (strstr((string) $element->src, 'stats.wordpress.com')) {
                $element->outertext = '';
            }
        }
    }
    $content->set_callback('do_stuff');
    $content = $content->save();

    // no indent before image, if first in p
    $content = str_replace('<p><a', '<p> <a', $content);
    $content = str_replace('<p><img', '<p> <img', $content);

    // Title and link come from the remote feed as plain text and must be
    // escaped before they are placed into markup.
    $h1_html = htmlspecialchars((string) $h1, ENT_QUOTES, 'UTF-8');
    $title_html = htmlspecialchars((string) $title, ENT_QUOTES, 'UTF-8');
    $link_html = htmlspecialchars((string) $link, ENT_QUOTES, 'UTF-8');

    // wrap in article structure
    return '<div class="article"><h1>' . $h1_html . '</h1>' . $content
        . '<address><a href="' . $link_html . '">' . $title_html . '</a></address></div>';
}
/**
 * Get absolute URL
 *
 * PHP Extension
 *
 * Resolves $url against $base:
 *  - empty $base returns $url; empty $url returns $base;
 *  - a $url that starts with a scheme ("xyz:") is returned untouched;
 *  - "#frag" replaces only the fragment of $base;
 *  - "//host/path" keeps only the scheme of $base;
 *  - "/path" replaces the whole path of $base;
 *  - anything else is merged with the directory part of the base path, with
 *    "." and ".." segments collapsed. For a rooted base path, ".." segments
 *    that would climb above the root are dropped.
 *
 * @access public
 * @param string $base base url
 * @param string $url relative url
 * @return string absolute url
 * @see parse_url()
 * @see realpath()
 * @uses glue_url()
 */
function realurl($base, $url)
{
    if (!strlen($base)) {
        return $url;
    }
    if (!strlen($url)) {
        return $base;
    }
    if (preg_match('!^[a-z]+:!i', $url)) {
        return $url;
    }

    $base = parse_url($base);
    if (!is_array($base)) {
        // parse_url() returns false for seriously malformed URLs
        $base = array();
    }

    if ($url[0] == "#") {
        // fragment
        $base['fragment'] = substr($url, 1);
        return glue_url($base);
    }
    unset($base['fragment']);
    unset($base['query']);

    if (substr($url, 0, 2) == "//") {
        // FQDN: only the base scheme is kept
        return glue_url(array(
            'scheme' => isset($base['scheme']) ? $base['scheme'] : null,
            'path' => substr($url, 2),
        ));
    }

    if ($url[0] == "/") {
        // absolute path reference
        $base['path'] = $url;
        return glue_url($base);
    }

    // relative path reference
    $base_path = isset($base['path']) ? $base['path'] : '';
    if ($base_path === '' && isset($base['host'])) {
        // "http://host" has an empty path; merging must start from "/" or the
        // result would lose the slash between the host and the path.
        $base_path = '/';
    }
    $rooted = ($base_path !== '' && $base_path[0] == '/');

    $path = explode('/', $base_path);
    $url_path = explode('/', $url);

    // drop file from base
    array_pop($path);

    // append url while removing "." and ".." from
    // the directory portion
    $end = array_pop($url_path);
    foreach ($url_path as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            if ($rooted && sizeof($path) <= 1) {
                // Already at the root: never pop the leading empty segment,
                // it is what produces the path's leading "/".
                continue;
            }
            if ($path && $path[sizeof($path) - 1] != '..') {
                array_pop($path);
                continue;
            }
        }
        $path[] = $segment;
    }

    // remove "." and ".." from file portion
    if ($end === '.') {
        $path[] = '';
    } elseif ($end === '..' && $rooted && sizeof($path) <= 1) {
        // ".." at the root resolves to the root itself ("/")
        $path[] = '';
    } elseif ($end === '..' && $path && $path[sizeof($path) - 1] != '..') {
        $path[sizeof($path) - 1] = '';
    } else {
        $path[] = $end;
    }
    $base['path'] = join('/', $path);

    return glue_url($base);
}