/**
 * Extract plaintext URLs (those outside anchors and tag attributes) from text.
 *
 * Anchor elements are removed first, since other parsers already handle them.
 * Line breaks are then inserted around the remaining tags and before shortcode
 * closing tags so that they act as URL boundaries once the tags are stripped.
 * Every regexp hit is passed through validate_url() before an instance is built.
 *
 * @param string $content The text to parse.
 * @param string $base_url The base URL. Ignored.
 * @param string $default_link_text Default link text. Not used here; the matched text becomes the link text.
 * @return array An array of new blcLinkInstance objects.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    //Drop <a href="http://...">http://...</a> entirely so the URL inside is not reported twice.
    $text = preg_replace('#<a[^>]*>.*?</a>#si', '', $content);

    //Put a newline next to every tag delimiter and before the closing tag of [shortcodes].
    $needles      = array('<', '>', '[/');
    $replacements = array("\n<", ">\n", "\n[/");
    $text = str_replace($needles, $replacements, $text);

    //With the boundaries in place the tags themselves can go.
    $text = strip_tags($text);

    $results = array();
    if (!preg_match_all($this->url_regexp, $text, $hits)) {
        return $results; //Nothing matched (or the regexp failed).
    }

    //Capture group 2 holds the URL text of each match.
    foreach ($hits[2] as $candidate) {
        $clean_url = $this->validate_url(trim($candidate));
        if (!$clean_url) {
            continue;
        }

        $instance = new blcLinkInstance();
        $instance->set_parser($this);
        $instance->raw_url   = $candidate;
        $instance->link_text = $candidate;
        $instance->set_link(new blcLink($clean_url)); //Creates or loads the link

        $results[] = $instance;
    }

    return $results;
}
/**
 * Find supported embed codes in a string and turn them into link instances.
 *
 * @param string $content The text to parse.
 * @param string $base_url The base URL. Ignored.
 * @param string $default_link_text Default link text. Ignored.
 * @return array An array of new blcLinkInstance objects. They describe the embeds found,
 *               but not the container entity the content came from.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    $results = array();

    //extract_embeds() returns the likely-looking <embed> elements.
    foreach ($this->extract_embeds($content) as $candidate) {
        $src = $candidate['attributes']['src'];

        //First-pass check: the SRC must contain this parser's search string.
        if (strpos($src, $this->url_search_string) === false) {
            continue;
        }

        //Map the SRC to the URL of the embedded object (may perform more complex verification).
        $target_url = $this->link_url_from_src($src);
        if (empty($target_url)) {
            continue;
        }

        $instance = new blcLinkInstance();
        $instance->set_parser($this);
        $instance->raw_url   = $candidate['embed_code'];
        $instance->link_text = '[' . $this->short_title . ']';
        $instance->set_link(new blcLink($target_url)); //Creates or loads the link

        $results[] = $instance;
    }

    return $results;
}
/**
 * Turn a single plaintext URL into a link instance.
 *
 * @param string $content The entire content is expected to be a single plaintext URL.
 * @param string $base_url Base URL handed to relative2absolute() when the URL has no scheme.
 * @param string $default_link_text Stored as the link text of the resulting instance.
 * @return array An array with one new blcLinkInstance, or an empty array if the URL is rejected.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    $raw_url = trim($content);
    $url     = $raw_url;

    //Reject anything parse_url() cannot handle.
    $parts = @parse_url($url);
    if (!$parts) {
        return array();
    }

    if (!isset($parts['scheme'])) {
        //No scheme - likely a relative URL. Turn it into an absolute one.
        $url = $this->relative2absolute($url, $base_url);

        //The resolved URL must be non-empty and at least 6 characters long.
        if (!$url || strlen($url) < 6) {
            return array();
        }
    }

    //The URL is acceptable - wrap it in a link instance.
    $instance = new blcLinkInstance();
    $instance->set_parser($this);
    $instance->raw_url   = $raw_url;
    $instance->link_text = $default_link_text;
    $instance->set_link(new blcLink($url)); //Creates or loads the link

    return array($instance);
}
/**
 * Collect the images (<img src="URL">) referenced in a piece of HTML.
 *
 * Each step of the URL clean-up is reported through the global $blclog logger.
 *
 * @param string $content The text to parse.
 * @param string $base_url Base URL handed to relative2absolute() for URLs without a scheme.
 * @param string $default_link_text Not used; image instances always get an empty link text.
 * @return array An array of new blcLinkInstance objects. They describe the links found,
 *               but not the container entity the content came from.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    global $blclog;

    //Normalise the one charset spelling that is rewritten here ("UTF8" becomes "UTF-8").
    $charset = get_bloginfo('charset');
    if (strtoupper($charset) === 'UTF8') {
        $charset = 'UTF-8';
    }
    $blclog->info('Blog charset is "' . $charset . '"');

    $results = array();

    //Images inside <code></code> blocks are not real images - blank those blocks out.
    $html = preg_replace('/<code[^>]*>.+?<\\/code>/si', ' ', $content);

    if (!preg_match_all($this->img_pattern, $html, $found_tags, PREG_SET_ORDER)) {
        return $results;
    }

    foreach ($found_tags as $tag) {
        //Capture group 3 is the SRC attribute value.
        $raw_url = $tag[3];
        $blclog->info('Found image. SRC attribute: "' . $raw_url . '"');

        //Decode & and other entities
        $url = html_entity_decode($raw_url, ENT_QUOTES, $charset);
        $blclog->info('Decoded image URL: "' . $url . '"');

        $url = trim($url);
        $blclog->info('Trimmed image URL: "' . $url . '"');

        //Image URLs may contain shortcodes - expand them.
        $url = do_shortcode($url);

        $parts = @parse_url($url);
        if (!$parts) {
            continue; //parse_url() rejected it
        }

        if (!isset($parts['scheme'])) {
            //No scheme - likely a relative URL. Turn it into an absolute one.
            $relativeUrl = $url;
            $url = $this->relative2absolute($url, $base_url);
            $blclog->info(sprintf('%s:%s Resolving relative URL. Relative URL = "%s", base URL = "%s", result = "%s"', __CLASS__, __FUNCTION__, $relativeUrl, $base_url, $url));
        }

        //The final URL must be non-empty and at least 6 characters long.
        if (!$url || strlen($url) < 6) {
            continue;
        }
        $blclog->info('Final URL: "' . $url . '"');

        $instance = new blcLinkInstance();
        $instance->set_parser($this);
        $instance->raw_url   = $raw_url;
        $instance->link_text = '';
        $instance->set_link(new blcLink($url)); //Creates or loads the link

        $results[] = $instance;
    }

    return $results;
}
/**
 * Parse a string for plaintext URLs.
 *
 * Anchor elements are removed first (other parsers handle them), line breaks are
 * inserted around the remaining tags and before shortcode closing tags so they act
 * as URL boundaries, and the tags are then stripped. Each regexp hit is sanitised
 * with esc_url_raw(), validated with filter_var() when available, and finally
 * required to have a host that contains a dot after its first character.
 *
 * @param string $content The text to parse.
 * @param string $base_url The base URL. Ignored.
 * @param string $default_link_text Default link text. Not used here; the matched text becomes the link text.
 * @return array An array of new blcLinkInstance objects.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    //Don't want to detect URLs inside links or tag attributes -
    //there are already other parsers for that.

    //Avoid <a href="http://...">http://...</a>
    $content = preg_replace('#<a[^>]*>.*?</a>#si', '', $content);

    //HTML tags are treated as natural boundaries for plaintext URLs
    //(since we strip tags, we must place another boundary char where they were).
    //The closing tag of [shortcodes] is also treated as a boundary.
    $content = str_replace(array('<', '>', '[/'), array("\n<", ">\n", "\n[/"), $content);

    //Finally, kill all tags.
    $content = strip_tags($content);

    //Find all URLs
    $found = preg_match_all($this->url_regexp, $content, $matches);

    $instances = array();
    if (!$found) {
        return $instances;
    }

    //Create a new instance for each match (capture group 2 is the URL text).
    foreach ($matches[2] as $match) {
        //Do a little bit of validation
        $url = esc_url_raw(trim($match));
        if (empty($url)) {
            continue;
        }

        //Note: filter_var() is no panacea as it accepts many invalid URLs.
        //FILTER_FLAG_HOST_REQUIRED is deliberately NOT passed: the constant was
        //deprecated in PHP 7.3 and removed in PHP 8.0 (referencing it there is a
        //fatal "Undefined constant" error), and FILTER_VALIDATE_URL already
        //requires a host on its own.
        if (function_exists('filter_var') && !filter_var($url, FILTER_VALIDATE_URL)) {
            continue;
        }

        //Require a host with a dot somewhere after its first character
        //(strpos() yields false for "no dot" and 0 for "leading dot" - both are rejected).
        $parts = @parse_url($url);
        if (empty($parts['host']) || !strpos($parts['host'], '.')) {
            continue;
        }

        //Create a new link instance.
        $instance = new blcLinkInstance();
        $instance->set_parser($this);
        $instance->raw_url = $match;
        $instance->link_text = $match;

        $link_obj = new blcLink($url); //Creates or loads the link
        $instance->set_link($link_obj);

        $instances[] = $instance;
    }

    return $instances;
}
/**
 * Parse a string for HTML images - <img src="URL">
 *
 * @param string $content The text to parse.
 * @param string $base_url Base URL handed to relative2absolute() for URLs without a scheme.
 * @param string $default_link_text Not used; image instances always get an empty link text.
 * @return array An array of new blcLinkInstance objects. The objects will include info about
 *               the links found, but not about the corresponding container entity.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    $instances = array();

    //Entities are decoded with the blog's charset and ENT_QUOTES instead of relying on
    //html_entity_decode() defaults, which differ between PHP releases (e.g. the pre-8.1
    //default ENT_COMPAT leaves &#039; encoded, producing a wrong URL).
    $charset = get_bloginfo('charset');
    if (strtoupper($charset) === 'UTF8') {
        $charset = 'UTF-8'; //Rewrite the hyphen-less spelling to the standard "UTF-8" name.
    }

    //remove all <code></code> blocks first
    $content = preg_replace('/<code[^>]*>.+?<\\/code>/si', ' ', $content);

    //Find images
    if (preg_match_all($this->img_pattern, $content, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $link) {
            //Capture group 3 is the SRC attribute value.
            $url = $raw_url = $link[3];

            //Decode & and other entities
            $url = html_entity_decode($url, ENT_QUOTES, $charset);
            $url = trim($url);

            //Allow shortcodes in image URLs.
            $url = do_shortcode($url);

            //Attempt to parse the URL
            $parts = @parse_url($url);
            if (!$parts) {
                continue; //Skip invalid URLs
            }

            if (!isset($parts['scheme'])) {
                //No scheme - likely a relative URL. Turn it into an absolute one.
                $url = $this->relative2absolute($url, $base_url);
            }

            //Skip invalid URLs (again)
            if (!$url || strlen($url) < 6) {
                continue;
            }

            //The URL is okay, create and populate a new link instance.
            $instance = new blcLinkInstance();
            $instance->set_parser($this);
            $instance->raw_url = $raw_url;
            $instance->link_text = '';

            $link_obj = new blcLink($url); //Creates or loads the link
            $instance->set_link($link_obj);

            $instances[] = $instance;
        }
    }

    return $instances;
}
/**
 * Parse one or more metadata values, each expected to hold a URL on its first line.
 *
 * An invalid value is skipped and parsing moves on to the next value. Previously the
 * method returned on the first invalid value and dropped every value after it.
 *
 * @param string|array $content Metadata value(s).
 * @param string $base_url Base URL handed to relative2absolute() for URLs without a scheme.
 * @param string $default_link_text Stored as the link text of every resulting instance.
 * @return array An array of new blcLinkInstance objects.
 */
function parse($content, $base_url = '', $default_link_text = '') {
    $instances = array();

    if (!is_array($content)) {
        $content = array($content);
    }

    foreach ($content as $value) {
        //The complete contents of the meta field are stored in raw_url.
        //This is useful for editing/unlinking, when one may need to
        //distinguish between multiple fields with the same name.
        $raw_url = $value;

        //If this is a multiline metadata field take only the first line
        //(workaround for the 'enclosure' field).
        $lines = explode("\n", $value);
        $url = trim(reset($lines));

        //Attempt to parse the URL
        $parts = @parse_url($url);
        if (!$parts) {
            continue; //Ignore this invalid URL but keep processing the remaining values
        }

        if (!isset($parts['scheme'])) {
            //No scheme - likely a relative URL. Turn it into an absolute one.
            $url = $this->relative2absolute($url, $base_url);

            //Skip invalid URLs (again)
            if (!$url || strlen($url) < 6) {
                continue;
            }
        }

        //The URL is okay, create and populate a new link instance.
        $instance = new blcLinkInstance();
        $instance->set_parser($this);
        $instance->raw_url = $raw_url;
        $instance->link_text = $default_link_text;

        $link_obj = new blcLink($url); //Creates or loads the link
        $instance->set_link($link_obj);

        $instances[] = $instance;
    }

    return $instances;
}
/**
 * blcHTMLLink::parser_callback()
 *
 * Converts one extracted link into a link instance, or rejects it.
 *
 * @access private
 *
 * @param array $link Link data; the 'href' and '#link_text' keys are read.
 * @param array $params Extra parameters; the 'base_url' key is read.
 * @return blcLinkInstance|null The new instance, or null when the URL is empty or invalid.
 */
function parser_callback($link, $params) {
    $base_url = $params['base_url'];
    $raw_url  = $link['href'];

    //Trim the HREF, then execute any shortcodes it may contain.
    $url = do_shortcode(trim($raw_url));

    //Nothing to check if the URL is empty.
    if (empty($url)) {
        return null;
    }

    //Reject anything parse_url() cannot handle.
    $parts = @parse_url($url);
    if (!$parts) {
        return null;
    }

    //No scheme - likely a relative URL. Turn it into an absolute one.
    if (!isset($parts['scheme'])) {
        $url = $this->relative2absolute($url, $base_url);
    }

    //The final URL must be non-empty and at least 6 characters long.
    $usable = $url && strlen($url) >= 6;
    if (!$usable) {
        return null;
    }

    //Only the plain text of the link is stored.
    $text = strip_tags($link['#link_text']);

    $instance = new blcLinkInstance();
    $instance->set_parser($this);
    $instance->raw_url   = $raw_url;
    $instance->link_text = $text;
    $instance->set_link(new blcLink($url)); //Creates or loads the link

    return $instance;
}
/**
 * blcHTMLLink::parser_callback()
 *
 * Converts one extracted link into a link instance, or rejects it. Each rejection
 * and each relative-URL conversion is reported through the global $blclog logger.
 *
 * @access private
 *
 * @param array $link Link data; the 'href' and '#link_text' keys are read.
 * @param array $params Extra parameters; the 'base_url' key is read.
 * @return blcLinkInstance|null The new instance, or null when the URL is empty or invalid.
 */
function parser_callback($link, $params) {
    global $blclog;

    //Every log message starts with "Class:function".
    $log_prefix = __CLASS__ . ':' . __FUNCTION__;

    $base_url = $params['base_url'];
    $raw_url  = $link['href'];

    //Trim the HREF, then execute any shortcodes it may contain.
    $url = do_shortcode(trim($raw_url));

    if (empty($url)) {
        $blclog->warn($log_prefix . ' Skipping the link (empty URL)');
        return null;
    }

    $parts = @parse_url($url);
    if (!$parts) {
        $blclog->warn($log_prefix . ' Skipping the link (parse_url failed)', $url);
        return null;
    }

    if (!isset($parts['scheme'])) {
        //No scheme - likely a relative URL. Turn it into an absolute one.
        $url = $this->relative2absolute($url, $base_url);
        $blclog->info($log_prefix . ' Convert relative URL to absolute. Absolute URL = "' . $url . '"');
    }

    //The URL must be non-empty and at least 6 characters long.
    $usable = $url && strlen($url) >= 6;
    if (!$usable) {
        $blclog->info($log_prefix . ' Skipping the link (invalid/short URL)', $url);
        return null;
    }

    //Remove left-to-right marks. See: https://en.wikipedia.org/wiki/Left-to-right_mark
    //(json_decode() is used to produce the U+200E character from its escape sequence.)
    $ltrm = json_decode('"\\u200E"');
    $url  = str_replace($ltrm, '', $url);

    //The link text is stored as-is.
    $text = $link['#link_text'];

    $instance = new blcLinkInstance();
    $instance->set_parser($this);
    $instance->raw_url   = $raw_url;
    $instance->link_text = $text;
    $instance->set_link(new blcLink($url)); //Creates or loads the link

    return $instance;
}