/**
 * Gets the specified meta info from the given post content.
 *
 * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which will give you
 * more/better image extraction. When IMAGES is requested here, images are only looked up
 * in the stripped post content.
 *
 * The result is built on top of $already_extracted. Each section that finds something adds
 * its own key ('mention', 'shortcode', 'shortcode_types', 'link', 'embed') plus a counter
 * under $extracted['has'][...].
 *
 * NOTE(review): earlier documentation mentioned an error / WP_Error result; no code path
 * in this method produces one.
 *
 * @param string $content           The HTML post_content of a post.
 * @param int    $what_to_extract   A mask of things to extract,
 *                                  e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS.
 * @param array  $already_extracted Previously extracted things, e.g. images from extract(),
 *                                  which can be used for x-referencing here.
 * @return array A structure containing metadata about the embedded things; when nothing is
 *               found this is $already_extracted (plus an empty 'link' list if LINKS was requested).
 */
public static function extract_from_content($content, $what_to_extract = self::ALL, $already_extracted = array()) {
    $stripped_content = self::get_stripped_content($content);

    // Maybe start with some previously extracted things (e.g. images from extract()).
    $extracted = $already_extracted;

    // Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

    // ----------------------------------- IMAGES ------------------------------
    if (self::IMAGES & $what_to_extract) {
        // The second argument is passed explicitly so the call does not depend on a default
        // being declared. NOTE(review): assumed to be the starting image list; confirm
        // against the extract_images_from_content() signature.
        $images    = Jetpack_Media_Meta_Extractor::extract_images_from_content($stripped_content, array());
        $extracted = array_merge($extracted, $images);
    }

    // ----------------------------------- MENTIONS ------------------------------
    if (self::MENTIONS & $what_to_extract) {
        if (preg_match_all('/(^|\\s)@(\\w+)/u', $stripped_content, $matches)) {
            // Lower-case first and de-duplicate afterwards, so "@Bob" and "@bob" count once.
            // array_unique() retains the keys, hence the array_values() re-index.
            $mentions = array_values(array_unique(array_map('strtolower', $matches[2])));

            $extracted['mention'] = array('name' => $mentions);
            if (!isset($extracted['has'])) {
                $extracted['has'] = array();
            }
            $extracted['has']['mention'] = count($mentions);
        }
    }

    // ----------------------------------- HASHTAGS ------------------------------
    /* Some hosts may not compile with --enable-unicode-properties and kick a warning
        Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
    if ( self::HASHTAGS & $what_to_extract ) {
        //This regex does not exactly match Twitter's
        // if there are problems/complaints we should implement this:
        //   https://github.com/twitter/twitter-text-java/blob/master/src/com/twitter/Regex.java
        if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
            $hashtags = array_values( array_unique( $matches[1] ) ); //array_unique() retains the keys!
            $hashtags = array_map( 'strtolower', $hashtags );
            $extracted['hashtag'] = array( 'name' => $hashtags );
            if ( !isset( $extracted['has'] ) )
                $extracted['has'] = array();
            $extracted['has']['hashtag'] = count( $hashtags );
        }
    }
    */

    // ----------------------------------- SHORTCODES ------------------------------
    // Always look for shortcodes.
    // If we don't want them, we'll just remove them, so we don't grab them as links below.
    $shortcode_pattern = '/' . get_shortcode_regex() . '/s';
    if (preg_match_all($shortcode_pattern, $content, $matches)) {
        $shortcode_total_count = 0;
        $shortcode_type_counts = array();
        $shortcode_types       = array();
        $shortcode_details     = array();

        if (self::SHORTCODES & $what_to_extract) {
            foreach ($matches[2] as $key => $shortcode) {
                // Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
                $shortcode_name = preg_replace('/[.,*"\'\\/\\\\#+ ]/', '_', $shortcode);

                $attr = shortcode_parse_atts($matches[3][$key]);

                $shortcode_total_count++;
                if (!isset($shortcode_type_counts[$shortcode_name])) {
                    $shortcode_type_counts[$shortcode_name] = 0;
                }
                $shortcode_type_counts[$shortcode_name]++;

                // Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below).
                // @todo Store number of occurrences?
                if (!in_array($shortcode_name, $shortcode_types, true)) {
                    $shortcode_types[] = $shortcode_name;
                }

                // For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
                if (in_array($shortcode, self::$KEEPER_SHORTCODES)) {
                    // Clear shortcode ID data left from the last shortcode.
                    $id = null;

                    // We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
                    // If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
                    $shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
                    $shortcode_class_name    = ucfirst($shortcode) . 'Shortcode';
                    $shortcode_get_id_method = "get_{$shortcode}_id";

                    if (function_exists($shortcode_get_id_func)) {
                        $id = call_user_func($shortcode_get_id_func, $attr);
                    } elseif (method_exists($shortcode_class_name, $shortcode_get_id_method)) {
                        $id = call_user_func(array($shortcode_class_name, $shortcode_get_id_method), $attr);
                    }

                    // Record each non-empty ID once per shortcode type.
                    if (
                        !empty($id)
                        && (!isset($shortcode_details[$shortcode_name]) || !in_array($id, $shortcode_details[$shortcode_name]))
                    ) {
                        $shortcode_details[$shortcode_name][] = $id;
                    }
                }
            }

            if ($shortcode_total_count > 0) {
                // Add the shortcode info to the $extracted array.
                if (!isset($extracted['has'])) {
                    $extracted['has'] = array();
                }
                $extracted['has']['shortcode'] = $shortcode_total_count;

                $extracted['shortcode'] = array();
                foreach ($shortcode_type_counts as $type => $count) {
                    $extracted['shortcode'][$type] = array('count' => $count);
                }
                if (!empty($shortcode_types)) {
                    $extracted['shortcode_types'] = $shortcode_types;
                }
                foreach ($shortcode_details as $type => $id) {
                    $extracted['shortcode'][$type]['id'] = $id;
                }
            }
        }

        // Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
        $content = preg_replace($shortcode_pattern, ' ', $content);
    }

    // ----------------------------------- LINKS ------------------------------
    if (self::LINKS & $what_to_extract) {
        // To hold the extracted stuff we find.
        $links = array();

        // @todo Get the text inside the links?

        // Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
        // (we treat embed links as just another link).
        if (preg_match_all('#(?:^|\\s|"|\')(https?://([^\\s()<>]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))))#', $content, $matches)) {
            foreach ($matches[1] as $link_raw) {
                $url = parse_url($link_raw);

                // parse_url() returns false on seriously malformed URLs and may omit the host;
                // such matches cannot be described below, so skip them.
                if (!is_array($url) || !isset($url['scheme'], $url['host'])) {
                    continue;
                }

                // Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those.
                $simple_url = $url['scheme'] . '://' . $url['host'] . (!empty($url['path']) ? $url['path'] : '');

                // NOTE(review): this only fires when image URLs are stored directly under
                // ['image']['url']; confirm that matches the shape the image extractors produce.
                if (isset($extracted['image']['url']) && in_array($simple_url, (array) $extracted['image']['url'])) {
                    continue;
                }

                // Limit the split to 2 pieces so a later "://" (e.g. a URL inside the query string)
                // does not truncate the stored link.
                list(, $link_all_but_proto) = explode('://', $link_raw, 2);

                // Build a reversed hostname (e.g. "www.example.com" => "com.example.www").
                // Compare against '' rather than using empty(), which would treat a "0" label as empty.
                $host_reversed = '';
                foreach (array_reverse(explode('.', $url['host'])) as $part) {
                    $host_reversed .= ('' !== $host_reversed ? '.' : '') . $part;
                }

                // @todo Check unique before adding
                $links[] = array(
                    'url'           => $link_all_but_proto,
                    'host_reversed' => $host_reversed,
                    'host'          => $url['host'],
                );
            }
        }

        $link_count        = count($links);
        $extracted['link'] = $links;
        if ($link_count) {
            if (!isset($extracted['has'])) {
                $extracted['has'] = array();
            }
            $extracted['has']['link'] = $link_count;
        }
    }

    // ----------------------------------- EMBEDS ------------------------------
    // Embeds are just individual links on their own line.
    if (self::EMBEDS & $what_to_extract) {
        if (!function_exists('_wp_oembed_get_object')) {
            include ABSPATH . WPINC . '/class-oembed.php';
        }

        // Get an oembed object.
        $oembed = _wp_oembed_get_object();

        // Grab any links on their own lines that may be embeds.
        if (preg_match_all('|^\\s*(https?://[^\\s"]+)\\s*$|im', $content, $matches)) {
            // To hold the extracted stuff we find.
            $embeds = array();

            foreach ($matches[1] as $link_raw) {
                // Limit of 2 keeps any later "://" inside the URL intact.
                list(, $link_all_but_proto) = explode('://', $link_raw, 2);

                // Check whether this "link" is really an embed.
                foreach ($oembed->providers as $matchmask => $data) {
                    // Second element flags whether the provider key is already a regex.
                    list(, $is_regex) = $data;

                    // Turn the asterisk-type provider URLs into regex.
                    if (!$is_regex) {
                        $matchmask = '#' . str_replace('___wildcard___', '(.+)', preg_quote(str_replace('*', '___wildcard___', $matchmask), '#')) . '#i';
                        $matchmask = preg_replace('|^#http\\\\://|', '#https?\\://', $matchmask);
                    }

                    if (preg_match($matchmask, $link_raw)) {
                        $embeds[] = $link_all_but_proto; // @todo Check unique before adding

                        // @todo Try to get ID's for the ones we care about (shortcode_keepers)
                        break;
                    }
                }
            }

            if (!empty($embeds)) {
                if (!isset($extracted['has'])) {
                    $extracted['has'] = array();
                }
                $extracted['has']['embed'] = count($embeds);

                $extracted['embed'] = array('url' => array());
                foreach ($embeds as $e) {
                    $extracted['embed']['url'][] = $e;
                }
            }
        }
    }

    return $extracted;
}
/**
 * Checks that extract_images_from_content() returns a struct with one 'image' entry
 * and a matching 'has' counter for content containing a single <img> tag.
 *
 * @author scotchfield
 * @covers Jetpack_Media_Meta_Extractor::extract_images_from_content
 * @since 3.2
 */
public function test_mediaextractor_extract_images_from_content_return_correct_image_struct() {
    $img_name = 'image.jpg';
    $content  = "<img src='{$img_name}'>";

    $image_struct = Jetpack_Media_Meta_Extractor::extract_images_from_content($content, array());

    // Plain is_array() check: works on every PHPUnit version, unlike assertInternalType()
    // (removed in PHPUnit 9) or assertIsArray() (added in 7.5).
    $this->assertTrue(is_array($image_struct));
    $this->assertArrayHasKey('has', $image_struct);
    $this->assertArrayHasKey('image', $image_struct);
    $this->assertCount(1, $image_struct['image']);

    // Expected value first, actual second, so failure messages read correctly.
    $this->assertEquals($img_name, $image_struct['image'][0]['url']);
    $this->assertEquals(1, $image_struct['has']['image']);
}