/**
 * Index posts stored in $this->unindexedPosts
 *
 * For every queued post this method:
 *  - bumps a per-post attempt counter and flags the post to be skipped once the
 *    (filterable) maximum number of attempts has been exceeded,
 *  - on a first pass extracts document content (PDF / plain text attachments) into the
 *    SEARCHWP_PREFIX . 'content' meta key, then collects terms from the title, slug,
 *    content, excerpt, comments, taxonomies, custom fields and any extra metadata,
 *  - on a subsequent pass resumes from the term list cached in post meta,
 *  - records at most one chunk of terms per pass, caching the remainder in post meta, and
 *    only flags the post as indexed once no terms remain.
 *
 * @since 1.0
 */
function index() {
	global $wp_filesystem, $searchwp;

	$this->check_for_parallel_indexer();

	if (is_array($this->unindexedPosts) && count($this->unindexedPosts)) {

		do_action('searchwp_indexer_pre_chunk', $this->unindexedPosts);

		// all of the IDs to index have not been indexed, proceed with indexing them
		while (($unindexedPost = current($this->unindexedPosts)) !== false) {
			$this->setPost($unindexedPost);

			// log the attempt
			$count = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', true);
			if ($count == false) {
				$count = 0;
			} else {
				$count = intval($count);
			}

			$count++;

			// increment our counter to prevent the indexer getting stuck on a gigantic PDF
			update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', $count);
			do_action('searchwp_log', 'Attempt ' . $count . ' at indexing ' . $this->post->ID);

			// if we breached the maximum number of attempts, flag it to skip
			$this->maxAttemptsToIndex = absint(apply_filters('searchwp_max_index_attempts', $this->maxAttemptsToIndex));
			if (intval($count) > $this->maxAttemptsToIndex) {
				do_action('searchwp_log', 'Too many indexing attempts on ' . $this->post->ID . ' (' . $this->maxAttemptsToIndex . ') - skipping');

				// flag it to be skipped
				update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true);
			} else {
				// check to see if we're running a second pass on terms
				$termCache = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', true);

				if (!is_array($termCache)) {
					do_action('searchwp_index_post', $this->post);

					// if it's an attachment, we want the permalink
					$slug = $this->post->post_type == 'attachment' ? str_replace(get_bloginfo('wpurl'), '', get_permalink($this->post->ID)) : '';

					// we allow users to override the extracted content from documents, if they have done so this flag is set
					$skipDocProcessing = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip_doc_processing', true);
					$omitDocProcessing = apply_filters('searchwp_omit_document_processing', false);

					if (!$skipDocProcessing && !$omitDocProcessing) {
						if ($this->post->post_mime_type == 'application/pdf') {
							// if it's a PDF we need to populate our Custom Field with its content

							// grab the filename of the PDF
							$filename = get_attached_file($this->post->ID);

							// allow for external PDF content extraction
							$pdfContent = apply_filters('searchwp_external_pdf_processing', '', $filename, $this->post->ID);

							// only try to extract content if the external processing has not provided the PDF content we're looking for
							if (empty($pdfContent)) {
								// PdfParser runs only on 5.3+ but SearchWP runs on 5.2+
								if (version_compare(PHP_VERSION, '5.3', '>=')) {
									include_once $searchwp->dir . '/vendor/pdfparser-bootloader.php';
								}

								// a wrapper class was conditionally included if we're running PHP 5.3+ so let's try that
								if (class_exists('SearchWP_PdfParser')) {
									// try PdfParser first; a parse failure must not abort the whole chunk,
									// so log it and let the fallbacks below have a go
									try {
										$parser = new SearchWP_PdfParser();
										$parser = $parser->init();
										$pdf = $parser->parseFile($filename);
										$text = $pdf->getText();
										$pdfContent = trim(str_replace("\n", " ", $text));
									} catch (Exception $e) {
										do_action('searchwp_log', 'PDF parsing failed: ' . $e->getMessage());
										$pdfContent = '';
									}
								}

								// try PDF2Text
								if (empty($pdfContent)) {
									if (!class_exists('PDF2Text')) {
										include_once $searchwp->dir . '/includes/class.pdf2text.php';
									}
									$pdfParser = new PDF2Text();
									$pdfParser->setFilename($filename);
									$pdfParser->decodePDF();
									$pdfContent = $pdfParser->output();
									$pdfContent = trim(str_replace("\n", " ", $pdfContent));
								}

								// check to see if the first pass produced nothing or concatenated strings
								// (fewer than 10% spaces); the ratio is only computed for non-empty content
								$contentLooksBroken = empty($pdfContent);
								if (!$contentLooksBroken) {
									$contentLooksBroken = (substr_count($pdfContent, ' ') / strlen($pdfContent) * 100) < 10;
								}

								if ($contentLooksBroken) {
									WP_Filesystem();

									// WP_Filesystem() may fail to set up the global, in which case there is nothing to read
									$filecontent = '';
									if (is_object($wp_filesystem) && $wp_filesystem->exists($filename)) {
										$filecontent = $wp_filesystem->get_contents($filename);
									}

									if (false !== strpos($filecontent, 'trailer')) {
										if (!class_exists('pdf_readstream')) {
											include_once $searchwp->dir . '/includes/class.pdfreadstream.php';
										}
										$pdfContent = '';
										$pdf = new pdf($filename);
										$pages = $pdf->get_pages();
										if (!empty($pages)) {
											// foreach instead of each(), which was removed in PHP 8
											foreach ($pages as $page) {
												if (is_object($page) && method_exists($page, 'get_text')) {
													$pdfContent .= $page->get_text();
												}
											}
										}
									} else {
										// empty out the content so wacky concatenations are not indexed
										$pdfContent = '';

										// flag it for further review
										update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'review', true);
										update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true);
									}
								}
							}

							$pdfContent = trim($pdfContent);

							if (!empty($pdfContent)) {
								$pdfContent = sanitize_text_field($pdfContent);
								delete_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content');
								update_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content', $pdfContent);
							}
						} elseif ($this->post->post_mime_type == 'text/plain') {
							// if it's plain text, index its content
							WP_Filesystem();
							$filename = get_attached_file($this->post->ID);
							$textContent = (is_object($wp_filesystem) && $wp_filesystem->exists($filename)) ? $wp_filesystem->get_contents($filename) : '';
							$textContent = str_replace("\n", " ", $textContent);
							if (!empty($textContent)) {
								$textContent = sanitize_text_field($textContent);
								update_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content', $textContent);
							}
						} else {
							// all other file types
						}
					}

					$postTerms = array();
					$postTerms['title'] = $this->indexTitle();
					$postTerms['slug'] = $this->indexSlug(str_replace('/', ' ', $slug));
					$postTerms['content'] = $this->indexContent();
					$postTerms['excerpt'] = $this->indexExcerpt();

					if (apply_filters('searchwp_index_comments', true)) {
						$postTerms['comments'] = $this->indexComments();
					}

					// index taxonomies
					$taxonomies = get_object_taxonomies($this->post->post_type);
					if (!empty($taxonomies)) {
						foreach ($taxonomies as $taxonomy) {
							$terms = get_the_terms($this->post->ID, $taxonomy);
							// get_the_terms() can hand back a WP_Error, which is not a term list
							if (!empty($terms) && !is_wp_error($terms)) {
								$postTerms['taxonomy'][$taxonomy] = $this->indexTaxonomyTerms($taxonomy, $terms);
							}
						}
					}

					// index custom fields
					$customFields = apply_filters('searchwp_get_custom_fields', get_post_custom($this->post->ID), $this->post->ID);
					if (!empty($customFields) && is_array($customFields)) {
						// the exclusion lists do not depend on the field being processed, so build them once per post

						// there are a few useless (when it comes to search) WordPress core custom fields, so let's exclude them by default
						$omitWpMetadata = apply_filters('searchwp_omit_wp_metadata', array('_edit_lock', '_wp_page_template', '_edit_last', '_wp_old_slug'));

						$excludedCustomFieldKeys = apply_filters('searchwp_excluded_custom_fields', array('_' . SEARCHWP_PREFIX . 'indexed', '_' . SEARCHWP_PREFIX . 'attempts', '_' . SEARCHWP_PREFIX . 'terms', '_' . SEARCHWP_PREFIX . 'last_index', '_' . SEARCHWP_PREFIX . 'skip', '_' . SEARCHWP_PREFIX . 'skip_doc_processing', '_' . SEARCHWP_PREFIX . 'review'));

						// merge the two arrays of keys if possible
						if (is_array($omitWpMetadata) && is_array($excludedCustomFieldKeys)) {
							$excluded_meta_keys = array_merge($omitWpMetadata, $excludedCustomFieldKeys);
						} elseif (is_array($omitWpMetadata)) {
							$excluded_meta_keys = $omitWpMetadata;
						} else {
							$excluded_meta_keys = $excludedCustomFieldKeys;
						}

						$excluded_meta_keys = is_array($excluded_meta_keys) ? array_unique($excluded_meta_keys) : array();

						// foreach (rather than current()/next()) so a field whose value is false does not end the loop early
						foreach ($customFields as $customFieldName => $customFieldValue) {
							// allow developers to conditionally omit specific custom fields
							$omit_this_custom_field = apply_filters("searchwp_omit_meta_key", false, $customFieldName, $this->post);
							$omit_this_custom_field = apply_filters("searchwp_omit_meta_key_{$customFieldName}", $omit_this_custom_field, $this->post);

							if (!in_array($customFieldName, $excluded_meta_keys) && !$omit_this_custom_field) {
								// allow devs to swap out their own content
								// e.g. parsing ACF Relationship fields (that store only post IDs) to actually retrieve that content at runtime
								$customFieldValue = apply_filters('searchwp_custom_fields', $customFieldValue, $customFieldName, $this->post);
								$customFieldValue = apply_filters("searchwp_custom_field_{$customFieldName}", $customFieldValue, $this->post);
								$postTerms['customfield'][$customFieldName] = $this->indexCustomField($customFieldName, $customFieldValue);
							}
						}
					}

					// allow developer to store arbitrary information a la Custom Fields (without them actually being Custom Fields)
					$extraMetadata = apply_filters("searchwp_extra_metadata", false, $this->post);
					if ($extraMetadata) {
						if (is_array($extraMetadata)) {
							foreach ($extraMetadata as $extraMetadataKey => $extraMetadataValue) {
								// TODO: make sure there are no collisions with real custom field keys?
								$postTerms['customfield'][$extraMetadataKey] = $this->indexCustomField($extraMetadataKey, $extraMetadataValue);
							}
						}
					}

					// we need to break out the terms from all of this content
					$termCountBreakout = array();

					if (is_array($postTerms) && count($postTerms)) {
						foreach ($postTerms as $type => $terms) {
							switch ($type) {
								case 'title':
								case 'slug':
								case 'content':
								case 'excerpt':
								case 'comments':
									if (is_array($terms) && count($terms)) {
										foreach ($terms as $term) {
											$termCountBreakout[$term['term']][$type] = $term['count'];
										}
									}
									break;

								case 'taxonomy':
								case 'customfield':
									if (is_array($terms) && count($terms)) {
										foreach ($terms as $name => $nameTerms) {
											if (is_array($nameTerms) && count($nameTerms)) {
												foreach ($nameTerms as $nameTerm) {
													$termCountBreakout[$nameTerm['term']][$type][$name] = $nameTerm['count'];
												}
											}
										}
									}
									break;
							}
						}
					}
				} else {
					$termCountBreakout = $termCache;

					// if there was a term cache, this repeated processing doesn't count, so decrement it
					delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts');
					delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip');
				}

				// unless the term chunk limit says otherwise, we're going to flag this as being OK to log as indexed
				$flagAsIndexed = true;

				// we now have a multidimensional array of terms with counts per type in $termCountBreakout
				// if the term count is huge, we need to split up this process so as to avoid
				// hitting upper PHP execution time limits (term insertion is heavy), so we'll chunk the array of terms
				$termChunkMax = 500;

				// try to set a better default based on php.ini's memory_limit
				$memoryLimit = ini_get('memory_limit');
				if (preg_match('/^(\\d+)(.)$/', $memoryLimit, $matches)) {
					// php.ini accepts the unit suffix in either case
					$memoryUnit = strtoupper($matches[2]);
					if ($memoryUnit == 'M') {
						$termChunkMax = (int) $matches[1] * 15; // 15 terms per MB RAM
					} elseif ($memoryUnit == 'G') {
						$termChunkMax = (int) $matches[1] * 1024 * 15; // same ratio, limit given in GB
					} else {
						// memory was set in K...
						$termChunkMax = 100;
					}
				}

				// a limit below 1 would mean no term is ever recorded and the post never completes
				$termChunkLimit = max(1, (int) apply_filters('searchwp_process_term_limit', $termChunkMax));

				$totalTerms = count($termCountBreakout);
				if ($totalTerms > $termChunkLimit) {
					// we haven't pulled all of the terms, so we can't consider this post indexed...
					$flagAsIndexed = false;

					// save everything after this chunk so we don't have to break the terms out again;
					// keys are the terms themselves, so they must be preserved (numeric terms are integer keys
					// which array_slice() would otherwise renumber)
					$remainingTerms = array_slice($termCountBreakout, $termChunkLimit, $totalTerms - $termChunkLimit, true);
					update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', $remainingTerms);

					// set the acceptable breakout as the main breakout
					$termCountBreakout = array_slice($termCountBreakout, 0, $termChunkLimit, true);
				}

				$this->recordPostTerms($termCountBreakout);
				unset($termCountBreakout);

				// flag the post as indexed
				if ($flagAsIndexed) {
					// clean up our stored term array if necessary
					if ($termCache) {
						delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms');
					}

					// clean up the attempt counter
					delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts');
					delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip');

					update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'indexed', true);
					update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'last_index', current_time('timestamp'));
				}
			}

			next($this->unindexedPosts);
		}
		reset($this->unindexedPosts);

		do_action('searchwp_indexer_post_chunk');
	}
}
/**
 * Extract plain text from PDF
 *
 * Up to three extractors are tried in order: the bundled PdfParser wrapper (PHP 5.3+ only),
 * PDF2Text, and finally pdf_readstream when the earlier output is empty or contains fewer
 * than 10% spaces (i.e. looks like words concatenated together).
 *
 * @since 2.5
 *
 * @param $post_id integer The post ID of the PDF in the Media library
 *
 * @return string|bool The contents of the PDF. An empty string is returned when the post is not
 *                     a PDF or its file is not available locally; false is returned when
 *                     PdfParser throws or when no usable text could be extracted.
 */
function extract_pdf_text( $post_id ) {
	global $wp_filesystem, $searchwp;

	$post_id  = absint( $post_id );
	$pdf_post = get_post( $post_id );

	// make sure it's a PDF (get_post() returns null when the post cannot be found)
	if ( ! is_object( $pdf_post ) || 'application/pdf' !== $pdf_post->post_mime_type ) {
		return '';
	}

	// grab the filename of the PDF
	$filename = get_attached_file( $post_id );

	// make sure the file exists locally
	if ( empty( $filename ) || ! file_exists( $filename ) ) {
		return '';
	}

	$pdfContent = '';

	// PdfParser runs only on 5.3+ but SearchWP runs on 5.2+
	if ( version_compare( PHP_VERSION, '5.3', '>=' ) ) {
		/** @noinspection PhpIncludeInspection */
		include_once( $searchwp->dir . '/vendor/pdfparser-bootloader.php' );

		// a wrapper class was conditionally included if we're running PHP 5.3+ so let's try that
		if ( class_exists( 'SearchWP_PdfParser' ) ) {
			/** @noinspection PhpIncludeInspection */
			include_once( $searchwp->dir . '/vendor/pdfparser/vendor/autoload.php' );

			// try PdfParser first
			$parser = new SearchWP_PdfParser();
			$parser = $parser->init();
			try {
				$pdf = $parser->parseFile( $filename );
				$pdfContent = $pdf->getText();
			} catch (Exception $e) {
				// a parser exception ends extraction; the other extractors are not attempted
				do_action( 'searchwp_log', 'PDF parsing failed: ' . $e->getMessage() );
				return false;
			}
		}
	}

	// try PDF2Text
	if ( empty( $pdfContent ) ) {
		if ( ! class_exists( 'PDF2Text' ) ) {
			/** @noinspection PhpIncludeInspection */
			include_once( $searchwp->dir . '/vendor/class.pdf2text.php' );
		}
		$pdfParser = new PDF2Text();
		$pdfParser->setFilename( $filename );
		$pdfParser->decodePDF();
		$pdfContent = $pdfParser->output();
		$pdfContent = trim( str_replace( "\n", ' ', $pdfContent ) );
	}

	// check to see if the first pass produced nothing or concatenated strings
	// (the space ratio is only computed for non-empty content, so the length is never zero)
	$contentLooksBroken = empty( $pdfContent );
	if ( ! $contentLooksBroken ) {
		$contentLooksBroken = ( ( substr_count( $pdfContent, ' ' ) / strlen( $pdfContent ) ) * 100 < 10 );
	}

	if ( $contentLooksBroken ) {
		WP_Filesystem();

		// WP_Filesystem() can fail and leave the global unset; method_exists() must only be given an object
		$filecontent = '';
		if ( is_object( $wp_filesystem ) && method_exists( $wp_filesystem, 'exists' ) && method_exists( $wp_filesystem, 'get_contents' ) ) {
			$filecontent = $wp_filesystem->exists( $filename ) ? $wp_filesystem->get_contents( $filename ) : '';
		}

		if ( false !== strpos( $filecontent, 'trailer' ) ) {
			if ( ! class_exists( 'pdf_readstream' ) ) {
				/** @noinspection PhpIncludeInspection */
				include_once( $searchwp->dir . '/vendor/class.pdfreadstream.php' );
			}
			$pdfContent = '';
			// read the file belonging to the requested post, not whichever post the indexer currently holds
			$pdf = new pdf( $filename );
			$pages = $pdf->get_pages();
			if ( ! empty( $pages ) ) {
				// foreach instead of each(), which was removed in PHP 8
				foreach ( $pages as $page ) {
					if ( is_object( $page ) && method_exists( $page, 'get_text' ) ) {
						$pdfContent .= $page->get_text();
					}
				}
			}
		} else {
			// empty out the content so wacky concatenations are not indexed
			$pdfContent = false;
		}
	}

	return $pdfContent;
}