Example #1
0
 /**
  * Index every post queued in $this->unindexedPosts.
  *
  * For each queued post the attempt counter is incremented; once it exceeds
  * the (filterable) maximum the post is flagged to be skipped. Otherwise the
  * title, slug, content, excerpt, comments, taxonomy terms and custom fields
  * are broken out into terms and handed to recordPostTerms(). PDF and
  * plain-text attachments first have their text extracted into the
  * SEARCHWP_PREFIX . 'content' custom field. Large term sets are recorded in
  * chunks: the unprocessed remainder is cached in post meta and picked up on a
  * later pass, and the post is only flagged as indexed once nothing remains.
  *
  * @since 1.0
  *
  * @return void
  */
 function index()
 {
     global $wp_filesystem, $searchwp;
     $this->check_for_parallel_indexer();
     if (!is_array($this->unindexedPosts) || !count($this->unindexedPosts)) {
         return;
     }
     do_action('searchwp_indexer_pre_chunk', $this->unindexedPosts);
     // walk the queue via the array's internal pointer (reset once the chunk is done)
     while (($unindexedPost = current($this->unindexedPosts)) !== false) {
         $this->setPost($unindexedPost);
         // log the attempt
         $count = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', true);
         $count = $count == false ? 0 : intval($count);
         $count++;
         // increment our counter to prevent the indexer getting stuck on a gigantic PDF
         update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts', $count);
         do_action('searchwp_log', 'Attempt ' . $count . ' at indexing ' . $this->post->ID);
         // if we breached the maximum number of attempts, flag it to skip
         $this->maxAttemptsToIndex = absint(apply_filters('searchwp_max_index_attempts', $this->maxAttemptsToIndex));
         if (intval($count) > $this->maxAttemptsToIndex) {
             do_action('searchwp_log', 'Too many indexing attempts on ' . $this->post->ID . ' (' . $this->maxAttemptsToIndex . ') - skipping');
             // flag it to be skipped
             update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true);
         } else {
             // check to see if we're running a second pass on terms
             $termCache = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', true);
             if (!is_array($termCache)) {
                 do_action('searchwp_index_post', $this->post);
                 // if it's an attachment, we want the permalink
                 $slug = $this->post->post_type == 'attachment' ? str_replace(get_bloginfo('wpurl'), '', get_permalink($this->post->ID)) : '';
                 // we allow users to override the extracted content from documents, if they have done so this flag is set
                 $skipDocProcessing = get_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip_doc_processing', true);
                 $omitDocProcessing = apply_filters('searchwp_omit_document_processing', false);
                 if (!$skipDocProcessing && !$omitDocProcessing) {
                     if ($this->post->post_mime_type == 'application/pdf') {
                         // if it's a PDF we need to populate our Custom Field with its content
                         $filename = get_attached_file($this->post->ID);
                         // allow for external PDF content extraction
                         $pdfContent = apply_filters('searchwp_external_pdf_processing', '', $filename, $this->post->ID);
                         // only try to extract content if the external processing has not provided the PDF content we're looking for
                         if (empty($pdfContent)) {
                             // PdfParser runs only on 5.3+ but SearchWP runs on 5.2+
                             if (version_compare(PHP_VERSION, '5.3', '>=')) {
                                 include_once $searchwp->dir . '/vendor/pdfparser-bootloader.php';
                             }
                             // a wrapper class was conditionally included if we're running PHP 5.3+ so let's try that
                             if (class_exists('SearchWP_PdfParser')) {
                                 // a malformed PDF must not abort the whole chunk, so a parser
                                 // exception is logged and the fallbacks below get their turn
                                 try {
                                     $parser = new SearchWP_PdfParser();
                                     $parser = $parser->init();
                                     $pdf = $parser->parseFile($filename);
                                     $text = $pdf->getText();
                                     $pdfContent = trim(str_replace("\n", " ", $text));
                                 } catch (Exception $e) {
                                     do_action('searchwp_log', 'PDF parsing failed: ' . $e->getMessage());
                                     $pdfContent = '';
                                 }
                             }
                             // try PDF2Text
                             if (empty($pdfContent)) {
                                 if (!class_exists('PDF2Text')) {
                                     include_once $searchwp->dir . '/includes/class.pdf2text.php';
                                 }
                                 $pdfParser = new PDF2Text();
                                 $pdfParser->setFilename($filename);
                                 $pdfParser->decodePDF();
                                 $pdfContent = $pdfParser->output();
                                 $pdfContent = trim(str_replace("\n", " ", $pdfContent));
                             }
                             // check to see if the first pass produced nothing or concatenated strings
                             // (fewer than 10% spaces); empty() short-circuits so we never divide by zero
                             $fullContentLength = strlen($pdfContent);
                             $numberOfSpaces = substr_count($pdfContent, ' ');
                             if (empty($pdfContent) || $numberOfSpaces / $fullContentLength * 100 < 10) {
                                 WP_Filesystem();
                                 // WP_Filesystem() can fail to initialise the global, so check before using it
                                 $filecontent = '';
                                 if (is_object($wp_filesystem) && $wp_filesystem->exists($filename)) {
                                     $filecontent = (string) $wp_filesystem->get_contents($filename);
                                 }
                                 if (false !== strpos($filecontent, 'trailer')) {
                                     if (!class_exists('pdf_readstream')) {
                                         include_once $searchwp->dir . '/includes/class.pdfreadstream.php';
                                     }
                                     $pdfContent = '';
                                     $pdf = new pdf($filename);
                                     $pages = $pdf->get_pages();
                                     if (!empty($pages) && is_array($pages)) {
                                         // foreach instead of each(), which no longer exists in PHP 8
                                         foreach ($pages as $page) {
                                             $pdfContent .= $page->get_text();
                                         }
                                     }
                                 } else {
                                     // empty out the content so wacky concatenations are not indexed
                                     $pdfContent = '';
                                     // flag it for further review
                                     update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'review', true);
                                     update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip', true);
                                 }
                             }
                         }
                         $pdfContent = trim($pdfContent);
                         if (!empty($pdfContent)) {
                             $pdfContent = sanitize_text_field($pdfContent);
                             delete_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content');
                             update_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content', $pdfContent);
                         }
                     } elseif ($this->post->post_mime_type == 'text/plain') {
                         // if it's plain text, index its content
                         WP_Filesystem();
                         $filename = get_attached_file($this->post->ID);
                         $textContent = '';
                         if (is_object($wp_filesystem) && $wp_filesystem->exists($filename)) {
                             $textContent = (string) $wp_filesystem->get_contents($filename);
                         }
                         $textContent = str_replace("\n", " ", $textContent);
                         if (!empty($textContent)) {
                             $textContent = sanitize_text_field($textContent);
                             update_post_meta($this->post->ID, SEARCHWP_PREFIX . 'content', $textContent);
                         }
                     }
                     // all other file types need no document processing
                 }
                 $postTerms = array();
                 $postTerms['title'] = $this->indexTitle();
                 $postTerms['slug'] = $this->indexSlug(str_replace('/', ' ', $slug));
                 $postTerms['content'] = $this->indexContent();
                 $postTerms['excerpt'] = $this->indexExcerpt();
                 if (apply_filters('searchwp_index_comments', true)) {
                     $postTerms['comments'] = $this->indexComments();
                 }
                 // index taxonomies
                 $taxonomies = get_object_taxonomies($this->post->post_type);
                 if (!empty($taxonomies) && is_array($taxonomies)) {
                     foreach ($taxonomies as $taxonomy) {
                         $terms = get_the_terms($this->post->ID, $taxonomy);
                         if (!empty($terms)) {
                             $postTerms['taxonomy'][$taxonomy] = $this->indexTaxonomyTerms($taxonomy, $terms);
                         }
                     }
                 }
                 // index custom fields
                 $customFields = apply_filters('searchwp_get_custom_fields', get_post_custom($this->post->ID), $this->post->ID);
                 if (!empty($customFields) && is_array($customFields)) {
                     // the exclusion list does not depend on the individual field, so build it once per post
                     // there are a few useless (when it comes to search) WordPress core custom fields, so let's exclude them by default
                     $omitWpMetadata = apply_filters('searchwp_omit_wp_metadata', array('_edit_lock', '_wp_page_template', '_edit_last', '_wp_old_slug'));
                     $excludedCustomFieldKeys = apply_filters('searchwp_excluded_custom_fields', array('_' . SEARCHWP_PREFIX . 'indexed', '_' . SEARCHWP_PREFIX . 'attempts', '_' . SEARCHWP_PREFIX . 'terms', '_' . SEARCHWP_PREFIX . 'last_index', '_' . SEARCHWP_PREFIX . 'skip', '_' . SEARCHWP_PREFIX . 'skip_doc_processing', '_' . SEARCHWP_PREFIX . 'review'));
                     // merge the two arrays of keys if possible
                     if (is_array($omitWpMetadata) && is_array($excludedCustomFieldKeys)) {
                         $excluded_meta_keys = array_merge($omitWpMetadata, $excludedCustomFieldKeys);
                     } elseif (is_array($omitWpMetadata)) {
                         $excluded_meta_keys = $omitWpMetadata;
                     } else {
                         $excluded_meta_keys = $excludedCustomFieldKeys;
                     }
                     $excluded_meta_keys = is_array($excluded_meta_keys) ? array_unique($excluded_meta_keys) : array();
                     // foreach (rather than a current() !== false loop) so a field whose
                     // value is false does not cut the remaining fields out of the index
                     foreach ($customFields as $customFieldName => $customFieldValue) {
                         // allow developers to conditionally omit specific custom fields
                         $omit_this_custom_field = apply_filters("searchwp_omit_meta_key", false, $customFieldName, $this->post);
                         $omit_this_custom_field = apply_filters("searchwp_omit_meta_key_{$customFieldName}", $omit_this_custom_field, $this->post);
                         if (!in_array($customFieldName, $excluded_meta_keys) && !$omit_this_custom_field) {
                             // allow devs to swap out their own content
                             // e.g. parsing ACF Relationship fields (that store only post IDs) to actually retrieve that content at runtime
                             $customFieldValue = apply_filters('searchwp_custom_fields', $customFieldValue, $customFieldName, $this->post);
                             $customFieldValue = apply_filters("searchwp_custom_field_{$customFieldName}", $customFieldValue, $this->post);
                             $postTerms['customfield'][$customFieldName] = $this->indexCustomField($customFieldName, $customFieldValue);
                         }
                     }
                 }
                 // allow developer to store arbitrary information a la Custom Fields (without them actually being Custom Fields)
                 // NOTE: a key that collides with a real custom field name overwrites that field's terms
                 $extraMetadata = apply_filters("searchwp_extra_metadata", false, $this->post);
                 if ($extraMetadata && is_array($extraMetadata)) {
                     foreach ($extraMetadata as $extraMetadataKey => $extraMetadataValue) {
                         $postTerms['customfield'][$extraMetadataKey] = $this->indexCustomField($extraMetadataKey, $extraMetadataValue);
                     }
                 }
                 // we need to break out the terms from all of this content:
                 // term => type => count, or term => type => name => count for taxonomies/custom fields
                 $termCountBreakout = array();
                 foreach ($postTerms as $type => $terms) {
                     if (!is_array($terms) || !count($terms)) {
                         continue;
                     }
                     switch ($type) {
                         case 'title':
                         case 'slug':
                         case 'content':
                         case 'excerpt':
                         case 'comments':
                             foreach ($terms as $term) {
                                 $termCountBreakout[$term['term']][$type] = $term['count'];
                             }
                             break;
                         case 'taxonomy':
                         case 'customfield':
                             foreach ($terms as $name => $nameTerms) {
                                 if (is_array($nameTerms) && count($nameTerms)) {
                                     foreach ($nameTerms as $nameTerm) {
                                         $termCountBreakout[$nameTerm['term']][$type][$name] = $nameTerm['count'];
                                     }
                                 }
                             }
                             break;
                     }
                 }
             } else {
                 $termCountBreakout = $termCache;
                 // if there was a term cache, this repeated processing doesn't count, so clear the attempt tracking
                 delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts');
                 delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip');
             }
             // unless the term chunk limit says otherwise, we're going to flag this as being OK to log as indexed
             $flagAsIndexed = true;
             // we now have a multidimensional array of terms with counts per type in $termCountBreakout
             // if the term count is huge, we need to split up this process so as to avoid
             // hitting upper PHP execution time limits (term insertion is heavy), so we'll chunk the array of terms
             $termChunkMax = 500;
             // try to set a better default based on php.ini's memory_limit (15 terms per MB RAM);
             // values such as -1 (unlimited) do not match and keep the default above
             $memoryLimit = ini_get('memory_limit');
             if (preg_match('/^(\\d+)([KMG]?)$/i', (string) $memoryLimit, $matches)) {
                 $memoryUnit = strtoupper($matches[2]);
                 if ($memoryUnit == 'M') {
                     $availableMb = (int) $matches[1];
                 } elseif ($memoryUnit == 'G') {
                     $availableMb = (int) $matches[1] * 1024;
                 } elseif ($memoryUnit == 'K') {
                     $availableMb = 0;
                 } else {
                     // no unit means the limit was given in bytes
                     $availableMb = (int) floor((float) $matches[1] / 1048576);
                 }
                 // very small limits (anything set in K, or under 1MB) get a conservative chunk size
                 $termChunkMax = $memoryUnit == 'M' || $availableMb > 0 ? $availableMb * 15 : 100;
             }
             $termChunkLimit = (int) apply_filters('searchwp_process_term_limit', $termChunkMax);
             if ($termChunkLimit < 1) {
                 // a non-positive limit could never make progress, fall back to the computed default
                 $termChunkLimit = max(1, (int) $termChunkMax);
             }
             $totalTerms = count($termCountBreakout);
             if ($totalTerms > $termChunkLimit) {
                 // we haven't pulled all of the terms, so we can't consider this post indexed...
                 $flagAsIndexed = false;
                 // save the rest of the breakout so we don't have to build it again; the remainder starts
                 // exactly where this chunk ends, and keys are preserved because array_slice() would
                 // otherwise renumber numeric keys (assumes the keys carry the terms, as built above)
                 $remainingTerms = array_slice($termCountBreakout, $termChunkLimit, $totalTerms - $termChunkLimit, true);
                 update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms', $remainingTerms);
                 // set the acceptable breakout as the main breakout
                 $termCountBreakout = array_slice($termCountBreakout, 0, $termChunkLimit, true);
             }
             $this->recordPostTerms($termCountBreakout);
             unset($termCountBreakout);
             // flag the post as indexed
             if ($flagAsIndexed) {
                 // clean up our stored term array if necessary
                 if ($termCache) {
                     delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'terms');
                 }
                 // clean up the attempt counter
                 delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'attempts');
                 delete_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'skip');
                 update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'indexed', true);
                 update_post_meta($this->post->ID, '_' . SEARCHWP_PREFIX . 'last_index', current_time('timestamp'));
             }
         }
         next($this->unindexedPosts);
     }
     reset($this->unindexedPosts);
     do_action('searchwp_indexer_post_chunk');
 }
	/**
	 * Extract plain text from a PDF in the Media library.
	 *
	 * Up to three extraction passes are attempted: the bundled PdfParser
	 * wrapper (PHP 5.3+ only), then PDF2Text, and finally the pdf read-stream
	 * class when the output so far is empty or looks like concatenated words
	 * (fewer than 10% of its characters are spaces).
	 *
	 * @since 2.5
	 * @param $post_id integer The post ID of the PDF in the Media library
	 *
	 * @return string|false The contents of the PDF; an empty string when the
	 *                      post is not a PDF or its file is not available
	 *                      locally; false when PdfParser throws or when no
	 *                      usable text could be extracted
	 */
	function extract_pdf_text( $post_id ) {
		global $wp_filesystem, $searchwp;

		$post_id = absint( $post_id );

		// an ID of 0 would make get_post() fall back to the global post, which is never what we want here
		if ( ! $post_id ) {
			return '';
		}

		$pdf_post = get_post( $post_id );

		// make sure the post exists and it's a PDF
		if ( ! is_object( $pdf_post ) || 'application/pdf' !== $pdf_post->post_mime_type ) {
			return '';
		}

		// grab the filename of the PDF
		$filename = get_attached_file( $post_id );

		// make sure the file exists locally
		if ( empty( $filename ) || ! file_exists( $filename ) ) {
			return '';
		}

		$pdfContent = '';

		// PdfParser runs only on 5.3+ but SearchWP runs on 5.2+
		if ( version_compare( PHP_VERSION, '5.3', '>=' ) ) {

			/** @noinspection PhpIncludeInspection */
			include_once( $searchwp->dir . '/vendor/pdfparser-bootloader.php' );

			// a wrapper class was conditionally included if we're running PHP 5.3+ so let's try that
			if ( class_exists( 'SearchWP_PdfParser' ) ) {

				/** @noinspection PhpIncludeInspection */
				include_once( $searchwp->dir . '/vendor/pdfparser/vendor/autoload.php' );

				// try PdfParser first
				$parser = new SearchWP_PdfParser();
				$parser = $parser->init();
				try {
					$pdf = $parser->parseFile( $filename );
					$pdfContent = $pdf->getText();
				} catch ( Exception $e ) {
					do_action( 'searchwp_log', 'PDF parsing failed: ' . $e->getMessage() );
					return false;
				}
			}
		}

		// try PDF2Text
		if ( empty( $pdfContent ) ) {
			if ( ! class_exists( 'PDF2Text' ) ) {
				/** @noinspection PhpIncludeInspection */
				include_once( $searchwp->dir . '/vendor/class.pdf2text.php' );
			}
			$pdfParser = new PDF2Text();
			$pdfParser->setFilename( $filename );
			$pdfParser->decodePDF();
			$pdfContent = $pdfParser->output();
			$pdfContent = trim( str_replace( "\n", ' ', $pdfContent ) );
		}

		// check to see if the first pass produced nothing or concatenated strings
		// (empty() short-circuits, so the ratio is never computed with a zero length)
		$fullContentLength = strlen( $pdfContent );
		$numberOfSpaces = substr_count( $pdfContent, ' ' );
		if ( empty( $pdfContent ) || ( ( $numberOfSpaces / $fullContentLength ) * 100 < 10 ) ) {
			WP_Filesystem();

			// WP_Filesystem() may fail to set up the global; method_exists() on a
			// non-object throws a TypeError in PHP 8, so check for an object first
			$filecontent = '';
			if ( is_object( $wp_filesystem ) && method_exists( $wp_filesystem, 'exists' ) && method_exists( $wp_filesystem, 'get_contents' ) ) {
				if ( $wp_filesystem->exists( $filename ) ) {
					$filecontent = (string) $wp_filesystem->get_contents( $filename );
				}
			}

			if ( false !== strpos( $filecontent, 'trailer' ) ) {
				if ( ! class_exists( 'pdf_readstream' ) ) {
					/** @noinspection PhpIncludeInspection */
					include_once( $searchwp->dir . '/vendor/class.pdfreadstream.php' );
				}
				$pdfContent = '';
				// read the file belonging to the requested post, not whatever post the indexer currently holds
				$pdf = new pdf( $filename );
				$pages = $pdf->get_pages();
				if ( ! empty( $pages ) && is_array( $pages ) ) {
					// foreach instead of each(), which no longer exists in PHP 8
					foreach ( $pages as $page ) {
						if ( is_object( $page ) && method_exists( $page, 'get_text' ) ) {
							$pdfContent .= $page->get_text();
						}
					}
				}
			} else {
				// empty out the content so wacky concatenations are not indexed
				$pdfContent = false;
			}
		}

		return $pdfContent;
	}