/**
 * Tokenizes an HTML string according to the HTML elements.
 *
 * Assigns boost values to the elements' contents accordingly.
 *
 * Text in front of a tag becomes one token carrying the currently active
 * boost; it is skipped when that boost is 0. An opening tag triggers a
 * recursive call whose boost is the current boost multiplied by the value
 * configured for that tag in $this->configuration['tags'] (or by 1 if the tag
 * is not configured). Empty tags ("<br />") and closing tags that do not match
 * $active_tag produce nothing; attributes inside a tag are skipped by this
 * method. Any text left after the last tag becomes a final token.
 *
 * A "<" that does not start a complete tag (no valid tag name directly after
 * it, or no ">" anywhere behind it) is not treated as markup but stays part
 * of the surrounding text.
 *
 * @param string $text
 *   The HTML string to parse, passed by reference. After the method call, the
 *   variable will contain the portion of the string after the current
 *   element, or an empty string (if there is no current element).
 * @param string|null $active_tag
 *   (optional) The currently active tag, for which a closing tag has to be
 *   found. Internal use only.
 * @param float $boost
 *   (optional) The currently active boost value. Internal use only.
 *
 * @return array
 *   Tokenized text with appropriate scores.
 */
protected function parseHtml(&$text, $active_tag = NULL, $boost = 1.0) {
  $ret = array();
  // Position from which to search for the next "<". It is only non-zero while
  // stepping over "<" characters that turned out not to start a tag.
  $offset = 0;
  while (($pos = strpos($text, '<', $offset)) !== FALSE) {
    // substr() can return FALSE on older PHP versions when the "<" is the
    // last character, hence the cast.
    $text_after = (string) substr($text, $pos + 1);
    $tag_end = strpos($text_after, '>');
    if ($tag_end === FALSE || !preg_match('#^(/?)([-:_a-zA-Z0-9]+)#', $text_after, $m)) {
      // Not a complete tag: leave the "<" in place as literal text and keep
      // looking behind it.
      $offset = $pos + 1;
      continue;
    }
    $offset = 0;

    if ($boost && $pos > 0) {
      $value = $this->normalizeText(substr($text, 0, $pos));
      if ($value !== '') {
        $ret[] = Utility::createTextToken($value, $boost);
      }
    }

    // The tag name matched at least one character, so $tag_end is >= 1 and
    // the index below is always valid.
    $empty_tag = $text_after[$tag_end - 1] === '/';
    $text = (string) substr($text_after, $tag_end + 1);

    if ($m[1] === '/') {
      // Closing tag. Only the one matching the active tag ends this level;
      // stray closing tags are ignored.
      if ($active_tag !== NULL && $m[2] === $active_tag) {
        return $ret;
      }
    }
    elseif (!$empty_tag) {
      // Opening tag => recursive call. The call consumes $text up to and
      // including the matching closing tag.
      $inner_boost = $boost * (isset($this->configuration['tags'][$m[2]]) ? $this->configuration['tags'][$m[2]] : 1);
      $ret = array_merge($ret, $this->parseHtml($text, $m[2], $inner_boost));
    }
  }

  // No (further) tags: whatever is left is plain text. Compare against the
  // empty string explicitly so that a remainder of "0" is not discarded.
  $rest = (string) $text;
  $text = '';
  if ($rest !== '') {
    $value = $this->normalizeText($rest);
    if ($value !== '') {
      $ret[] = Utility::createTextToken($value, $boost);
    }
  }
  return $ret;
}
/**
 * Provides field values together with the tokens expected for them.
 *
 * @return array
 *   Arrays of parameters for testProcessFieldValue(), each containing (in
 *   this order):
 *   - The field value passed to the processor's processFieldValue() method.
 *   - The expected preprocessed value.
 *   - (optional) Configuration to override the processor's defaults.
 */
public function textDataProvider() {
  // The token for "word" is part of almost every expectation, so it is
  // created once and reused.
  $word = Utility::createTextToken('word');

  $data_sets = array();

  // Data sets without a configuration override.
  $data_sets[] = array('word', array($word));
  $data_sets[] = array('word word', array($word, $word));
  $data_sets[] = array('words!word', array(Utility::createTextToken('words'), $word));
  $data_sets[] = array('words$word', array(Utility::createTextToken('words'), $word));

  // Data sets overriding the "spaces" setting.
  $data_sets[] = array(
    'wordXwordxword',
    array($word, Utility::createTextToken('wordxword')),
    array('spaces' => 'X'),
  );
  $data_sets[] = array(
    'word3word!word',
    array($word, Utility::createTextToken('word!word')),
    array('spaces' => '\\d'),
  );
  $data_sets[] = array(
    'wordXwordRword',
    array($word, $word, $word),
    array('spaces' => 'R-Z'),
  );
  $data_sets[] = array(
    'wordXwordRword',
    array($word, $word, $word),
    array('spaces' => 'R-TW-Z'),
  );
  $data_sets[] = array(
    'wordXword word',
    array($word, $word, $word),
    array('spaces' => 'R-Z'),
  );
  $data_sets[] = array(
    'wordSwo',
    array($word),
    array('spaces' => 'R-Z'),
  );

  // Data sets overriding the "minimum_word_size" setting (the first one
  // together with "spaces").
  $data_sets[] = array(
    'wordSwo',
    array($word, Utility::createTextToken('wo')),
    array('spaces' => 'R-Z', 'minimum_word_size' => 2),
  );
  $data_sets[] = array(
    'word w',
    array($word),
    array('minimum_word_size' => 2),
  );
  $data_sets[] = array(
    'word w',
    array($word, Utility::createTextToken('w')),
    array('minimum_word_size' => 1),
  );
  $data_sets[] = array(
    'word wordword',
    array(),
    array('minimum_word_size' => 10),
  );

  return $data_sets;
}
/**
 * {@inheritdoc}
 */
protected function processFieldValue(&$value, &$type) {
  $this->prepare();
  $type = 'tokenized_text';

  // simplifyText() has already replaced the configured (or default)
  // delimiters by plain spaces, so splitting on a single space is enough.
  $words = explode(' ', $this->simplifyText($value));

  $tokens = array();
  foreach ($words as $word) {
    // Numeric words are always kept; everything else has to reach the
    // configured minimum length.
    if (!is_numeric($word) && Unicode::strlen($word) < $this->configuration['minimum_word_size']) {
      continue;
    }
    $tokens[] = Utility::createTextToken($word);
  }

  // Replace the original field value by the list of tokens.
  $value = $tokens;
}
/**
 * Supplies the data sets for testTagConfiguration().
 *
 * @return array
 *   An array of argument arrays for testTagConfiguration(). Each argument
 *   array holds, in this order, an input string, the expected value and a
 *   tags configuration (tag names mapped to numeric values).
 */
public function tagConfigurationDataProvider() {
  // A larger document combining nested tags, attributes, an empty tag and a
  // tag configured with the value 0.
  $complex_html = '<h2>Foo Bar <em>Baz</em></h2> <p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p> <img src="/foo.png" alt="Some picture" /> <span>This is hidden</span>';
  $complex_expected = array(
    Utility::createTextToken('Foo Bar', 3.0),
    Utility::createTextToken('Baz', 4.5),
    Utility::createTextToken('Bla Bla Bla.', 1.0),
    Utility::createTextToken('Foobar Important:', 2.0),
    Utility::createTextToken('Bla.', 1.0),
    Utility::createTextToken('Some picture', 0.5),
  );
  $complex_tags = array(
    'em' => 1.5,
    'strong' => 2.0,
    'h2' => 3.0,
    'img' => 0.5,
    'span' => 0,
  );
  $complex_case = array($complex_html, $complex_expected, $complex_tags);

  // Tags configuration shared by the simple "h2" data sets.
  $h2_tags = array('h2' => '2');

  $cases = array();

  // With an empty tags configuration the expected value is the plain string.
  $cases[] = array('h2word', 'h2word', array());

  $cases[] = array(
    'h2word',
    array(Utility::createTextToken('h2word')),
    $h2_tags,
  );
  $cases[] = array(
    'foo bar <h2> h2word </h2>',
    array(
      Utility::createTextToken('foo bar'),
      Utility::createTextToken('h2word', 2.0),
    ),
    $h2_tags,
  );
  $cases[] = array(
    'foo bar <h2>h2word</h2>',
    array(
      Utility::createTextToken('foo bar'),
      Utility::createTextToken('h2word', 2.0),
    ),
    $h2_tags,
  );
  $cases[] = array(
    '<div>word</div>',
    array(Utility::createTextToken('word', 2)),
    array('div' => 2),
  );

  $cases[] = $complex_case;

  return $cases;
}