示例#1
0
 /**
  * Tokenizes an HTML string according to the HTML elements.
  *
  * Assigns boost values to the elements' contents accordingly.
  *
  * @param string $text
  *   The HTML string to parse, passed by reference. After the method call, the
  *   variable will contain the portion of the string after the current
  *   element, or an empty string (if there is no current element).
  * @param string|null $active_tag
  *   (optional) The currently active tag, for which a closing tag has to be
  *   found. Internal use only.
  * @param float $boost
  *   (optional) The currently active boost value. Internal use only.
  *
  * @return array
  *   Tokenized text with appropriate scores.
  */
 protected function parseHtml(&$text, $active_tag = NULL, $boost = 1.0) {
   $ret = array();
   while (($pos = strpos($text, '<')) !== FALSE) {
     // Everything before the "<" is content of the currently active element.
     // A boost of 0 means that this content should not be tokenized at all.
     if ($boost && $pos > 0) {
       $value = $this->normalizeText(substr($text, 0, $pos));
       if ($value !== '') {
         $ret[] = Utility::createTextToken($value, $boost);
       }
     }
     // Cast to string, since substr() can return FALSE instead of '' on older
     // PHP versions.
     $text = (string) substr($text, $pos + 1);

     $end = strpos($text, '>');
     if ($end === FALSE) {
       // There is no ">" left, so this "<" cannot start a tag. Stop looking
       // for markup and let the remainder be handled as plain text below,
       // instead of indexing the string with a bogus offset.
       break;
     }

     if (!preg_match('#^(/?)([-:_a-zA-Z0-9]+)#', $text, $m)) {
       // Not an element tag (comment, doctype, processing instruction or a
       // stray "<"). Skip the construct without opening a nested element, so
       // the closing tag of the active element can still be found.
       $text = (string) substr($text, $end + 1);
       continue;
     }

     // The pattern matched at least one character that is not ">", so $end is
     // guaranteed to be greater than 0 here.
     $empty_tag = $text[$end - 1] === '/';
     $text = (string) substr($text, $end + 1);

     if ($m[1]) {
       // Closing tag. Only the one matching the active element ends this
       // call; stray closing tags are ignored.
       if ($active_tag !== NULL && $m[2] === $active_tag) {
         return $ret;
       }
     }
     elseif (!$empty_tag) {
       // Opening tag => recursive call. The recursion consumes $text up to
       // and including the matching closing tag.
       $tag_boost = isset($this->configuration['tags'][$m[2]]) ? $this->configuration['tags'][$m[2]] : 1;
       $ret = array_merge($ret, $this->parseHtml($text, $m[2], $boost * $tag_boost));
     }
   }

   // Tokenize whatever is left after the last tag. Compare against the empty
   // string explicitly, since a plain truthiness check would drop "0".
   if ((string) $text !== '') {
     $value = $this->normalizeText($text);
     if ($value !== '') {
       $ret[] = Utility::createTextToken($value, $boost);
     }
   }
   $text = '';
   return $ret;
 }
示例#2
0
 /**
  * Provides test data for testValueConfiguration().
  *
  * @return array
  *   Arrays of parameters for testProcessFieldValue(), each containing (in
  *   this order):
  *   - The field value passed to the processor's processFieldValue() method.
  *   - The expected preprocessed value.
  *   - (optional) Configuration to override the processor's defaults.
  */
 public function textDataProvider()
 {
     // Token for the literal string "word", reused by most data sets below.
     $word_token = Utility::createTextToken('word');
     $data = array();

     // Default configuration: whitespace and punctuation separate words.
     $data[] = array('word', array($word_token));
     $data[] = array('word word', array($word_token, $word_token));
     $data[] = array(
         'words!word',
         array(Utility::createTextToken('words'), $word_token),
     );
     $data[] = array(
         'words$word',
         array(Utility::createTextToken('words'), $word_token),
     );

     // Custom "spaces" setting: single characters, escapes and ranges.
     $data[] = array(
         'wordXwordxword',
         array($word_token, Utility::createTextToken('wordxword')),
         array('spaces' => 'X'),
     );
     $data[] = array(
         'word3word!word',
         array($word_token, Utility::createTextToken('word!word')),
         array('spaces' => '\\d'),
     );
     $data[] = array(
         'wordXwordRword',
         array($word_token, $word_token, $word_token),
         array('spaces' => 'R-Z'),
     );
     $data[] = array(
         'wordXwordRword',
         array($word_token, $word_token, $word_token),
         array('spaces' => 'R-TW-Z'),
     );
     $data[] = array(
         'wordXword word',
         array($word_token, $word_token, $word_token),
         array('spaces' => 'R-Z'),
     );

     // Interaction of "spaces" with the minimum word size.
     $data[] = array(
         'wordSwo',
         array($word_token),
         array('spaces' => 'R-Z'),
     );
     $data[] = array(
         'wordSwo',
         array($word_token, Utility::createTextToken('wo')),
         array('spaces' => 'R-Z', 'minimum_word_size' => 2),
     );

     // Minimum word size on its own.
     $data[] = array(
         'word w',
         array($word_token),
         array('minimum_word_size' => 2),
     );
     $data[] = array(
         'word w',
         array($word_token, Utility::createTextToken('w')),
         array('minimum_word_size' => 1),
     );
     $data[] = array(
         'word wordword',
         array(),
         array('minimum_word_size' => 10),
     );

     return $data;
 }
 /**
  * {@inheritdoc}
  */
 protected function processFieldValue(&$value, &$type)
 {
     $this->prepare();
     $type = 'tokenized_text';

     // simplifyText() has already replaced the configured (or default)
     // delimiters with plain spaces, so splitting on a space is sufficient.
     $words = explode(' ', $this->simplifyText($value));

     $tokens = array();
     foreach ($words as $word) {
         // Drop words shorter than the configured minimum, unless they are
         // numeric: numeric words are always kept.
         if (!is_numeric($word) && Unicode::strlen($word) < $this->configuration['minimum_word_size']) {
             continue;
         }
         $tokens[] = Utility::createTextToken($word);
     }
     $value = $tokens;
 }
示例#4
0
    /**
     * Data provider method for testTagConfiguration().
     *
     * @return array
     *   An array of argument arrays for testTagConfiguration().
     */
    public function tagConfigurationDataProvider()
    {
        // Larger document combining nested tags, a tag with a title
        // attribute, a self-closing image and a tag configured with boost 0.
        $complex_html = '<h2>Foo Bar <em>Baz</em></h2>

<p>Bla Bla Bla. <strong title="Foobar">Important:</strong> Bla.</p>
<img src="/foo.png" alt="Some picture" />
<span>This is hidden</span>';
        $complex_tokens = array(
            Utility::createTextToken('Foo Bar', 3.0),
            Utility::createTextToken('Baz', 4.5),
            Utility::createTextToken('Bla Bla Bla.', 1.0),
            Utility::createTextToken('Foobar Important:', 2.0),
            Utility::createTextToken('Bla.', 1.0),
            Utility::createTextToken('Some picture', 0.5),
        );
        $complex_boosts = array(
            'em' => 1.5,
            'strong' => 2.0,
            'h2' => 3.0,
            'img' => 0.5,
            'span' => 0,
        );

        // Boost configuration shared by the simple "h2" data sets.
        $tags_config = array('h2' => '2');

        $data = array();
        // With an empty tag configuration the expected value is the plain
        // input string.
        $data[] = array('h2word', 'h2word', array());
        // A tag name appearing as plain text gets no boost.
        $data[] = array(
            'h2word',
            array(Utility::createTextToken('h2word')),
            $tags_config,
        );
        $data[] = array(
            'foo bar <h2> h2word </h2>',
            array(
                Utility::createTextToken('foo bar'),
                Utility::createTextToken('h2word', 2.0),
            ),
            $tags_config,
        );
        $data[] = array(
            'foo bar <h2>h2word</h2>',
            array(
                Utility::createTextToken('foo bar'),
                Utility::createTextToken('h2word', 2.0),
            ),
            $tags_config,
        );
        $data[] = array(
            '<div>word</div>',
            array(Utility::createTextToken('word', 2)),
            array('div' => 2),
        );
        $data[] = array($complex_html, $complex_tokens, $complex_boosts);

        return $data;
    }