Example #1
0
/**
 * Stems every word in an array.
 *
 * Each value is passed to stem() and stored under the key it had in the
 * input, so the result lines up with the input for plain lists as well as
 * for keyed or sparse arrays.
 *
 * @param array $words Words to stem.
 * @return array The stemmed words, keyed like the input.
 */
function stemMany($words)
{
    $stemmed = array();
    // Walk the array by key rather than counting up from 0: an index loop
    // reads undefined offsets as soon as the keys are not 0..n-1.
    foreach ($words as $key => $word) {
        $stemmed[$key] = stem($word);
    }
    return $stemmed;
}
Example #2
0
 /**
  * Rebuilds the stored keyword list from the current lyrics.
  *
  * The lyrics are lower-cased, every run of whitespace is collapsed to a
  * single space and every character other than a-z and space is removed.
  * Each remaining word is stemmed with the English stemmer, and the distinct
  * stems are written, in order of first appearance, to
  * $this->data['_keywords'].
  *
  * @return void
  */
 protected function updateKeywords()
 {
     $normalized = strtolower($this->getLyrics());
     $normalized = preg_replace(array('/\\s+/', '/[^a-z ]/'), array(' ', ''), $normalized);
     $stems = array();
     foreach (explode(' ', $normalized) as $token) {
         // consecutive spaces leave empty tokens behind; ignore them
         if (trim($token) != '') {
             $stems[] = stem($token, STEM_ENGLISH);
         }
     }
     // Flipping twice removes duplicates: the first flip makes each stem a
     // (unique) key, the second turns the keys back into values.
     $this->data['_keywords'] = array_values(array_flip(array_flip($stems)));
 }
Example #3
0
 /**
  * Gets the thesaurus entry whose head word is most similar to the passed word.
  * Uses the PECL stem extension for stemming if it is available (highly recommended).
  *
  * Every line of thesaurus_files/moby_thesaurus.txt is treated as one
  * comma-separated entry. Entries accepted by MobyThesaurus::StartsWith()
  * for the stemmed word are candidates; a candidate whose first field equals
  * $word is returned immediately, otherwise the candidate whose first field
  * has the smallest Levenshtein distance to $word (and a distance below 10)
  * is returned.
  *
  * @param string $word
  * @return array The array of synonyms, array position 0 is the matched word;
  *               an empty array if the thesaurus cannot be read or no
  *               candidate is close enough
  */
 public static function GetSynonyms($word)
 {
     //get the thesaurus
     $thesaurus_array = file(dirname(__FILE__) . "/thesaurus_files/moby_thesaurus.txt");
     if ($thesaurus_array === false) {
         //the file could not be read, so there is nothing to match against
         return array();
     }
     //get the stemmed word, requires the PECL extension php_stem
     if (function_exists("stem")) {
         $stemmed_word = stem($word);
     } else {
         //can't get the stemmed word
         $stemmed_word = $word;
     }
     //the array of potential entries
     $potential_entries = array();
     //loop through the thesaurus entries
     foreach ($thesaurus_array as $entry) {
         // NOTE(review): the argument order of StartsWith() is taken as-is
         // from the original call; its definition is not visible here.
         if (MobyThesaurus::StartsWith($stemmed_word, $entry)) {
             //file() keeps the line terminator; strip it so the last synonym
             //(or a one-word entry) does not carry a trailing newline.
             //explode() replaces split(), which was removed in PHP 7
             $entry_arr = explode(",", rtrim($entry, "\r\n"));
             if ($entry_arr[0] === $word) {
                 return $entry_arr;
             }
             $potential_entries[] = $entry_arr;
         }
     }
     //a distance of 10 or more is way too far away
     $lowest_distance = 10;
     $best_entry = array();
     foreach ($potential_entries as $entry) {
         $distance = levenshtein($entry[0], $word);
         //before PHP 8, levenshtein() reports over-long input as -1;
         //that is not a real distance and must not win the comparison
         if ($distance < 0) {
             continue;
         }
         //keep only the word that is closest to the original word
         if ($distance < $lowest_distance) {
             $lowest_distance = $distance;
             $best_entry = $entry;
         }
     }
     return $best_entry;
 }
Example #4
0
    /**
     * Builds the list of suggested tags for a resource.
     *
     * Steps, in order:
     *  1. Load focus-area properties keyed by raw tag into $this->fa_properties.
     *  2. Record the tags already on the resource, either from the database
     *     (when $existing is empty) or from $existing, splitting them into
     *     regular tags and focus areas.
     *  3. Load all known tags (singular and plural forms) and whether each
     *     is endorsed.
     *  4. Split the text bodies linked to the resource's children into words
     *     and score one-, two- and three-word phrases: every repeat
     *     occurrence of a phrase adds (distance since its previous occurrence
     *     / total words) * (position of the phrase in the candidate set + 1).
     *  5. Add a bonus to phrases that match a known tag (3 for endorsed,
     *     1.5 otherwise), sort by score descending and keep the top
     *     $opts['count'] entries in $this->tags.
     *
     * @param int   $rid      Resource id used in the tag and text queries.
     * @param array $existing Tags already chosen for the resource; each entry
     *                        is indexed, with the raw tag at [0] and a
     *                        non-null [2] marking a focus area. When empty the
     *                        current tags are read from the database instead.
     * @param array $opts     Overrides for 'min_len' (default 4, handed to
     *                        is_stop_word()) and 'count' (default 20, number
     *                        of suggestions kept).
     */
    public function __construct($rid, $existing, $opts = array())
    {
        $opts = array_merge(array('min_len' => 4, 'count' => 20), $opts);
        // The id is concatenated into SQL below; force it to an integer so
        // it can never carry anything but a number into the statement.
        $rid = (int) $rid;
        require_once PATH_CORE . DS . 'components' . DS . 'com_resources' . DS . 'helpers' . DS . 'Inflect.php';
        $dbh = App::get('db');
        $dbh->setQuery('SELECT t.raw_tag, fa.*
			FROM #__focus_areas fa
			INNER JOIN #__tags t ON t.id = fa.tag_id');
        $this->fa_properties = $dbh->loadAssocList('raw_tag');
        $dbh->setQuery('SELECT raw_tag, (label IS NOT NULL AND label != "") AS is_focus_area
			FROM #__tags_object to1
			INNER JOIN #__tags t ON t.id = to1.tagid
			WHERE to1.tbl = \'resources\' AND to1.objectid = ' . $rid);
        if (!$existing) {
            // No tags supplied by the caller: use what is stored for the resource.
            foreach ($dbh->loadAssocList() as $tag) {
                if ($tag['is_focus_area']) {
                    $this->focus_areas[] = $tag['raw_tag'];
                    $this->existing_fa_map[strtolower($tag['raw_tag'])] = true;
                } else {
                    $this->existing_tags[] = $tag['raw_tag'];
                    $this->existing_map[strtolower($tag['raw_tag'])] = true;
                }
            }
        } else {
            foreach ($existing as $tag) {
                if (!is_null($tag[2])) {
                    $this->existing_fa_map[strtolower($tag[0])] = true;
                } else {
                    $this->existing_tags[] = $tag[0];
                    $this->existing_map[strtolower($tag[0])] = true;
                }
            }
        }
        $dbh->setQuery('SELECT lower(raw_tag) AS raw_tag, CASE WHEN to1.id IS NULL THEN 0 ELSE 1 END AS is_endorsed
			FROM #__tags t
			LEFT JOIN #__tags_object to1 ON to1.tbl = \'tags\' AND to1.objectid = t.id AND to1.label = \'label\' AND to1.tagid = (SELECT id FROM #__tags WHERE tag = \'endorsed\')');
        // Known tags, registered under both their singular and plural form.
        $tags = array();
        foreach ($dbh->loadAssocList() as $row) {
            $kind = $row['is_endorsed'] ? self::ENDORSED_TAG : self::REGULAR_TAG;
            $tags[Inflect::singularize($row['raw_tag'])] = $kind;
            $tags[Inflect::pluralize($row['raw_tag'])] = $kind;
        }
        $dbh->setQuery('SELECT body FROM #__resource_assoc ra
			LEFT JOIN #__document_resource_rel drr ON drr.resource_id = ra.child_id
			INNER JOIN #__document_text_data dtd ON dtd.id = drr.document_id
			WHERE ra.parent_id = ' . $rid);
        $words = preg_split('/\\W+/', join(' ', $dbh->loadColumn()));
        $word_count = count($words);
        // preg_split() leaves an empty last element when the text ends in a
        // non-word character (or is empty); drop it.
        if (!$words[$word_count - 1]) {
            array_pop($words);
            --$word_count;
        }
        $freq = array();
        $last = array();
        foreach ($words as $idx => $word) {
            if (self::is_stop_word($word, $opts['min_len'])) {
                continue;
            }
            // Candidate phrases starting at this word, each as array(stem, display text).
            $stems = array(array(stem($word), strtolower($word)));
            if (isset($words[$idx + 1]) && !self::is_stop_word($words[$idx + 1], $opts['min_len'])) {
                $stems[] = array($stems[0][0] . ' ' . stem($words[$idx + 1]), strtolower($word) . ' ' . strtolower($words[$idx + 1]));
            }
            // NOTE(review): the three-word phrase only checks the third word
            // against the stop list, so the middle word may be a stop word.
            if (isset($words[$idx + 2]) && !self::is_stop_word($words[$idx + 2], $opts['min_len'])) {
                $stems[] = array($stems[0][0] . ' ' . stem($words[$idx + 1]) . ' ' . stem($words[$idx + 2]), Inflect::singularize(strtolower($word)) . ' ' . strtolower($words[$idx + 1]) . ' ' . strtolower($words[$idx + 2]));
            }
            foreach ($stems as $set_idx => $set) {
                list($stem, $word) = $set;
                // NOTE(review): this block fills existing_fa_map but never
                // focus_area_map; confirm which property is meant here.
                if (isset($this->existing_map[strtolower($word)]) || isset($this->focus_area_map[strtolower($word)])) {
                    continue;
                }
                if (!isset($freq[$stem])) {
                    $freq[$stem] = array('text' => $word, 'count' => 0);
                } else {
                    $freq[$stem]['count'] += ($idx - $last[$stem]) / $word_count * ($set_idx + 1);
                }
                $last[$stem] = $idx;
            }
        }
        // Bonus for phrases that are already known tags.
        foreach ($freq as $stem => $def) {
            foreach (array($stem, $def['text']) as $text) {
                if (isset($tags[$text])) {
                    $freq[$stem]['count'] += $tags[$text] === self::ENDORSED_TAG ? 3 : 1.5;
                    break;
                }
            }
        }
        // Highest score first. A closure replaces create_function(), which
        // was removed in PHP 8; the comparison itself is unchanged.
        usort($freq, function ($a, $b) {
            return $a['count'] === $b['count'] ? 0 : ($a['count'] > $b['count'] ? -1 : 1);
        });
        $this->tags = array_slice($freq, 0, $opts['count']);
    }
Example #5
0
 /**
  * Expands the positive search chunks with their stems.
  *
  * Each chunk returned by get_positive_chunks() is stemmed repeatedly until
  * stemming no longer changes it, and every intermediate form is added to
  * the list. The stem of every collected term is then merged in, duplicates
  * are removed, and the list is handed by reference to the
  * 'onSearchExpandTerms' event so listeners can modify it.
  *
  * @return     array The de-duplicated list of terms after the event has run
  */
 public function get_stemmed_chunks()
 {
     $originals = $this->get_positive_chunks();
     $expanded = $originals;
     foreach ($originals as $current) {
         // keep stemming until the term stops changing
         for ($next = stem($current); $next != $current; $next = stem($current)) {
             $expanded[] = $next;
             $current = $next;
         }
     }
     $stems = array_map('stem', $expanded);
     $expanded = array_unique(array_merge($stems, $expanded));
     \Event::trigger('onSearchExpandTerms', array(&$expanded));
     return array_unique($expanded);
 }
 /**
  * Stems a keyword
  *
  * The basic idea behind stemming is described in the Wikipedia article on
  * {@link http://en.wikipedia.org/wiki/Stemming Stemming}.
  *
  * Three cases are tried in order:
  *
  * 1. If the PECL <code>stem</code> extension is loaded, English stemming
  *    is applied to the <code>$keyword</code>. See
  *    {@link http://pecl.php.net/package/stem/} for details about the PECL
  *    stem package.
  * 2. Otherwise, if <code>PorterStemmer::Stem()</code> is callable, it is
  *    applied to the <code>$keyword</code>. The most commonly available PHP
  *    implementation of the Porter-stemmer algorithm is licenced under the
  *    GPL, and is thus not distributable with the LGPL licensed
  *    NateGoSearch.
  * 3. If no stemmer is available, the keyword is returned unchanged.
  *
  * @param string $keyword the keyword to stem.
  *
  * @return string the stemmed keyword.
  */
 public static function stemKeyword($keyword)
 {
     if (extension_loaded('stem')) {
         return stem($keyword, STEM_ENGLISH);
     }

     $porter = array('PorterStemmer', 'Stem');
     if (is_callable($porter)) {
         return PorterStemmer::Stem($keyword);
     }

     return $keyword;
 }
Example #7
0
/**
 * Collapses a word list into distinct words with occurrence counts.
 *
 * The words are optionally stemmed (global $stem_words == 1), sorted, and
 * grouped into runs of equal neighbours. A run is kept only if its word is
 * at least $min_word_length long, matches the letter pattern (letters and
 * digits when $index_numbers == 1, letters only otherwise) after
 * remove_accents(), and is not flagged in $common. A leading or trailing
 * "-" or "\'" is stripped from kept words. Counts are capped at
 * $word_upper_bound.
 *
 * @param array $arr Words to count.
 * @return array List of entries; each entry holds the word under key 1 and
 *               its occurrence count under key 2.
 */
function unique_array($arr)
{
    global $min_word_length;
    global $common;
    global $word_upper_bound;
    global $index_numbers, $stem_words;
    if ($stem_words == 1) {
        $stemmed = array();
        foreach ($arr as $val) {
            $stemmed[] = stem($val);
        }
        $arr = $stemmed;
    }
    // sort() also re-indexes the array from 0, which the loop below relies on.
    sort($arr);
    $total = count($arr);
    if ($index_numbers == 1) {
        $pattern = "/[a-z0-9]+/";
    } else {
        $pattern = "/[a-z]+/";
    }
    $newarr = array();
    $regs = array();
    $counter = 1;
    // $element is the word of the run currently being counted.
    $element = $total > 0 ? $arr[0] : false;
    for ($n = 0; $n < $total; $n++) {
        // false marks "no next word", i.e. the end of the list
        $next_in_arr = $n + 1 < $total ? $arr[$n + 1] : false;
        // Loose comparison on purpose: it groups words the same way the
        // default sort() above orders them.
        if ($next_in_arr == $element) {
            if ($counter < $word_upper_bound) {
                $counter++;
            }
            continue;
        }
        //the run ends here: check if word is long enough, contains alphabetic
        //characters and is not a common word
        if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && !(isset($common[$element]) && $common[$element] == 1)) {
            if (preg_match("/^(-|\\\\')(.*)/", $element, $regs)) {
                $element = $regs[2];
            }
            if (preg_match("/(.*)(\\\\'|-)\$/", $element, $regs)) {
                $element = $regs[1];
            }
            $newarr[] = array(1 => $element, 2 => $counter);
        }
        // Start the next run. The counter is reset whether or not the word
        // was kept; otherwise the count of a rejected word would be added
        // to the next accepted one.
        $element = $next_in_arr;
        $counter = 1;
    }
    return $newarr;
}
Example #8
0
/**
 * Runs a keyword search against the index tables and returns one page of results.
 *
 * Matching works in stages: links containing any excluded ('-') word are
 * collected, then links whose full text contains each phrase ('+s'), then
 * (for $category > 0) links belonging to the categories returned by
 * get_cats(), and finally the links for each required ('+') word together
 * with their weight. Unless $type is "or", a link must be found for every
 * required word; its weights are summed. Links hit by an excluded word,
 * missing a phrase, or outside the category are dropped.
 *
 * NOTE: this function uses the legacy mysql_* API, which only exists before
 * PHP 7 (or through a compatibility shim).
 *
 * @param array  $searchstr Parsed query: '+' required words, '-' excluded
 *                          words, '+s' phrases.
 * @param int    $category  Category id; values above 0 restrict the search.
 * @param int    $start     Page number, starting at 1.
 * @param int    $per_page  Number of results per page.
 * @param string $type      "or" merges the matches of all words; any other
 *                          value requires every word to match.
 * @param string $domain    Domain name to restrict to; no restriction is
 *                          applied when it is not in the domains table.
 * @return array|null null when there are no required words or no results
 *                    (and no suggestions were requested); an array holding
 *                    only 'did_you_mean' when nothing was found and
 *                    suggestions are enabled; an array holding only
 *                    'results' when the requested page is empty; otherwise
 *                    result rows (title, url, fulltxt, size, weight, domain)
 *                    under numeric keys plus 'maxweight' and 'results'.
 */
function search($searchstr, $category, $start, $per_page, $type, $domain)
{
    global $length_of_link_desc, $mysql_table_prefix, $show_meta_description, $merge_site_results, $stem_words, $did_you_mean_enabled;
    $possible_to_find = 1;
    $res = array();
    // $domain comes from the caller and is placed inside a quoted SQL
    // literal, so it has to be escaped.
    $result = mysql_query("select domain_id from " . $mysql_table_prefix . "domains where domain = '" . mysql_real_escape_string($domain) . "'");
    if (mysql_num_rows($result) > 0) {
        $thisrow = mysql_fetch_array($result);
        $domain_qry = "and domain = " . $thisrow[0];
    } else {
        $domain_qry = "";
    }
    //without any required word there is nothing to search for
    if (!isset($searchstr['+']) || count($searchstr['+']) == 0) {
        return null;
    }
    //find all sites that should not be included in the result
    $wordarray = isset($searchstr['-']) ? $searchstr['-'] : array();
    $notlist = array();
    $not_words = 0;
    while ($not_words < count($wordarray)) {
        if ($stem_words == 1) {
            $searchword = addslashes(stem($wordarray[$not_words]));
        } else {
            $searchword = addslashes($wordarray[$not_words]);
        }
        // keyword links are spread over tables named after the first hex
        // digit of the keyword's md5
        $wordmd5 = substr(md5($searchword), 0, 1);
        $query1 = "SELECT link_id from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}'";
        $result = mysql_query($query1);
        while ($row = mysql_fetch_row($result)) {
            $notlist[$not_words]['id'][$row[0]] = 1;
        }
        $not_words++;
    }
    //find all sites containing the search phrase
    $wordarray = isset($searchstr['+s']) ? $searchstr['+s'] : array();
    $phraselist = array();
    $phrase_words = 0;
    while ($phrase_words < count($wordarray)) {
        $searchword = addslashes($wordarray[$phrase_words]);
        $query1 = "SELECT link_id from " . $mysql_table_prefix . "links where fulltxt like '% {$searchword}%'";
        $result = mysql_query($query1);
        // report the error of the query that was just run
        echo mysql_error();
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            $possible_to_find = 0;
            break;
        }
        while ($row = mysql_fetch_row($result)) {
            $phraselist[$phrase_words]['id'][$row[0]] = 1;
        }
        $phrase_words++;
    }
    $category_list = array();
    if ($category > 0 && $possible_to_find == 1) {
        $allcats = get_cats($category);
        $catlist = implode(",", $allcats);
        $query1 = "select link_id from " . $mysql_table_prefix . "links, " . $mysql_table_prefix . "sites, " . $mysql_table_prefix . "categories, " . $mysql_table_prefix . "site_category where " . $mysql_table_prefix . "links.site_id = " . $mysql_table_prefix . "sites.site_id and " . $mysql_table_prefix . "sites.site_id = " . $mysql_table_prefix . "site_category.site_id and " . $mysql_table_prefix . "site_category.category_id in ({$catlist})";
        $result = mysql_query($query1);
        echo mysql_error();
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            $possible_to_find = 0;
        }
        while ($row = mysql_fetch_row($result)) {
            $category_list[$row[0]] = 1;
        }
    }
    //find all sites that include the search word
    $wordarray = $searchstr['+'];
    $linklist = array();
    $domains = array();
    $words = 0;
    // NOTE(review): the elapsed time computed from $starttime further down
    // is not used anywhere in this function.
    $starttime = getmicrotime();
    while ($words < count($wordarray) && $possible_to_find == 1) {
        if ($stem_words == 1) {
            $searchword = addslashes(stem($wordarray[$words]));
        } else {
            $searchword = addslashes($wordarray[$words]);
        }
        $wordmd5 = substr(md5($searchword), 0, 1);
        $query1 = "SELECT distinct link_id, weight, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}' {$domain_qry} order by weight desc";
        $result = mysql_query($query1);
        // report the error of the query that was just run
        echo mysql_error();
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            if ($type != "or") {
                $possible_to_find = 0;
                break;
            }
        }
        // in "or" mode all words share one bucket; otherwise one bucket per word
        if ($type == "or") {
            $indx = 0;
        } else {
            $indx = $words;
        }
        while ($row = mysql_fetch_row($result)) {
            $linklist[$indx]['id'][] = $row[0];
            $domains[$row[0]] = $row[2];
            $linklist[$indx]['weight'][$row[0]] = $row[1];
        }
        $words++;
    }
    if ($type == "or") {
        $words = 1;
    }
    $result_array_full = array();
    if ($possible_to_find != 0) {
        if ($words == 1 && $not_words == 0 && $category < 1) {
            //if there is only one search word, we already have the result
            //(the bucket is missing when an "or" search matched nothing)
            $result_array_full = isset($linklist[0]['weight']) ? $linklist[0]['weight'] : array();
        } else {
            //otherwise build an intersection of all the results,
            //starting from the word with the fewest matching links
            $j = 1;
            $min = 0;
            while ($j < $words) {
                if (count($linklist[$min]['id']) > count($linklist[$j]['id'])) {
                    $min = $j;
                }
                $j++;
            }
            $temp_array = isset($linklist[$min]['id']) ? $linklist[$min]['id'] : array();
            foreach ($temp_array as $link_id) {
                $k = 0;
                //and word counter
                $n = 0;
                //not word counter
                $o = 0;
                //phrase word counter
                $weight = 1;
                $break = 0;
                while ($k < $words && $break == 0) {
                    if (isset($linklist[$k]['weight'][$link_id]) && $linklist[$k]['weight'][$link_id] > 0) {
                        $weight = $weight + $linklist[$k]['weight'][$link_id];
                    } else {
                        $break = 1;
                    }
                    $k++;
                }
                while ($n < $not_words && $break == 0) {
                    if (isset($notlist[$n]['id'][$link_id]) && $notlist[$n]['id'][$link_id] > 0) {
                        $break = 1;
                    }
                    $n++;
                }
                while ($o < $phrase_words && $break == 0) {
                    // index by the phrase counter $o; using the not-word
                    // counter here checked the wrong phrase list
                    if (!isset($phraselist[$o]['id'][$link_id]) || $phraselist[$o]['id'][$link_id] != 1) {
                        $break = 1;
                    }
                    $o++;
                }
                if ($break == 0 && $category > 0 && (!isset($category_list[$link_id]) || $category_list[$link_id] != 1)) {
                    $break = 1;
                }
                if ($break == 0) {
                    $result_array_full[$link_id] = $weight;
                }
            }
        }
    }
    $end = getmicrotime() - $starttime;
    if ((count($result_array_full) == 0 || $possible_to_find == 0) && $did_you_mean_enabled == 1) {
        // Stays null unless a suggestion is found, so 'did_you_mean' keeps
        // the value it has always had when there are no suggestions.
        $near_words = null;
        foreach ($searchstr['+'] as $word) {
            $word = addslashes($word);
            $result = mysql_query("select keyword from " . $mysql_table_prefix . "keywords where soundex(keyword) = soundex('{$word}')");
            $max_distance = 100;
            $near_word = "";
            // among the similar-sounding keywords, keep the one with the
            // smallest edit distance, provided that distance is below 4
            while ($row = mysql_fetch_row($result)) {
                $distance = levenshtein($row[0], $word);
                if ($distance < $max_distance && $distance < 4) {
                    $max_distance = $distance;
                    $near_word = $row[0];
                }
            }
            if ($near_word != "" && $word != $near_word) {
                $near_words[$word] = $near_word;
            }
        }
        $res['did_you_mean'] = $near_words;
        return $res;
    }
    if (count($result_array_full) == 0) {
        return null;
    }
    arsort($result_array_full);
    $domains_to_show = array();
    $result_array_temp = array();
    if ($merge_site_results == 1 && $domain_qry == "") {
        // Per domain keep the best link, and remember the second-best one
        // so it can be listed directly after it.
        foreach ($result_array_full as $key => $value) {
            if (!isset($domains_to_show[$domains[$key]])) {
                $result_array_temp[$key] = $value;
                $domains_to_show[$domains[$key]] = 1;
            } elseif ($domains_to_show[$domains[$key]] == 1) {
                $domains_to_show[$domains[$key]] = array($key => $value);
            }
        }
    } else {
        $result_array_temp = $result_array_full;
    }
    $result_array = array();
    foreach ($result_array_temp as $key => $value) {
        $result_array[$key] = $value;
        if (isset($domains_to_show[$domains[$key]]) && $domains_to_show[$domains[$key]] != 1) {
            // holds exactly one link: the domain's second-best hit
            foreach ($domains_to_show[$domains[$key]] as $k => $v) {
                $result_array[$k] = $v;
            }
        }
    }
    $results = count($result_array);
    $keys = array_keys($result_array);
    $maxweight = $result_array[$keys[0]];
    // collect the link ids that fall on the requested page
    $in = array();
    $first = ($start - 1) * $per_page;
    $last = min($results, $first + $per_page);
    for ($i = $first; $i < $last; $i++) {
        $in[] = $keys[$i];
    }
    if (count($in) == 0) {
        $res['results'] = $results;
        return $res;
    }
    $inlist = implode(",", $in);
    if ($length_of_link_desc == 0) {
        $fulltxt = "fulltxt";
    } else {
        $fulltxt = "substring(fulltxt, 1, {$length_of_link_desc})";
    }
    $query1 = "SELECT distinct link_id, url, title, description,  {$fulltxt}, size FROM " . $mysql_table_prefix . "links WHERE link_id in ({$inlist})";
    $result = mysql_query($query1);
    echo mysql_error();
    $i = 0;
    while ($row = mysql_fetch_row($result)) {
        $res[$i]['title'] = $row[2];
        $res[$i]['url'] = $row[1];
        // prefer the meta description over the page text when configured
        if ($row[3] != null && $show_meta_description == 1) {
            $res[$i]['fulltxt'] = $row[3];
        } else {
            $res[$i]['fulltxt'] = $row[4];
        }
        $res[$i]['size'] = $row[5];
        $res[$i]['weight'] = $result_array[$row[0]];
        $dom_result = mysql_query("select domain from " . $mysql_table_prefix . "domains where domain_id='" . $domains[$row[0]] . "'");
        $dom_row = mysql_fetch_row($dom_result);
        $res[$i]['domain'] = $dom_row[0];
        $i++;
    }
    if ($merge_site_results && $domain_qry == "") {
        sort_with_domains($res);
    } else {
        usort($res, "cmp");
    }
    echo mysql_error();
    $res['maxweight'] = $maxweight;
    $res['results'] = $results;
    return $res;
}
function search($searchstr, $category, $start, $per_page, $type, $domain)
{
    global $length_of_link_desc, $show_meta_description, $merge_site_results, $stem_words;
    global $did_you_mean_enabled, $did_you_mean_always;
    global $matchless, $equivalent, $language;
    global $db;
    $possible_to_find = 1;
    $stat = $db->prepare("SELECT domain_id FROM " . TABLE_PREFIX . "domains WHERE domain = :domain");
    $stat->execute(array(':domain' => $domain));
    if ($row = $stat->fetch()) {
        $domain_qry = "and domain = " . $row[0];
    } else {
        $domain_qry = "";
    }
    $stat->closeCursor();
    /* if there are no words to search for, quit */
    if (!isset($searchstr['+']) || count($searchstr['+']) == 0) {
        return null;
    }
    /* find all words that should _not_ be included in the result */
    if (isset($searchstr['-'])) {
        $wordarray = $searchstr['-'];
    } else {
        $wordarray = array();
    }
    $notlist = array();
    $not_words = 0;
    while ($not_words < count($wordarray)) {
        if ($stem_words == 1) {
            $searchword = stem($wordarray[$not_words]);
        } else {
            $searchword = $wordarray[$not_words];
        }
        $wordmd5 = substr(md5($searchword), 0, 1);
        $stat = $db->prepare("SELECT link_id from " . TABLE_PREFIX . "link_keyword{$wordmd5}, " . TABLE_PREFIX . "keywords where " . TABLE_PREFIX . "link_keyword{$wordmd5}.keyword_id= " . TABLE_PREFIX . "keywords.keyword_id and keyword = :keyword");
        $stat->execute(array(':keyword' => $searchword));
        while ($row = $stat->fetch()) {
            $notlist[$not_words]['id'][$row[0]] = 1;
        }
        $not_words++;
    }
    /* find all phrases */
    if (isset($searchstr['+s'])) {
        $wordarray = $searchstr['+s'];
    } else {
        $wordarray = array();
    }
    $phrase_words = 0;
    while ($phrase_words < count($wordarray)) {
        $searchword = $wordarray[$phrase_words];
        $searchword = str_replace("|", "", $searchword);
        $searchword = str_replace("%", "|%", $searchword);
        $searchword = str_replace("_", "|_", $searchword);
        $stat = $db->prepare("SELECT link_id from " . TABLE_PREFIX . "links where fulltxt like :keyword escape '|'");
        $stat->execute(array(':keyword' => "%" . $searchword . "%"));
        echo sql_errorstring(__FILE__, __LINE__);
        $row = $stat->fetch();
        if (!$row) {
            $possible_to_find = 0;
            $stat->closeCursor();
            break;
        }
        $phraselist[$phrase_words]['id'][$row[0]] = 1;
        while ($row = $stat->fetch()) {
            $phraselist[$phrase_words]['id'][$row[0]] = 1;
        }
        $phrase_words++;
    }
    if ($category > 0 && $possible_to_find == 1) {
        $allcats = get_cats($category);
        $catlist = implode(",", $allcats);
        $result = $db->query("SELECT link_id FROM " . TABLE_PREFIX . "links, " . TABLE_PREFIX . "sites, " . TABLE_PREFIX . "categories, " . TABLE_PREFIX . "site_category where " . TABLE_PREFIX . "links.site_id = " . TABLE_PREFIX . "sites.site_id and " . TABLE_PREFIX . "sites.site_id = " . TABLE_PREFIX . "site_category.site_id and " . TABLE_PREFIX . "site_category.category_id in ({$catlist})");
        echo sql_errorstring(__FILE__, __LINE__);
        $row = $result->fetch();
        if (!$row) {
            $possible_to_find = 0;
        } else {
            $category_list[$row[0]] = 1;
            while ($row = $result->fetch()) {
                $category_list[$row[0]] = 1;
            }
        }
        $result->closeCursor();
    }
    /* find individual words */
    $word_not_found = array();
    $wordarray = $searchstr['+'];
    $words = 0;
    while ($words < count($wordarray) && $possible_to_find == 1) {
        if ($stem_words == 1) {
            $searchword = stem($wordarray[$words]);
        } else {
            $searchword = $wordarray[$words];
        }
        $wordmd5 = substr(md5($searchword), 0, 1);
        $stat = $db->prepare("SELECT distinct link_id, weight, domain FROM " . TABLE_PREFIX . "link_keyword{$wordmd5}, " . TABLE_PREFIX . "keywords WHERE " . TABLE_PREFIX . "link_keyword{$wordmd5}.keyword_id= " . TABLE_PREFIX . "keywords.keyword_id AND keyword=:keyword {$domain_qry}\tORDER\tBY\tweight\tDESC");
        $stat->execute(array(':keyword' => $searchword));
        echo sql_errorstring(__FILE__, __LINE__);
        $row = $stat->fetch();
        if (!$row) {
            $word_not_found[$wordarray[$words]] = 1;
            if ($type != "or") {
                $possible_to_find = 0;
                $stat->closeCursor();
                break;
            }
        }
        if ($type == "or") {
            $indx = 0;
        } else {
            $indx = $words;
        }
        do {
            $linklist[$indx]['id'][] = $row[0];
            $domains[$row[0]] = $row[2];
            $linklist[$indx]['weight'][$row[0]] = $row[1];
        } while ($row = $stat->fetch());
        $words++;
    }
    if ($type == "or") {
        $words = 1;
    }
    $result_array_full = array();
    if ($possible_to_find != 0) {
        if ($words == 1 && $not_words == 0 && $category < 1) {
            //if there is only one search word, we already have the result
            $result_array_full = $linklist[0]['weight'];
        } else {
            //otherwise build an intersection of all the results
            $j = 1;
            $min = 0;
            while ($j < $words) {
                if (count($linklist[$min]['id']) > count($linklist[$j]['id'])) {
                    $min = $j;
                }
                $j++;
            }
            $j = 0;
            $temp_array = $linklist[$min]['id'];
            $count = 0;
            while ($j < count($temp_array)) {
                $k = 0;
                //and word counter
                $n = 0;
                //not word counter
                $o = 0;
                //phrase word counter
                $weight = 1;
                $break = 0;
                while ($k < $words && $break == 0) {
                    if (isset($linklist[$k]['weight'][$temp_array[$j]]) && $linklist[$k]['weight'][$temp_array[$j]] > 0) {
                        $weight = $weight + $linklist[$k]['weight'][$temp_array[$j]];
                    } else {
                        $break = 1;
                    }
                    $k++;
                }
                while ($n < $not_words && $break == 0) {
                    if ($notlist[$n]['id'][$temp_array[$j]] > 0) {
                        $break = 1;
                    }
                    $n++;
                }
                while ($o < $phrase_words && $break == 0) {
                    if (!isset($phraselist[$n]['id'][$temp_array[$j]]) || $phraselist[$n]['id'][$temp_array[$j]] != 1) {
                        $break = 1;
                    }
                    $o++;
                }
                if ($break == 0 && $category > 0 && $category_list[$temp_array[$j]] != 1) {
                    $break = 1;
                }
                if ($break == 0) {
                    $result_array_full[$temp_array[$j]] = $weight;
                    $count++;
                }
                $j++;
            }
        }
    }
    if ((count($result_array_full) == 0 || $possible_to_find == 0 || $did_you_mean_always == 1) && $did_you_mean_enabled == 1) {
        /* search for word pairs written as two words where a single words
           for example: when the user typed "full colour", also search for
           fullcolour and full-colour */
        for ($idx = 0; $idx < count($searchstr['+']) - 1; $idx++) {
            $word = $searchstr['+'][$idx] . " " . $searchstr['+'][$idx + 1];
            $near_word = $searchstr['+'][$idx] . $searchstr['+'][$idx + 1];
            /* words that are in the "nonpareil" list are excluded in searching
               for alternatives */
            if (!isset($matchless[$near_word])) {
                $stat = $db->prepare("SELECT keyword FROM " . TABLE_PREFIX . "keywords WHERE keyword=:keyword");
                if ($stat->execute(array(':keyword' => $near_word)) && ($row = $stat->fetch())) {
                    $near_words[$word] = latin1_to_html($near_word);
                    $stat->closeCursor();
                }
            }
            $near_word = $searchstr['+'][$idx] . "-" . $searchstr['+'][$idx + 1];
            if (!isset($matchless[$near_word])) {
                $stat = $db->prepare("SELECT keyword FROM " . TABLE_PREFIX . "keywords WHERE keyword=:keyword");
                if ($stat->execute(array(':keyword' => $near_word)) && ($row = $stat->fetch())) {
                    $near_words[$word] = latin1_to_html($near_word);
                    $stat->closeCursor();
                }
            }
        }
        /* then search for "near words" for the individual words */
        reset($searchstr['+']);
        foreach ($searchstr['+'] as $word) {
            /* words that are in the "nonpareil" list are excluded in searching
               for alternatives */
            if (isset($matchless[$word]) && $matchless[$word] == 1) {
                continue;
            }
            /* search for alternatives in the explicit equivalents word list first */
            if (isset($equivalent[$word]) && strlen($equivalent[$word]) > 0) {
                $near_words[$word] = latin1_to_html($equivalent[$word]);
                continue;
            }
            /* if there are misspelled words, show only alternatives for the
               misspelled words, (so, if the current word is not in the list
               of misspelled words, exclude it from the search for alternatives */
            if (count($word_not_found) > 0 && !(isset($word_not_found[$word]) && $word_not_found[$word] == 1)) {
                continue;
            }
            $word = sanitize($word);
            /* use the double-metaphone to find close words */
            $meta = double_metaphone($word);
            if (!isset($meta["primary"]) || strlen($meta["primary"]) == 0) {
                continue;
            }
            /* no metaphone, don't match anything */
            $where = "metaphone1='" . $meta["primary"] . "' OR metaphone2='" . $meta["primary"] . "'";
            if (isset($meta["secondary"]) && strlen($meta["secondary"]) > 0) {
                $where .= " OR metaphone1='" . $meta["secondary"] . "' OR metaphone2='" . $meta["secondary"] . "'";
            }
            $result = $db->query("SELECT keyword FROM " . TABLE_PREFIX . "keywords WHERE {$where}");
            /* adapted from http://www.mdj.us/web-development/php-programming/creating-better-search-suggestions-with-sphider/
               but using a double-metaphone filter (instead of SOUNDEX) and
               adding a filter for accented characters */
            $max_distance = 3;
            $max_similar = 0;
            $near_word = "";
            while ($result && ($row = $result->fetch())) {
                $item = $row[0];
                if (strcasecmp($item, $word) != 0) {
                    $distance = levenshtein($item, $word);
                    $distance_na = levenshtein(remove_accents($item), $word);
                    if ($distance_na < $distance) {
                        $distance = $distance_na;
                    }
                    if ($distance < $max_distance) {
                        $max_distance = $distance;
                        $near_word = $item;
                    }
                    if ($distance == $max_distance) {
                        $similar = similar_text($item, $word);
                        if ($similar >= $max_similar) {
                            $max_distance = $distance;
                            $max_similar = $similar;
                            $near_word = $item;
                        }
                    }
                }
            }
            if ($near_word != "") {
                $near_words[$word] = latin1_to_html($near_word);
            } else {
                if (isset($word_not_found[$word]) && $word_not_found[$word] == 1 && count($wordarray) > 1) {
                    $near_words[$word] = "/{$word}";
                }
            }
        }
        if (!isset($near_words)) {
            $near_words = "";
        }
        $res['did_you_mean'] = $near_words;
        if (count($result_array_full) == 0 || $possible_to_find == 0) {
            return $res;
        }
    }
    if (count($result_array_full) == 0) {
        return null;
    }
    arsort($result_array_full);
    if ($merge_site_results == 1 && $domain_qry == "") {
        while (list($key, $value) = each($result_array_full)) {
            if (!isset($domains_to_show[$domains[$key]])) {
                $result_array_temp[$key] = $value;
                $domains_to_show[$domains[$key]] = 1;
            } else {
                if ($domains_to_show[$domains[$key]] == 1) {
                    $domains_to_show[$domains[$key]] = array($key => $value);
                }
            }
        }
    } else {
        $result_array_temp = $result_array_full;
    }
    while (list($key, $value) = each($result_array_temp)) {
        $result_array[$key] = $value;
        if (isset($domains_to_show[$domains[$key]]) && $domains_to_show[$domains[$key]] != 1) {
            list($k, $v) = each($domains_to_show[$domains[$key]]);
            $result_array[$k] = $v;
        }
    }
    $results = count($result_array);
    $keys = array_keys($result_array);
    $maxweight = $result_array[$keys[0]];
    for ($i = ($start - 1) * $per_page; $i < min($results, ($start - 1) * $per_page + $per_page); $i++) {
        $in[] = $keys[$i];
    }
    if (!is_array($in)) {
        $res['results'] = $results;
        return $res;
    }
    $inlist = implode(",", $in);
    if ($length_of_link_desc == 0) {
        $fulltxt = "fulltxt";
    } else {
        $fulltxt = "substring(fulltxt, 1, {$length_of_link_desc})";
    }
    $query = "SELECT distinct link_id, url, title, description, language, {$fulltxt}, size FROM " . TABLE_PREFIX . "links WHERE link_id in ({$inlist})";
    $result = $db->query($query);
    echo sql_errorstring(__FILE__, __LINE__);
    $i = 0;
    while ($row = $result->fetch()) {
        $res[$i]['title'] = $row[2];
        $res[$i]['url'] = $row[1];
        if (isset($row[3]) && $row[3] != null && $show_meta_description == 1) {
            $res[$i]['summary'] = $row[3];
        } else {
            $res[$i]['summary'] = "";
        }
        $res[$i]['lang'] = $row[4];
        $res[$i]['fulltxt'] = $row[5];
        $res[$i]['size'] = $row[6];
        $res[$i]['weight'] = $result_array[$row[0]];
        /* if a language has been set for this page, and it is _not_ the
         * same language as the user language, decrease the weight
         */
        if (isset($row[4]) && $row[4] != null && strlen($row[4]) > 0 && strcasecmp($row[4], $language) != 0) {
            $res[$i]['weight'] *= 0.5;
        }
        $dom_result = $db->query("select domain from " . TABLE_PREFIX . "domains where domain_id='" . $domains[$row[0]] . "'");
        $dom_row = $dom_result->fetch();
        $res[$i]['domain'] = $dom_row[0];
        $i++;
    }
    if ($merge_site_results && $domain_qry == "") {
        sort_with_domains($res);
    } else {
        usort($res, "cmp");
    }
    echo sql_errorstring(__FILE__, __LINE__);
    /* sorting destroys the other columns in the array, restore */
    if (isset($near_words)) {
        $res['did_you_mean'] = $near_words;
    }
    $res['maxweight'] = $maxweight;
    $res['results'] = $results;
    return $res;
    /**/
}
Example #10
0
 /**
  *  Takes a word and returns it reduced to its stem.
  *
  *  If the PECL stem extension is installed its stem() function does the
  *  work. Otherwise the word is lower-cased, stripped of accents and of a
  *  trailing "'s", and every character other than a-z, 0-9, apostrophes,
  *  dots and hyphens is removed. If what remains is longer than two
  *  characters it is stemmed according to the five-step Porter stemming
  *  algorithm.
  *
  *  Note special cases here: hyphenated words (such as half-life) will
  *  only have the base after the last hyphen stemmed (so half-life would
  *  only have "life" subject to stemming). Handles multi-hyphenated
  *  words, too.
  *
  *  Results are memoised in $this->opa_stem_cache, keyed by the word as
  *  passed in.
  *
  *  @param string $word Word to reduce
  *  @param string $lang Language of the word; only consulted when the PECL
  *                      stem() function is available (passed through lang2code())
  *  @access public
  *  @return string|false Stemmed word, or false when $word is empty
  */
 public function stem($word, $lang = 'en')
 {
     if (empty($word)) {
         return false;
     }
     $orig_word = $word;
     // NOTE(review): the cache key does not include $lang, so the first
     // language a word is stemmed in wins for all later calls -- confirm
     // whether callers ever mix languages on one instance.
     if (isset($this->opa_stem_cache[$orig_word])) {
         return $this->opa_stem_cache[$orig_word];
     }
     // Use PECL function if it is installed
     if (function_exists('stem')) {
         $this->opa_stem_cache[$orig_word] = stem($word, $this->lang2code($lang));
         return $this->opa_stem_cache[$orig_word];
     }
     $word = strtolower(caRemoveAccents($word));
     // Drop the possessive suffix before any other punctuation handling.
     if (substr($word, -2) == "'s") {
         $word = substr($word, 0, -2);
     }
     if (function_exists('iconv')) {
         // iconv() returns false when it cannot convert the input (e.g.
         // malformed UTF-8). Keep the un-transliterated word in that case
         // instead of letting the failure wipe it out; the regex below
         // still strips anything that is not plain ASCII.
         $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $word);
         if ($ascii !== false) {
             $word = $ascii;
         }
     }
     // Strip punctuation, etc. Keep ' and . for URLs and contractions,
     // and - so hyphenated words can be split below.
     $word = preg_replace("/[^a-z0-9'.-]/", '', $word);
     $first = '';
     $last_hyphen = strrpos($word, '-');
     if ($last_hyphen !== false) {
         // Everything up to and including the last hyphen is kept verbatim;
         // only the trailing component is stemmed.
         $first = substr($word, 0, $last_hyphen + 1);
         // Cast: older PHP versions return false rather than '' when the
         // hyphen is the last character.
         $word = (string) substr($word, $last_hyphen + 1);
     }
     // Words of one or two characters are left as they are.
     if (strlen($word) > 2) {
         $word = $this->_step_1($word);
         $word = $this->_step_2($word);
         $word = $this->_step_3($word);
         $word = $this->_step_4($word);
         $word = $this->_step_5($word);
     }
     $result = $first . $word;
     $this->opa_stem_cache[$orig_word] = $result;
     return $result;
 }
Example #11
0
/**
 * Runs a keyword search against the index tables and returns one page of results.
 *
 * The function works in four stages:
 *   1. collect the link ids matching the excluded words, the phrase words and
 *      (optionally) the category;
 *   2. collect link ids and weights/hits for the included words, using one of
 *      four modes chosen from the first word and $type: strict ("!word"),
 *      wildcard ("wor*"), tolerant ($type == 'tol') or standard;
 *   3. intersect those lists into link_id => weight;
 *   4. sort, trim by relevance, page, and fetch the link rows.
 *
 * @param array  $searchstr Parsed query; the keys read here are '+' (words to
 *                          include), '-' (words to exclude) and '+s' (phrase words)
 * @param int    $category  Category id; values > 0 restrict results to the
 *                          categories returned by get_cats()
 * @param int    $start     1-based page number
 * @param int    $per_page  Number of results per page
 * @param string $type      Search type; the values tested here are "or", "tol"
 *                          and "phrase"
 * @param string $domain    Domain name; when it exists in the domains table the
 *                          search is restricted to it
 * @return array|null null when there are no '+' words or nothing matched and no
 *                    suggestion lookup was done. Otherwise an array that holds
 *                    either only 'did_you_mean', or only 'results', or
 *                    numerically indexed rows (title, url, fulltxt, size,
 *                    click_counter, weight, domain, path) plus 'maxweight' and
 *                    'results'.
 */
function search($searchstr, $category, $start, $per_page, $type, $domain)
{
    global $length_of_link_desc, $mysql_table_prefix, $show_meta_description, $sort_results, $query_hits;
    // NOTE(review): $type is both a parameter and listed as global here; the
    // global declaration rebinds the name, so the argument value is presumably
    // ignored in favour of the global -- confirm before relying on the parameter.
    global $stem_words, $did_you_mean_enabled, $relevance, $query, $utf8, $wildcount, $type, $case_sensitive;
    $possible_to_find = 1;
    // $domain is caller-supplied text, so escape it before it goes into the SQL string.
    $result = mysql_query("select domain_id from " . $mysql_table_prefix . "domains where domain = '" . mysql_real_escape_string($domain) . "'");
    if (mysql_num_rows($result) > 0) {
        $thisrow = mysql_fetch_array($result);
        $domain_qry = "and domain = " . $thisrow[0];
    } else {
        $domain_qry = "";
    }
    // without at least one word to include there is nothing to search for
    if (count($searchstr['+']) == 0) {
        return null;
    }
    //find all sites that should not be included in the result
    $wordarray = $searchstr['-'];
    $notlist = array();
    $not_words = 0;
    while ($not_words < count($wordarray)) {
        if ($stem_words == 1) {
            $searchword = addslashes(stem($wordarray[$not_words]));
        } else {
            $searchword = addslashes($wordarray[$not_words]);
        }
        // the link_keyword tables are sharded by the first hex digit of md5(keyword)
        $wordmd5 = substr(md5($searchword), 0, 1);
        $query1 = "SELECT link_id from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}'";
        $result = mysql_query($query1);
        while ($row = mysql_fetch_row($result)) {
            $notlist[$not_words]['id'][$row[0]] = 1;
        }
        $not_words++;
    }
    //find all sites containing the search phrase
    $wordarray = $searchstr['+s'];
    $phrase_words = 0;
    while ($phrase_words < count($wordarray)) {
        $searchword = addslashes($wordarray[$phrase_words]);
        // remembered for the phrase re-check in the standard search further down
        $phrase_query = $searchword;
        if ($case_sensitive == '1') {
            $query1 = "SELECT link_id from " . $mysql_table_prefix . "links where fulltxt like '% {$searchword}%'";
        }
        if ($case_sensitive == '0') {
            $searchword = lower_case($searchword);
            $query1 = "SELECT link_id from " . $mysql_table_prefix . "links where CONVERT(LOWER(fulltxt)USING utf8) like '% {$searchword}%'";
        }
        /*
                   if ($utf8 =='0') {
                       $searchword = lower_case($searchword);
                       $query1 = "SELECT link_id from ".$mysql_table_prefix."links where LOWER(fulltxt) like '% $searchword%'";
                   } 
        */
        $result = mysql_query($query1);
        echo mysql_error();
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            // one phrase word without any match means the whole query cannot match
            $possible_to_find = 0;
            break;
        }
        while ($row = mysql_fetch_row($result)) {
            $value = $row[0];
            $phraselist[$phrase_words]['id'][$row[0]] = 1;
            $phraselist[$phrase_words]['val'][$row[0]] = $value;
        }
        $phrase_words++;
    }
    // collect the link ids belonging to the requested category (and its sub-categories)
    if ($category > 0 && $possible_to_find == 1) {
        $allcats = get_cats($category);
        $catlist = implode(",", $allcats);
        $query1 = "select link_id from " . $mysql_table_prefix . "links, " . $mysql_table_prefix . "sites, " . $mysql_table_prefix . "categories, " . $mysql_table_prefix . "site_category where " . $mysql_table_prefix . "links.site_id = " . $mysql_table_prefix . "sites.site_id and " . $mysql_table_prefix . "sites.site_id = " . $mysql_table_prefix . "site_category.site_id and " . $mysql_table_prefix . "site_category.category_id in ({$catlist})";
        $result = mysql_query($query1);
        echo mysql_error();
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            $possible_to_find = 0;
        }
        while ($row = mysql_fetch_row($result)) {
            $category_list[$row[0]] = 1;
        }
    }
    //find all sites that include the search word
    $wordarray = $searchstr['+'];
    $words = 0;
    $starttime = getmicrotime();
    $searchword = addslashes($wordarray[$words]);
    //  get only first word of search query
    $strictpos = strpos($searchword, '!');
    //   if  ! is in position 0, we have to search strict
    if ($strictpos === 0) {
        //    ****        for 'Strict search' enter here
        $searchword = str_replace('!', '', $searchword);
        $query = "SELECT keyword_id, keyword from " . $mysql_table_prefix . "keywords where keyword = '{$searchword}'";
        echo mysql_error();
        $result = mysql_query($query);
        $num_rows = mysql_num_rows($result);
        if ($num_rows == 0) {
            // if there was no searchword in table keywords
            $possible_to_find = 0;
            $break = 1;
        }
        if ($num_rows != 0) {
            // An exact keyword match is read from the first result row. (The
            // row index used to be an uninitialised $i.)
            $keyword_id = mysql_result($result, 0, "keyword_id");
            $keyword = mysql_result($result, 0, "keyword");
            $wordmd5 = substr(md5($keyword), 0, 1);
            // calculate attribute for link_keyword table
            if ($query_hits == '1') {
                //      get query hit results
                $query1 = "SELECT distinct link_id, hits, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}' {$domain_qry} order by hits desc";
            } else {
                // get weight results
                $query1 = "SELECT link_id, weight, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}  where keyword_id = '{$keyword_id}' order by weight desc";
            }
            echo mysql_error();
            $reso = mysql_query($query1);
            $lines = mysql_num_rows($reso);
            if ($lines != 0) {
                $indx = $words;
            }
            while ($row = mysql_fetch_row($reso)) {
                $linklist[$indx]['id'][] = $row[0];
                $domains[$row[0]] = $row[2];
                $linklist[$indx]['weight'][$row[0]] = $row[1];
                if ($query_hits == '1') {
                    //      ensure that result is also available in full text
                    $txt_res = mysql_query("SELECT fulltxt FROM " . $mysql_table_prefix . "links where link_id = '{$row['0']}'");
                    echo mysql_error();
                    $full_txt = mysql_result($txt_res, 0);
                    //       get fulltxt  of this link ID
                    if ($utf8 == '0') {
                        $full_txt = lower_case($full_txt);
                    }
                    $foundit = strpos($full_txt, $searchword);
                    //      get first hit
                    // NOTE(review): a match at offset 0 is treated like "not found"
                    // here because strpos() returns 0 -- confirm whether intended.
                    if ($foundit) {
                        $page_hits = $linklist[$indx]['weight'][$row[0]];
                        $i = '0';
                        while ($i < $page_hits) {
                            //      find out if all results in full text are really strict
                            $found_in = strpos($full_txt, $searchword);
                            $tmp_front = substr($full_txt, $found_in - 1, 20);
                            //  one character before found match position
                            $pos = $found_in + strlen($searchword);
                            $tmp_behind = substr($full_txt, $pos, 20);
                            //  one character behind found match position
                            $full_txt = substr($full_txt, $pos);
                            //  get rest of fulltxt
                            //  check whether found match is realy strict
                            $found_before = preg_match("/[(a-z)-_*.\\/\\:&@\\w]/", substr($tmp_front, 0, 1));
                            $found_behind = preg_match("/[(a-z)-_*.,\\/\\:&@\\w]/", substr($tmp_behind, 0, 1));
                            if ($found_before == 1 || $found_behind == 1) {
                                //      correct count of hits
                                $linklist[$indx]['weight'][$row[0]] = $linklist[$indx]['weight'][$row[0]] - 1;
                            }
                            $i++;
                        }
                    } else {
                        $linklist[$indx]['weight'][$row[0]] = '0';
                        //      nothing found in full text. Hits = 0
                    }
                }
            }
            $words++;
        }
    } else {
        //****       if not strict-search try here
        $wild_correct = 0;
        $wildcount = substr_count($searchword, '*');
        if ($wildcount) {
            //  ****        for * wildcard , enter here
            // translate the user's * into the SQL LIKE wildcard
            $searchword = str_replace('*', '%', $searchword);
            $words = '0';
            $query = "SELECT keyword_id, keyword from " . $mysql_table_prefix . "keywords where keyword like '{$searchword}'";
            echo mysql_error();
            $result = mysql_query($query);
            $num_rows = mysql_num_rows($result);
            if ($num_rows == 0) {
                // if there was no searchword in table keywords
                $possible_to_find = 0;
                $break = 1;
            }
            if ($num_rows != 0) {
                // $all_wild collects every keyword the wildcard expanded to
                global $all_wild;
                $all_wild = '';
                for ($i = 0; $i < $num_rows; $i++) {
                    // get all searchwords as keywords from table keywords
                    $keyword_id = mysql_result($result, $i, "keyword_id");
                    $keyword = mysql_result($result, $i, "keyword");
                    $all_wild = "{$all_wild} {$keyword}";
                    $wordmd5 = substr(md5($keyword), 0, 1);
                    // calculate attribute for link_keyword table
                    if ($query_hits == '1') {
                        //      get query hit results
                        $query1 = "SELECT link_id, hits, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}  where keyword_id = '{$keyword_id}' order by hits desc";
                    } else {
                        // get weight results
                        $query1 = "SELECT link_id, weight, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}  where keyword_id = '{$keyword_id}' order by weight desc";
                    }
                    echo mysql_error();
                    $reso = mysql_query($query1);
                    $lines = mysql_num_rows($reso);
                    if ($lines == 0) {
                        if ($type != "or") {
                            $possible_to_find = 0;
                            break;
                        }
                    }
                    if ($type == "or" && $query_hits == '0') {
                        $indx = 0;
                    } else {
                        $indx = $words;
                    }
                    while ($row = mysql_fetch_row($reso)) {
                        $linklist[$indx]['id'][] = $row[0];
                        $domains[$row[0]] = $row[2];
                        $linklist[$indx]['weight'][$row[0]] = $row[1];
                        if ($query_hits == '1') {
                            //      ensure that result is also available in fulltxt
                            $searchword = str_replace("%", '', $searchword);
                            $txt_res = mysql_query("SELECT fulltxt FROM " . $mysql_table_prefix . "links where link_id = '{$row['0']}'");
                            echo mysql_error();
                            $full_txt = mysql_result($txt_res, 0);
                            //       get fulltxt  of this link ID
                            $foundit = strpos($full_txt, $searchword);
                            if (!$foundit) {
                                $linklist[$indx]['weight'][$row[0]] = '0';
                                //      nothing found in full text. Hits = 0
                            }
                        }
                    }
                }
                $words++;
            }
        } else {
            //      if no wildcard, try here
            if ($type == 'tol') {
                //  *****         if tolerant search, enter here
                // fold accented characters and HTML entities to their base vowel ...
                $acct_a = array("å", "â", "ä", "ä", "Ã\"ž", "Ä", "Ä", "ä", "á", "à", "&agrave;", "á", "&aacute;", "À", "&Agrave;", "Á", "&Aacute;");
                $base_a = array("a", "a", "a", "a", "A", "A", "A", "a", "a", "a", "a", "a", "a", "A", "A", "A", "A");
                $searchword = str_ireplace($acct_a, $base_a, $searchword);
                $acct_e = array("ê", "é", "è", "&egrave;", "é", "&eacute;", "È", "&Egrave;", "É", "&Eacute;");
                $base_e = array("e", "e", "e", "e", "e", "e", "E", "E", "E", "E");
                $searchword = str_ireplace($acct_e, $base_e, $searchword);
                $acct_i = array("ì", "&igrave;", "í", "&iacute;", "Ì", "&Igrave;", "Í", "&Iacute;", "ñ", "¡", "Ã'", "¿");
                $base_i = array("i", "i", "i", "i", "I", "I", "I", "I", "ñ", "¡", "Ñ", "¿");
                $searchword = str_ireplace($acct_i, $base_i, $searchword);
                $acct_o = array("ø", "Ø", "ô", "ó", "ò", "õ", "Ö", "ö", "ö", "ã¶", "ó", "ò", "&ograve;", "ó", "&oacute;", "Ò", "&Ograve;", "Ó", "&Oacute;");
                $base_o = array("o", "O", "o", "o", "o", "o", "O", "o", "o", "o", "Ö", "ö", "O", "o", "o", "O", "O", "O", "O");
                $searchword = str_ireplace($acct_o, $base_o, $searchword);
                $acct_u = array("ù", "ú", "û", "ü", "ü", "ÃÅ\\“", "Ãœ", "Ü", "ü", "ú", "ù", "&ugrave;", "ú", "&uacute;", "Ù", "&Ugrave;", "Ú", "&Uacute;");
                $base_u = array("u", "u", "u", "u", "u", "U", "U", "U", "u", "u", "u", "u", "u", "u", "U", "U", "U", "U");
                $searchword = str_ireplace($acct_u, $base_u, $searchword);
                // ... then turn every vowel into the SQL LIKE wildcard
                $get = array("a", "e", "i", "o", "u");
                $out = array("%", "%", "%", "%", "%");
                $searchword = str_ireplace($get, $out, $searchword);
                $query = "SELECT keyword_id, keyword from " . $mysql_table_prefix . "keywords where keyword like '{$searchword}'";
                echo mysql_error();
                $result = mysql_query($query);
                $num_rows = mysql_num_rows($result);
                if ($num_rows == 0) {
                    // if there was no searchword in table keywords
                    $possible_to_find = 0;
                    $break = 1;
                }
                if ($num_rows != 0) {
                    global $all_wild;
                    $all_wild = '';
                    for ($i = 0; $i < $num_rows; $i++) {
                        // get all searchwords as keywords from table keywords
                        $keyword_id = mysql_result($result, $i, "keyword_id");
                        $keyword = mysql_result($result, $i, "keyword");
                        $all_wild = "{$all_wild} {$keyword}";
                        $wordmd5 = substr(md5($keyword), 0, 1);
                        // calculate attribute for link_keyword table
                        if ($query_hits == '1') {
                            //      get query hit results
                            $query1 = "SELECT link_id, hits, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5} where keyword_id = '{$keyword_id}' order by hits desc";
                        } else {
                            // get weight results
                            $query1 = "SELECT link_id, weight, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5} where keyword_id = '{$keyword_id}' order by weight desc";
                        }
                        echo mysql_error();
                        $reso = mysql_query($query1);
                        $lines = mysql_num_rows($reso);
                        if ($lines != 0) {
                            $indx = $words;
                        }
                        while ($row = mysql_fetch_row($reso)) {
                            $linklist[$indx]['id'][] = $row[0];
                            $domains[$row[0]] = $row[2];
                            $linklist[$indx]['weight'][$row[0]] = $row[1];
                        }
                        //$words++;
                    }
                    $words++;
                }
            } else {
                //      finally standard search
                $words = 0;
                while ($words < count($wordarray) && $possible_to_find == 1) {
                    if ($stem_words == 1) {
                        $searchword = addslashes(stem($wordarray[$words]));
                    } else {
                        $searchword = addslashes($wordarray[$words]);
                    }
                    $wordmd5 = substr(md5($searchword), 0, 1);
                    if ($query_hits == '1') {
                        //      get query hit results
                        $query1 = "SELECT distinct link_id, hits, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}' {$domain_qry} order by hits desc";
                    } else {
                        // get weight results
                        $query1 = "SELECT distinct link_id, weight, domain from " . $mysql_table_prefix . "link_keyword{$wordmd5}, " . $mysql_table_prefix . "keywords where " . $mysql_table_prefix . "link_keyword{$wordmd5}.keyword_id= " . $mysql_table_prefix . "keywords.keyword_id and keyword='{$searchword}' {$domain_qry} order by weight desc";
                    }
                    echo mysql_error();
                    $result = mysql_query($query1);
                    $num_rows = mysql_num_rows($result);
                    if ($num_rows == 0) {
                        // in AND-style searches a single missing word rules out any result
                        if ($type != "or") {
                            $possible_to_find = 0;
                            break;
                        }
                    }
                    // OR search on weights merges every word into list 0
                    if ($type == "or" && $query_hits == '0') {
                        $indx = 0;
                    } else {
                        $indx = $words;
                    }
                    while ($row = mysql_fetch_row($result)) {
                        $linklist[$indx]['id'][] = $row[0];
                        $domains[$row[0]] = $row[2];
                        $linklist[$indx]['weight'][$row[0]] = $row[1];
                        if ($query_hits == '1') {
                            //      ensure that result is also available in fulltxt
                            if ($type == 'phrase') {
                                if ($utf8 == '0') {
                                    $searchword = lower_case($phrase_query);
                                    //      get the whole phrase
                                } else {
                                    $searchword = $phrase_query;
                                }
                            }
                            $linklist[$indx]['weight'][$row[0]] = '0';
                            $txt_res = mysql_query("SELECT fulltxt FROM " . $mysql_table_prefix . "links where link_id = '{$row['0']}'");
                            echo mysql_error();
                            $full_txt = mysql_result($txt_res, 0);
                            //       get fulltxt  of this link ID
                            if ($case_sensitive == '0') {
                                $full_txt = lower_case($full_txt);
                            }
                            if (substr_count($full_txt, $searchword)) {
                                //  found complete phrase in full text?
                                $linklist[$indx]['weight'][$row[0]] = substr_count($full_txt, $searchword);
                                //      number of hits found in this full text
                            }
                        }
                    }
                    $words++;
                }
            }
        }
    }
    //  ***** end  different search modes
    if ($type == "or") {
        $words = 1;
    }
    $result_array_full = array();
    if ($words == 1 && $not_words == 0 && $category < 1) {
        // for OR-Sarch without query_hits and one word query, we already have the result
        $result_array_full = $linklist[0]['weight'];
    } else {
        //     otherwise build an intersection of all the results
        // start from the shortest id list so the fewest candidates are checked
        $j = 1;
        $min = 0;
        while ($j < $words) {
            if (count($linklist[$min]['id']) > count($linklist[$j]['id'])) {
                $min = $j;
            }
            $j++;
        }
        $j = 0;
        $temp_array = $linklist[$min]['id'];
        $count = 0;
        while ($j < count($temp_array)) {
            $k = 0;
            //and word counter
            $n = 0;
            //not word counter
            $o = 0;
            //phrase word counter
            if ($query_hits == '1') {
                $weight = 0;
            } else {
                $weight = 1;
            }
            // $break == 1 means the candidate link is rejected
            $break = 0;
            if ($type == 'phrase' && $query_hits == '1') {
                // for PHRASE search: find out how often the phrase was found in fulltxt (not for weighting %  scores)
                while ($k < $words && $break == 0) {
                    if ($linklist[$k]['weight'][$temp_array[$j]] > 0) {
                        $weight = $linklist[$k]['weight'][$temp_array[$j]];
                    } else {
                        $break = 1;
                    }
                    $k++;
                }
            } else {
                // calculate weight for all other search modes
                while ($k < $words && $break == 0) {
                    if ($linklist[$k]['weight'][$temp_array[$j]] > 0) {
                        $weight = $weight + $linklist[$k]['weight'][$temp_array[$j]];
                    } else {
                        $break = 1;
                    }
                    $k++;
                }
            }
            // reject the link if it contains any excluded word
            while ($n < $not_words && $break == 0) {
                if ($notlist[$n]['id'][$temp_array[$j]] > 0) {
                    $break = 1;
                }
                $n++;
            }
            // reject the link unless it appears in every phrase list. The list
            // is indexed with the phrase counter $o (it used to be the not-word
            // counter $n, which checked the wrong list).
            while ($o < $phrase_words && $break == 0) {
                if (empty($phraselist[$o]['id'][$temp_array[$j]])) {
                    $break = 1;
                }
                $o++;
            }
            if ($break == 0 && $category > 0 && $category_list[$temp_array[$j]] != 1) {
                $break = 1;
            }
            if ($break == 0) {
                $result_array_full[$temp_array[$j]] = $weight;
                $count++;
            }
            $j++;
        }
    }
    //word == 1
    $end = getmicrotime() - $starttime;
    if ((count($result_array_full) == 0 || $possible_to_find == 0) && $did_you_mean_enabled == 1) {
        // nothing found: look for similar sounding keywords to suggest instead
        reset($searchstr['+']);
        foreach ($searchstr['+'] as $word) {
            $word2 = str_ireplace("Ã", "à", addslashes("{$word}"));
            $result = mysql_query("select keyword from " . $mysql_table_prefix . "keywords where soundex(keyword) = soundex('{$word2}%')");
            $max_distance = 100;
            $near_word = "";
            // keep the candidate with the smallest edit distance (below 10)
            while ($row = mysql_fetch_row($result)) {
                $distance = levenshtein($row[0], $word);
                if ($distance < $max_distance && $distance < 10) {
                    $max_distance = $distance;
                    $near_word = $row[0];
                }
            }
            if ($near_word != "" && $word != $near_word) {
                $near_words[$word] = $near_word;
            }
        }
        // $near_words is only created once a suggestion exists; report null
        // (the value callers already received) without reading an unset variable
        $res['did_you_mean'] = isset($near_words) ? $near_words : null;
        return $res;
    }
    if (count($result_array_full) == 0) {
        return null;
    }
    arsort($result_array_full);
    if ($sort_results == 4 && $domain_qry == "") {
        // output alla Google)
        // keep the best hit of every domain and remember at most one further
        // hit per domain to be listed directly below it
        while (list($key, $value) = each($result_array_full)) {
            if (!isset($domains_to_show[$domains[$key]])) {
                $result_array_temp[$key] = $value;
                $domains_to_show[$domains[$key]] = 1;
            } else {
                if ($domains_to_show[$domains[$key]] == 1) {
                    $domains_to_show[$domains[$key]] = array($key => $value);
                }
            }
        }
    } else {
        $result_array_temp = $result_array_full;
    }
    while (list($key, $value) = each($result_array_temp)) {
        $result_array[$key] = $value;
        if (isset($domains_to_show[$domains[$key]]) && $domains_to_show[$domains[$key]] != 1) {
            list($k, $v) = each($domains_to_show[$domains[$key]]);
            $result_array[$k] = $v;
        }
    }
    $keys = array_keys($result_array);
    $maxweight = $result_array[$keys[0]];
    // count how many results pass the relevance (or hit) threshold
    $count = '0';
    foreach ($result_array as $row) {
        if ($query_hits == '0') {
            //    limit result output to min. relevance level
            $weight = number_format($row / $maxweight * 100, 0);
            if ($weight >= $relevance) {
                $count = $count + 1;
            }
        } else {
            if ($row > '0') {
                //      present results only if hits in full text
                $count = $count + 1;
            }
        }
    }
    if ($count != '0') {
        $result_array = array_chunk($result_array, $count, true);
        //      limit result output(weight > relevance level OR hits in fulltext > 0)
    }
    // first chunk = the first $count entries (keys preserved)
    $result_array = $result_array[0];
    $results = count($result_array);
    // collect the link ids that belong on the requested page
    for ($i = ($start - 1) * $per_page; $i < min($results, ($start - 1) * $per_page + $per_page); $i++) {
        $in[] = $keys[$i];
    }
    if (!is_array($in)) {
        // requested page is empty: report the total only
        $res['results'] = $results;
        return $res;
    }
    $inlist = implode(",", $in);
    if ($length_of_link_desc == 0) {
        $fulltxt = "fulltxt";
    } else {
        $fulltxt = "substring(fulltxt, 1, {$length_of_link_desc})";
    }
    $query1 = "SELECT distinct link_id, url, title, description,  {$fulltxt}, size, click_counter FROM " . $mysql_table_prefix . "links WHERE link_id in ({$inlist})";
    $result = mysql_query($query1);
    echo mysql_error();
    $i = 0;
    while ($row = mysql_fetch_row($result)) {
        $res[$i]['title'] = $row[2];
        $res[$i]['url'] = $row[1];
        // the meta description replaces the text excerpt when enabled and present
        if ($row[3] != null && $show_meta_description == 1) {
            $res[$i]['fulltxt'] = $row[3];
        } else {
            $res[$i]['fulltxt'] = $row[4];
        }
        $res[$i]['size'] = $row[5];
        $res[$i]['click_counter'] = $row[6];
        $res[$i]['weight'] = $result_array[$row[0]];
        $dom_result = mysql_query("select domain from " . $mysql_table_prefix . "domains where domain_id='" . $domains[$row[0]] . "'");
        $dom_row = mysql_fetch_row($dom_result);
        $res[$i]['domain'] = $dom_row[0];
        $urlparts = parse_url($res[$i]['url']);
        //$res[$i]['path'] = $urlparts['path'];    //      get full path
        $res[$i]['path'] = eregi_replace('([^/]+)$', "", $urlparts['path']);
        //      get path without filename
        $i++;
    }
    usort($res, "cmp_weight");
    //      standard output sorted by relevance (weight)
    $dom = $res[0]['domain'];
    if ($sort_results == '4' && $domain_qry == "" || $sort_results == '3') {
        //  output alla Google  OR  by domain name
        sort_with_domains($res);
    } else {
        if ($sort_results == '2') {
            //      enter here if 'Main URLs' on top of listing
            if ($dom == 'localhost') {
                //usort($res, "cmp_path_dot");
                //usort($res, "cmp_path_slash");
            } else {
                //usort($res, "cmp_dom_dot");     //      sort domains without dots on top
            }
        }
        if ($sort_results == '5') {
            //      enter here if 'Most Popular Click' on top of listing
            sort_by_bestclick($res);
        }
    }
    echo mysql_error();
    // added after sorting so the string keys are not lost to usort()
    $res['maxweight'] = $maxweight;
    $res['results'] = $results;
    return $res;
}
Example #12
0
/**
 * Splits a search string into terms.
 *
 * Whitespace runs are collapsed to single spaces first. Terms are separated by
 * spaces; text enclosed in single quotes is kept together as one term. A backslash
 * (other than at position 0 or directly after another backslash) causes the
 * following character to be skipped over by the scanner.
 *
 * Quirks preserved from the original implementation:
 *  - A span is only split off at a space once it is at least three characters long;
 *    shorter words stay attached to the text that follows them.
 *  - Text found immediately before an opening quote is emitted as-is (neither
 *    trimmed nor stemmed).
 *
 * @param string $string    Raw query string.
 * @param bool   $stemWords When true, each emitted term is passed through stem().
 * @return array Terms in order of appearance; an empty array when nothing was found.
 */
function tokenize($string, $stemWords = false)
{
    // preg_replace replaces ereg_replace, which no longer exists as of PHP 7.
    $string = trim(preg_replace('/[[:space:]]+/', ' ', (string) $string));
    $numChars = strlen($string);
    // Initialised up front so that an input without terms yields array() instead of an undefined variable.
    $terms = array();
    $marker = '';
    $startChar = 0;
    $endChar = 0;
    // $current is the write position: the string is compacted in place as escape characters are dropped.
    $current = 0;
    $arrayPos = 0;
    for ($i = 0; $i < $numChars; ++$i, ++$current) {
        if ($string[$i] == '\\' && $i > 0 && $string[$i - 1] != '\\') {
            // Escape character: step over it so the next character is copied verbatim.
            ++$i;
        } elseif ($string[$i] == ' ') {
            // A space only ends a term when we are not inside a quoted phrase.
            if ($marker == '') {
                $endChar = $current - 1;
            }
        } elseif ($string[$i] == '\'') {
            if ($marker == '\'') {
                // Closing quote.
                if ($current - 1 == $endChar) {
                    ++$startChar;
                }
                $endChar = $current - 1;
            } elseif ($marker == '') {
                // Opening quote: flush whatever precedes it, then start the phrase after the quote.
                if ($current > $startChar) {
                    $terms[$arrayPos] = substr($string, $startChar, $current - $startChar);
                    ++$arrayPos;
                }
                $marker = '\'';
                $startChar = $current + 1;
            }
        }
        // A trailing backslash pushes $i past the end of the string. Write a space
        // (removed again by trim() below) instead of reading an invalid offset, which
        // would make the assignment throw on PHP >= 7.1.
        $string[$current] = $i < $numChars ? $string[$i] : ' ';
        if ($endChar - $startChar > 1) {
            $temp = trim(substr($string, $startChar, $endChar - $startChar + 1));
            if ($stemWords) {
                $temp = stem($temp);
            }
            $terms[$arrayPos] = $temp;
            ++$arrayPos;
            $startChar = $current + 1;
            if ($marker != '') {
                // Skip the separator that follows the closing quote.
                ++$current;
                ++$startChar;
                ++$i;
            }
            $marker = '';
        }
    }
    // Emit whatever is left after the last separator.
    if ($current - $startChar > 1) {
        $temp = trim(substr($string, $startChar, $current - $startChar));
        if ($stemWords) {
            $temp = stem($temp);
        }
        $terms[$arrayPos] = $temp;
    }
    return $terms;
}
Example #13
0
/**
 * Collapses a word list into distinct words with occurrence counts.
 *
 * The words are optionally passed through stem() (when $stem_words == 1), sorted,
 * and then walked with the array's internal pointer; each run of equal words
 * becomes one entry in the result.
 *
 * Reads the globals $min_word_length, $common, $word_upper_bound, $index_numbers,
 * $stem_words, $utf8 and $case_sensitive.
 *
 * NOTE(review): at the end of the array next() returns false, whose strlen() is 0,
 * so with $min_word_length > 0 the last pending $element does not appear to be
 * written to the result - confirm whether that is intended.
 *
 * @param array $arr Words to count.
 * @return array List of entries; index 1 holds the word, index 2 its count
 *               (the count stops growing once it reaches $word_upper_bound).
 */
function unique_array($arr)
{
    global $min_word_length, $common, $word_upper_bound;
    global $index_numbers, $stem_words, $utf8, $case_sensitive;
    // Optionally replace every word by the result of stem() before counting.
    if ($stem_words == 1) {
        $newarr = array();
        foreach ($arr as $val) {
            $newarr[] = stem($val);
        }
        $arr = $newarr;
    }
    // Sorting puts equal words next to each other so runs can be counted in one pass.
    sort($arr);
    reset($arr);
    $newarr = array();
    $i = 0;
    $counter = 1;
    // $element is the word whose run is currently being counted; start with the first one.
    // lower_case() is defined elsewhere - presumably it lower-cases the word; verify.
    if ($case_sensitive == '0') {
        $element = lower_case(current($arr));
    } else {
        $element = current($arr);
    }
    if ($utf8 == '1') {
        //  build array with utf8 support
        // In this branch $pattern is a reject pattern: a word that matches it is skipped.
        if ($index_numbers == 0) {
            $pattern = "/[0-9]+/";
        } else {
            $pattern = "/[ ]+/";
        }
        $regs = array();
        for ($n = 0; $n < sizeof($arr); $n++) {
            // Advance the internal pointer; candidates shorter than $min_word_length are ignored entirely.
            // When the candidate differs from $element, $element is stored if it is long enough,
            // does not match $pattern and is not flagged in $common; otherwise it is replaced by the candidate.
            $next_in_arr = next($arr);
            if (strlen($next_in_arr) >= $min_word_length) {
                if ($next_in_arr != $element) {
                    if (strlen($element) >= $min_word_length && !preg_match($pattern, $element) && @$common[$element] != 1) {
                        // Strip one leading "-" or backslash-quote from the word.
                        if (preg_match("/^(-|\\\\')(.*)/", $element, $regs)) {
                            $element = $regs[2];
                        }
                        // Strip one trailing "-" or backslash-quote from the word.
                        if (preg_match("/(.*)(\\\\'|-)\$/", $element, $regs)) {
                            $element = $regs[1];
                        }
                        $newarr[$i][1] = $element;
                        $newarr[$i][2] = $counter;
                        // current($arr) is the item next() just moved to, i.e. the start of the next run.
                        if ($case_sensitive == '0') {
                            $element = lower_case(current($arr));
                        } else {
                            $element = current($arr);
                        }
                        $i++;
                        $counter = 1;
                    } else {
                        // Rejected word: drop it and continue with the candidate ($counter is not reset here).
                        $element = $next_in_arr;
                    }
                } else {
                    // Same word again: count it, but never beyond $word_upper_bound.
                    if ($counter < $word_upper_bound) {
                        $counter++;
                    }
                }
            }
        }
    } else {
        //  build array without utf8 support
        // In this branch the patterns are accept patterns: a word must match both to be kept.
        if ($index_numbers == 1) {
            $pattern = "/[a-z0-9]+/";
        } else {
            $pattern = "/[a-z]+/";
        }
        $pattern2 = "/[a-z0-9]+/";
        // $pattern2 requires at least one character from a-z or 0-9 in the unmodified word
        $regs = array();
        for ($n = 0; $n < sizeof($arr); $n++) {
            // Advance the internal pointer; candidates shorter than $min_word_length are ignored entirely.
            // When the candidate differs from $element, $element is stored if it is long enough,
            // matches $pattern after remove_accents(), matches $pattern2 and is not flagged in $common.
            $next_in_arr = next($arr);
            if (strlen($next_in_arr) >= $min_word_length) {
                if ($next_in_arr != $element) {
                    if (strlen($element) >= $min_word_length && preg_match($pattern, remove_accents($element)) && preg_match($pattern2, $element) && @$common[$element] != 1) {
                        // Strip one leading "-" or backslash-quote from the word.
                        if (preg_match("/^(-|\\\\')(.*)/", $element, $regs)) {
                            $element = $regs[2];
                        }
                        // Strip one trailing "-" or backslash-quote from the word.
                        if (preg_match("/(.*)(\\\\'|-)\$/", $element, $regs)) {
                            $element = $regs[1];
                        }
                        //print "element1: $element<br />";
                        //$element = quote_replace($element);
                        //$element = htmlentities($element);
                        //$element = html_entity_encode($element);
                        //print "element2: $element<br />";
                        //$newarr[$i][1] = html_entity_decode($element);  //  Sphider-plus likes it pure
                        $newarr[$i][1] = $element;
                        $newarr[$i][2] = $counter;
                        // current($arr) is the item next() just moved to, i.e. the start of the next run.
                        if ($case_sensitive == '0') {
                            $element = lower_case(current($arr));
                        } else {
                            $element = current($arr);
                        }
                        $i++;
                        $counter = 1;
                    } else {
                        // Rejected word: drop it and continue with the candidate ($counter is not reset here).
                        $element = $next_in_arr;
                    }
                } else {
                    // Same word again: count it, but never beyond $word_upper_bound.
                    if ($counter < $word_upper_bound) {
                        $counter++;
                    }
                }
            }
        }
    }
    unset($element, $arr);
    //echo "<br>newArray:<br><pre>";print_r($newarr);echo "</pre>";
    return $newarr;
}
Example #14
0
 /**
  * Turns a free-text string into a list of stems.
  *
  * The string is trimmed and split on whitespace, duplicate words are removed,
  * words reported as stop words by DocumentMetadata::is_stop_word() are dropped,
  * and every remaining word has its non-alphanumeric characters removed before
  * being passed to stem().
  *
  * @param      string $str Text to split into words
  * @return     array  One stem() result per kept word, in order of first appearance
  */
 private static function stem_list($str)
 {
     $uniqueWords = array_unique(preg_split('/\\s+/', trim($str)));
     $result = array();
     foreach ($uniqueWords as $token) {
         // The stop-word check runs on the raw token, before punctuation is stripped.
         if (\Components\Search\Models\Basic\DocumentMetadata::is_stop_word($token)) {
             continue;
         }
         $alnumOnly = preg_replace('/[^[:alnum:]]/', '', $token);
         $result[] = stem($alnumOnly);
     }
     return $result;
 }
Example #15
0
/**
 * Builds the space-separated term string for a title.
 *
 * The text is cleaned and split into words, words that exist as keys in the global
 * $tinywords are dropped, the remaining words are optionally passed through stem(),
 * and each term is padded with '_' to a length of 4 before being joined with spaces.
 *
 * The original tested the undefined variable $utf, so the multibyte branch could
 * never run; the condition now uses the $utf8 parameter as the signature intends.
 *
 * @param string $text        Title text.
 * @param mixed  $utf8        Truthy to use the multibyte helpers (sp_mb_clean_words,
 *                            mb_split, sp_mb_str_pad); falsy for the single-byte ones.
 * @param mixed  $use_stemmer Truthy to pass each word through stem().
 * @return string Padded terms separated by single spaces, without trailing whitespace.
 */
function sp_get_title_terms($text, $utf8, $use_stemmer)
{
    global $tinywords;
    if ($utf8) {
        mb_regex_encoding('UTF-8');
        mb_internal_encoding('UTF-8');
        $wordlist = mb_split("\\W+", sp_mb_clean_words($text));
    } else {
        $wordlist = str_word_count(sp_clean_words($text), 1);
    }
    $terms = '';
    foreach ($wordlist as $word) {
        // The $tinywords lookup is done on the unstemmed word.
        if (isset($tinywords[$word])) {
            continue;
        }
        $term = $use_stemmer ? stem($word) : $word;
        $padded = $utf8 ? sp_mb_str_pad($term, 4, '_') : str_pad($term, 4, '_');
        $terms .= $padded . ' ';
    }
    return rtrim($terms);
}