Beispiel #1
0
/**
 * Counts how often each space-separated token occurs in a text.
 *
 * The text is first passed through phpdigEpureText() and then split on
 * single spaces with strtok() (which skips empty tokens).
 *
 * @param string $text raw text to analyse
 * @return array map token => number of occurrences; empty array when the
 *               cleaned text contains no token
 */
function get_keywords($text)
{
    $nbre_mots = array();

    // Compare against FALSE explicitly: strtok() may return the token "0",
    // which is falsy and would otherwise end the loop prematurely.
    for ($token = strtok(phpdigEpureText($text), ' '); $token !== false; $token = strtok(' ')) {
        if (isset($nbre_mots[$token])) {
            $nbre_mots[$token]++;
        } else {
            $nbre_mots[$token] = 1;
        }
    }

    return $nbre_mots;
}
/**
 * Indexes the content of one temporary file into the search engine tables.
 *
 * Steps performed:
 *  - reads the temp file line by line, skipping the regions delimited by
 *    PHPDIG_EXCLUDE_COMMENT / PHPDIG_INCLUDE_COMMENT;
 *  - collects alt/title attribute text and counts foreign link hosts as keywords;
 *  - splits the text in chunks of roughly CHUNK_SIZE characters;
 *  - for HTML, passes every chunk through phpdigCleanHtml() and gathers the title;
 *  - builds the summary and a hash, then asks phpdigTestDouble() whether the
 *    document is a duplicate;
 *  - if not, counts the words, updates the spider row, stores the text through
 *    phpdigWriteText() and rewrites the rows of the "engine" table.
 *
 * Prints $s_yes when the document was indexed, or $s_no plus the 'double'
 * message when it was detected as a duplicate.
 *
 * @param resource $id_connect    MySQL link used for every query
 * @param string   $tempfile      path of the file to index
 * @param int      $tempfilesize  size appended to the duplicate-detection hash
 * @param int      $site_id       site identifier passed to the spider helpers
 * @param mixed    $origine       not used in this function
 * @param string   $localdomain   host name that must not be counted as a keyword
 * @param string   $path          path passed to phpdigUpdSpiderRow()
 * @param string   $file          file name, also used as fallback title
 * @param string   $content_type  'HTML' enables tag cleaning and exclude comments
 * @param mixed    $upddate       passed to phpdigTestDouble()/phpdigUpdSpiderRow()
 * @param mixed    $last_modified passed to phpdigTestDouble()/phpdigUpdSpiderRow()
 * @param mixed    $tags          optional array with 'description'/'keywords' meta tags
 * @param mixed    $ftp_id        handle passed to (and refreshed by) phpdigWriteText()
 * @return int 0 when $tempfile is not a file, -1 for a duplicate, otherwise the
 *             spider id returned by phpdigUpdSpiderRow()
 */
function phpdigIndexFile($id_connect, $tempfile, $tempfilesize, $site_id, $origine, $localdomain, $path, $file, $content_type, $upddate, $last_modified, $tags, $ftp_id = '')
{
    //globals
    global $allowed_link_chars, $phpdig_words_chars, $common_words, $relative_script_path, $s_yes, $s_no, $br;

    //nothing to index without a readable temp file
    if (!isset($tempfile) || !is_file($tempfile)) {
        return 0;
    }

    //meta description / keywords stay plain strings unless cleaned below
    $page_desc = '';
    $page_keywords = '';
    if (APPEND_TITLE_META && is_array($tags)) {
        if (isset($tags['description'])) {
            $page_desc = phpdigCleanHtml($tags['description']);
        }
        if (isset($tags['keywords'])) {
            $page_keywords = phpdigCleanHtml($tags['keywords']);
        }
    }

    //file() returns FALSE on failure: treat it as an empty document
    $file_content = file($tempfile);
    if (!is_array($file_content)) {
        $file_content = array();
    }

    $textalts = "";
    //keyword => number of occurrences
    $nbre_mots = array();
    $n_chunk = 0;
    $text = array(0 => '');
    $exclude = false;
    foreach ($file_content as $line) {
        if (trim($line)) {
            if ($content_type == 'HTML' && trim($line) == PHPDIG_EXCLUDE_COMMENT) {
                $exclude = true;
            } else {
                if (trim($line) == PHPDIG_INCLUDE_COMMENT) {
                    $exclude = false;
                    continue;
                }
            }
            if (!$exclude) {
                //extract alt attributes of images
                if (eregi("(alt=|title=)[[:blank:]]*[\\'\"][[:blank:]]*([ a-z0-9È-Ë]+)[[:blank:]]*[\\'\"]", $line, $regs)) {
                    $textalts .= $regs[2];
                }
                //extract the domains names not local and not banned to add in keywords
                //(each matched anchor is removed from the line so the loop terminates)
                while (eregi("<a([^>]*href[[:blank:]]*=[[:blank:]]*[\\'\"]?((([a-z]{3,5}://)+(([.a-zA-Z0-9-])+(:[0-9]+)*))*({$allowed_link_chars}\\[?{$allowed_link_chars}\\]?{$allowed_link_chars}))(#[.a-zA-Z0-9-]*)?[\\'\" ]?)", $line, $regs)) {
                    $line = str_replace($regs[1], "", $line);
                    if ($regs[5] && $regs[5] != $localdomain && !eregi(BANNED, $regs[2]) && ereg('[a-z]+', $regs[5])) {
                        if (!isset($nbre_mots[$regs[5]])) {
                            $nbre_mots[$regs[5]] = 1;
                        } else {
                            $nbre_mots[$regs[5]]++;
                        }
                    }
                }
                //cut the text once the current chunk exceeds CHUNK_SIZE characters
                if (strlen($text[$n_chunk]) > CHUNK_SIZE) {
                    //cut only before an opening tag
                    if ($content_type != 'HTML' || eregi("^[[:blank:]]*<[a-z]+[^>]*>", $line)) {
                        $n_chunk++;
                        $text[$n_chunk] = " ";
                    }
                }
                $text[$n_chunk] .= trim($line) . " ";
            }
        }
    }
    //store the number of chunks
    $max_chunk = $n_chunk;
    //free the array containing file content
    unset($file_content);

    $doc_title = "";
    //purify from html tags and store the title
    if ($content_type == 'HTML') {
        foreach ($text as $n_chunk => $chunk) {
            $chunk = phpdigCleanHtml($chunk);
            $text[$n_chunk] = trim($chunk['content']) . " ";
            $doc_title .= $chunk['title'];
        }
    }
    //set the title in order <title>, filename, or unknown
    if ($doc_title) {
        $titre_resume = $doc_title;
    } elseif (isset($file) && $file) {
        $titre_resume = $file;
    } else {
        $titre_resume = "Untitled";
    }
    //title and small description
    if (!is_array($page_desc)) {
        //build the array explicitly instead of relying on the implicit
        //string-to-array conversion of an offset assignment
        $page_desc = array('content' => '');
    } else {
        $page_desc['content'] = ' ' . $page_desc['content'];
    }
    $db_some_text = preg_replace("/([ ]{2,}|\n|\r|\r\n)/", " ", implode("", $text));
    if (strlen($db_some_text) > SUMMARY_DISPLAY_LENGTH) {
        $db_some_text = substr($db_some_text, 0, SUMMARY_DISPLAY_LENGTH) . "...";
    }
    $first_words = preg_replace("/([ ]{2,}|\n|\r|\r\n)/", " ", $titre_resume) . "\n" . preg_replace("/([ ]{2,}|\n|\r|\r\n)/", " ", $page_desc['content'] . $db_some_text) . "...";
    //hashed string to detect doubles
    $md5 = md5($titre_resume . $page_desc['content'] . $text[$max_chunk]) . '_' . $tempfilesize;
    //double test :
    $phpdigTestDouble = phpdigTestDouble($id_connect, $site_id, $md5, $upddate, $last_modified);
    //if no double detected, continue indexing
    if ($phpdigTestDouble == 0) {
        $text_title = "";
        //weight of title and description is there
        if (APPEND_TITLE_META) {
            for ($itl = 0; $itl < TITLE_WEIGHT; $itl++) {
                $text_title .= $doc_title . " " . $page_desc['content'] . " ";
            }
            $add_text = $text_title;
            if (is_array($textalts) && isset($textalts['content'])) {
                $add_text .= $textalts['content'];
            }
            if (is_array($page_keywords) && isset($page_keywords['content'])) {
                $add_text .= " " . $page_keywords['content'];
            }
            array_push($text, $add_text);
        }
        //words list and occurence of each of them
        $total = 0;
        $separators = " ";
        foreach ($text as $text2) {
            $text2 = phpdigEpureText($text2, SMALL_WORDS_SIZE);
            for ($token = strtok($text2, $separators); $token !== FALSE; $token = strtok($separators)) {
                if (!isset($nbre_mots[$token])) {
                    $nbre_mots[$token] = 1;
                } else {
                    $nbre_mots[$token]++;
                }
                $total++;
            }
        }
        $distinct_words = count($nbre_mots);
        //modify the spider reccord
        $spider_id = phpdigUpdSpiderRow($id_connect, $site_id, $path, $file, $first_words, $upddate, $md5, $last_modified, $distinct_words, $tempfilesize);
        //here store extract the textual content (return a new ftp_id in case of reconnection)
        $ftp_id = phpdigWriteText($relative_script_path, $spider_id, $text, $ftp_id);
        //end of textual.
        //delete old engine reccord
        $query = "DELETE FROM " . PHPDIG_DB_PREFIX . "engine WHERE spider_id={$spider_id}";
        mysql_query($query, $id_connect);
        //database insert: one "(spider_id,key_id,weight)" tuple per kept keyword
        $engine_rows = array();
        foreach ($nbre_mots as $key => $value) {
            $key = trim($key);
            //NOTE(review): addslashes() is a weak SQL escape for crawled text;
            //consider the connection-aware escaping function of the driver
            if (!get_magic_quotes_runtime()) {
                $key = addslashes($key);
            }
            //no small words nor stop words
            if (strlen($key) > SMALL_WORDS_SIZE and strlen($key) <= MAX_WORDS_SIZE and !isset($common_words[$key]) and ereg('^[' . $phpdig_words_chars[PHPDIG_ENCODING] . '#$]', $key)) {
                //if keyword exists, retrieve id, else insert it
                $requete = "SELECT key_id FROM " . PHPDIG_DB_PREFIX . "keywords WHERE keyword = '" . $key . "'";
                $result_insert = mysql_query($requete, $id_connect);
                //a failed query returns FALSE: handle it like "keyword not found"
                $num = $result_insert ? mysql_num_rows($result_insert) : 0;
                if ($num == 0) {
                    if ($result_insert) {
                        mysql_free_result($result_insert);
                    }
                    //inserts new keyword
                    $requete = "INSERT INTO " . PHPDIG_DB_PREFIX . "keywords (keyword,twoletters) VALUES ('" . $key . "','" . addslashes(substr(str_replace('\\', '', $key), 0, 2)) . "')";
                    mysql_query($requete, $id_connect);
                    $key_id = mysql_insert_id($id_connect);
                } else {
                    //existing keyword
                    $keyid = mysql_fetch_row($result_insert);
                    mysql_free_result($result_insert);
                    $key_id = $keyid[0];
                }
                //New index record
                $engine_rows[] = "({$spider_id},{$key_id},{$value})";
            }
        }
        unset($nbre_mots);
        //One query for the entire page; skipped when no keyword was kept,
        //because "VALUES" followed by nothing is not valid SQL
        if (count($engine_rows) > 0) {
            $sqlvalues = implode(",\n", $engine_rows);
            $requete = "INSERT INTO " . PHPDIG_DB_PREFIX . "engine (spider_id,key_id, weight) VALUES {$sqlvalues}\n";
            $result_insert = mysql_query($requete, $id_connect);
        }
        print $s_yes;
    } else {
        $spider_id = -1;
        print $s_no . phpdigMsg('double') . $br;
    }
    unset($text);
    return $spider_id;
}