/**
 * Counts the occurrences of every word in a text.
 *
 * The text is first passed through phpdigEpureText() and the result is split
 * on single spaces with strtok().
 *
 * @param string $text Raw text to analyse.
 *
 * @return array Map of word => number of occurrences. Empty array when the
 *               cleaned text holds no token (the previous implementation
 *               returned an undefined variable, i.e. null, in that case).
 */
function get_keywords($text)
{
    $nbre_mots = array();

    // strtok() returns the token itself, so the end of input must be detected
    // with a strict comparison against false: a plain truthiness test stops
    // at (or skips) the perfectly valid token "0".
    for ($token = strtok(phpdigEpureText($text), ' '); $token !== false; $token = strtok(' ')) {
        if (isset($nbre_mots[$token])) {
            $nbre_mots[$token]++;
        } else {
            $nbre_mots[$token] = 1;
        }
    }

    return $nbre_mots;
}
/**
 * Indexes one locally stored document.
 *
 * Steps, in order:
 *  1. read the temporary file line by line, honouring the
 *     PHPDIG_EXCLUDE_COMMENT / PHPDIG_INCLUDE_COMMENT markers, collecting
 *     alt/title attribute text and counting linked external host names;
 *  2. cut the text into chunks of roughly CHUNK_SIZE bytes and, for HTML,
 *     run every chunk through phpdigCleanHtml() (which also yields the title);
 *  3. build the title / summary and an md5 signature, then ask
 *     phpdigTestDouble() whether the page is a duplicate;
 *  4. if it is not, count the words, update the spider row through
 *     phpdigUpdSpiderRow(), store the text through phpdigWriteText() and
 *     rewrite the page's rows in the "engine" table.
 *
 * Prints $s_yes when the page was indexed, or $s_no followed by the
 * 'double' message when it was detected as a duplicate.
 *
 * @param resource $id_connect    MySQL link handed to the mysql_* functions.
 * @param string   $tempfile      Path of the local file to index.
 * @param int      $tempfilesize  Size appended to the md5 signature and passed to phpdigUpdSpiderRow().
 * @param mixed    $site_id       Site identifier, passed to phpdigTestDouble() and phpdigUpdSpiderRow().
 * @param mixed    $origine       Not used by this function.
 * @param string   $localdomain   Host name of the indexed site; links to it are not counted as keywords.
 * @param string   $path          Passed to phpdigUpdSpiderRow().
 * @param string   $file          Passed to phpdigUpdSpiderRow(); also the title fallback.
 * @param string   $content_type  'HTML' enables the exclude marker and the HTML cleaning.
 * @param mixed    $upddate       Passed to phpdigTestDouble() and phpdigUpdSpiderRow().
 * @param mixed    $last_modified Passed to phpdigTestDouble() and phpdigUpdSpiderRow().
 * @param mixed    $tags          Array whose optional 'description' and 'keywords' entries are used
 *                                when APPEND_TITLE_META is on; any non-array value is ignored.
 * @param mixed    $ftp_id        Passed to phpdigWriteText().
 *
 * @return mixed 0 when $tempfile is missing or unreadable, -1 when the page is a
 *               duplicate, otherwise the value returned by phpdigUpdSpiderRow().
 */
function phpdigIndexFile($id_connect, $tempfile, $tempfilesize, $site_id, $origine, $localdomain, $path, $file, $content_type, $upddate, $last_modified, $tags, $ftp_id = '')
{
    global $allowed_link_chars, $phpdig_words_chars, $common_words, $relative_script_path, $s_yes, $s_no, $br;

    if (!isset($tempfile) || !is_file($tempfile)) {
        return 0;
    }

    // Meta description / keywords stay plain empty strings unless
    // phpdigCleanHtml() replaces them with its result array.
    $page_desc = '';
    $page_keywords = '';
    if (APPEND_TITLE_META && is_array($tags)) {
        if (isset($tags['description'])) {
            $page_desc = phpdigCleanHtml($tags['description']);
        }
        if (isset($tags['keywords'])) {
            $page_keywords = phpdigCleanHtml($tags['keywords']);
        }
    }

    $file_content = file($tempfile);
    if (!is_array($file_content)) {
        // file() returns false when the file cannot be read: treat it like a missing file.
        return 0;
    }

    $nbre_mots = array();   // word (or external host name) => number of occurrences
    $textalts = '';         // text collected from alt= / title= attributes
    $n_chunk = 0;
    $text = array('');
    $exclude = false;

    foreach ($file_content as $line) {
        $trimmed = trim($line);
        // Strict test so that a line holding only "0" is not mistaken for a blank line.
        if ($trimmed === '') {
            continue;
        }

        if ($content_type == 'HTML' && $trimmed == PHPDIG_EXCLUDE_COMMENT) {
            $exclude = true;
        } elseif ($trimmed == PHPDIG_INCLUDE_COMMENT) {
            $exclude = false;
            continue;
        }
        if ($exclude) {
            continue;
        }

        // Extract the alt/title attribute text (first match of the line only).
        // A trailing space keeps the captures of different lines from being glued together.
        if (eregi("(alt=|title=)[[:blank:]]*[\\'\"][[:blank:]]*([ a-z0-9È-Ë]+)[[:blank:]]*[\\'\"]", $line, $regs)) {
            $textalts .= $regs[2] . ' ';
        }

        // Count the host names of links that are neither local nor banned.
        // Each matched href is removed from the line so the loop terminates.
        while (eregi("<a([^>]*href[[:blank:]]*=[[:blank:]]*[\\'\"]?((([a-z]{3,5}://)+(([.a-zA-Z0-9-])+(:[0-9]+)*))*({$allowed_link_chars}\\[?{$allowed_link_chars}\\]?{$allowed_link_chars}))(#[.a-zA-Z0-9-]*)?[\\'\" ]?)", $line, $regs)) {
            $line = str_replace($regs[1], "", $line);
            if ($regs[5] && $regs[5] != $localdomain && !eregi(BANNED, $regs[2]) && ereg('[a-z]+', $regs[5])) {
                if (!isset($nbre_mots[$regs[5]])) {
                    $nbre_mots[$regs[5]] = 1;
                } else {
                    $nbre_mots[$regs[5]]++;
                }
            }
        }

        // Start a new chunk once the current one exceeds CHUNK_SIZE; for HTML
        // only cut in front of a line that begins with an opening tag.
        if (strlen($text[$n_chunk]) > CHUNK_SIZE) {
            if ($content_type != 'HTML' or eregi("^[[:blank:]]*<[a-z]+[^>]*>", $line)) {
                $n_chunk++;
                $text[$n_chunk] = " ";
            }
        }

        // $line may have been shortened by the link loop above, so trim it again.
        $text[$n_chunk] .= trim($line) . " ";
    }

    // Index of the last content chunk (used for the signature below).
    $max_chunk = $n_chunk;
    unset($file_content);

    // Strip the HTML and gather the title.
    $doc_title = "";
    if ($content_type == 'HTML') {
        foreach ($text as $chunk_no => $chunk) {
            $cleaned = phpdigCleanHtml($chunk);
            $text[$chunk_no] = trim($cleaned['content']) . " ";
            $doc_title .= $cleaned['title'];
        }
    }

    // Title, in order of preference: <title>, file name, "Untitled".
    if ($doc_title) {
        $titre_resume = $doc_title;
    } elseif (isset($file) && $file) {
        $titre_resume = $file;
    } else {
        $titre_resume = "Untitled";
    }

    // Description text: prefixed with a space when a meta description exists, empty otherwise.
    $desc_content = is_array($page_desc) ? ' ' . $page_desc['content'] : '';

    // Title and short summary stored with the spider row.
    $squeeze = "/([ ]{2,}|\n|\r|\r\n)/";
    $db_some_text = preg_replace($squeeze, " ", implode("", $text));
    if (strlen($db_some_text) > SUMMARY_DISPLAY_LENGTH) {
        $db_some_text = substr($db_some_text, 0, SUMMARY_DISPLAY_LENGTH) . "...";
    }
    $first_words = preg_replace($squeeze, " ", $titre_resume) . "\n"
        . preg_replace($squeeze, " ", $desc_content . $db_some_text) . "...";

    // Signature used to detect duplicate pages.
    $md5 = md5($titre_resume . $desc_content . $text[$max_chunk]) . '_' . $tempfilesize;

    $phpdigTestDouble = phpdigTestDouble($id_connect, $site_id, $md5, $upddate, $last_modified);

    if ($phpdigTestDouble == 0) {
        // Title and description are repeated TITLE_WEIGHT times to weight them.
        if (APPEND_TITLE_META) {
            $add_text = "";
            for ($itl = 0; $itl < TITLE_WEIGHT; $itl++) {
                $add_text .= $doc_title . " " . $desc_content . " ";
            }
            // $textalts is a plain string; the former is_array() test could
            // never succeed, so the collected alt/title text was silently dropped.
            if ($textalts !== '') {
                $add_text .= $textalts;
            }
            if (is_array($page_keywords) && isset($page_keywords['content'])) {
                $add_text .= " " . $page_keywords['content'];
            }
            $text[] = $add_text;
        }

        // Word list and number of occurrences of each word.
        $separators = " ";
        foreach ($text as $text2) {
            $text2 = phpdigEpureText($text2, SMALL_WORDS_SIZE);
            for ($token = strtok($text2, $separators); $token !== false; $token = strtok($separators)) {
                if (!isset($nbre_mots[$token])) {
                    $nbre_mots[$token] = 1;
                } else {
                    $nbre_mots[$token]++;
                }
            }
        }
        $distinct_words = count($nbre_mots);

        // Update the spider record.
        $spider_id = phpdigUpdSpiderRow($id_connect, $site_id, $path, $file, $first_words, $upddate, $md5, $last_modified, $distinct_words, $tempfilesize);

        // Store the textual content.
        // NOTE(review): the returned ftp id is only kept locally; $ftp_id is
        // received by value and is not part of the return value, so the caller
        // never sees it - confirm whether that is intended.
        $ftp_id = phpdigWriteText($relative_script_path, $spider_id, $text, $ftp_id);

        // Delete the old engine records of this page.
        $query = "DELETE FROM " . PHPDIG_DB_PREFIX . "engine WHERE spider_id={$spider_id}";
        mysql_query($query, $id_connect);

        // Resolve (or create) the id of every indexable keyword.
        $must_escape = !get_magic_quotes_runtime();
        $rows = array();
        foreach ($nbre_mots as $key => $value) {
            $key = trim($key);
            if ($must_escape) {
                $key = addslashes($key);
            }

            // No small words nor stop words.
            if (strlen($key) > SMALL_WORDS_SIZE and strlen($key) <= MAX_WORDS_SIZE and !isset($common_words[$key]) and ereg('^[' . $phpdig_words_chars[PHPDIG_ENCODING] . '#$]', $key)) {
                // If the keyword exists retrieve its id, else insert it.
                $requete = "SELECT key_id FROM " . PHPDIG_DB_PREFIX . "keywords WHERE keyword = '" . $key . "'";
                $result_insert = mysql_query($requete, $id_connect);
                if (!$result_insert) {
                    // Lookup failed: skip the word rather than index it under a bogus id.
                    continue;
                }

                if (mysql_num_rows($result_insert) == 0) {
                    mysql_free_result($result_insert);
                    // New keyword.
                    $requete = "INSERT INTO " . PHPDIG_DB_PREFIX . "keywords (keyword,twoletters) VALUES ('" . $key . "','" . addslashes(substr(str_replace('\\', '', $key), 0, 2)) . "')";
                    if (!mysql_query($requete, $id_connect)) {
                        continue;
                    }
                    $key_id = mysql_insert_id($id_connect);
                } else {
                    // Existing keyword.
                    $keyid = mysql_fetch_row($result_insert);
                    mysql_free_result($result_insert);
                    $key_id = $keyid[0];
                }

                $rows[] = "({$spider_id},{$key_id},{$value})";
            }
        }
        unset($nbre_mots);

        // One query for the entire page - skipped when no keyword qualified,
        // because "VALUES" followed by nothing is not valid SQL.
        if (count($rows) > 0) {
            $sqlvalues = implode(",\n", $rows);
            $requete = "INSERT INTO " . PHPDIG_DB_PREFIX . "engine (spider_id,key_id, weight) VALUES {$sqlvalues}\n";
            mysql_query($requete, $id_connect);
        }

        print $s_yes;
    } else {
        $spider_id = -1;
        print $s_no . phpdigMsg('double') . $br;
    }

    return $spider_id;
}