/**
 * Strips markup from a fetched document and assembles the record that is
 * stored in the search index.
 *
 * Comments, scripts, styles, <link rel> tags and sphider_noindex sections are
 * removed, the <title> is extracted, the remaining tags are stripped, and the
 * text is lower-cased with entities and punctuation replaced by blanks.
 *
 * @param string $file Raw document contents.
 * @param string $url  Address the document was fetched from.
 * @param string $type Document type; for 'pdf' and 'doc' without a <title>
 *                     the title is taken from the first words of the text.
 *
 * @return array Keys: fulltext, content, title (all addslashes-escaped),
 *               description, keywords, host, path, nofollow, noindex, base.
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;

    // These assignments overwrite the global settings: in this variant the
    // meta keywords are always indexed and host/path never are.
    $index_meta_keywords = 1;
    $index_host = 0;

    $urlparts = parse_url($url);
    $host = isset($urlparts['host']) ? $urlparts['host'] : '';
    // remove filename from path (preg_replace instead of the removed eregi_replace)
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : '');

    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);

    $headdata = get_head_data($file);

    // Default title so that pages without <title> no longer read an undefined variable.
    $title = '';
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } elseif ($type == 'pdf' || $type == 'doc') {
        // the title of a non-html file is its first few words
        $title = substr($file, 0, (int) strrpos(substr($file, 0, 40), " "));
    }

    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);

    // create spaces between tags, so that removing tags doesn't concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    // The pattern was a plain blank (a no-op); it is restored to the no-break-space
    // entity. \x26 is '&', spelled as an escape so entity decoding of this source
    // file cannot collapse the literal again.
    $file = preg_replace("/\x26nbsp;/", " ", $file);

    $fulltext = $file;
    $file .= " " . $title;
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $headdata['keywords'];
    }

    // replace numeric character references with single-byte chars; callbacks
    // replace the /e modifier, which no longer exists
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);

    $file = strtolower($file);

    // $entities maps a pattern fragment to its replacement text
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }

    // whatever named entity is left, then punctuation, then runs of whitespace
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    $file = preg_replace("/[\\*\\^\\+\\?\\\\.\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\/@#?%&=`?><:,]+/", " ", $file);
    $file = preg_replace("/\\s+/", " ", $file);

    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $headdata['description'];
    $data['keywords'] = $headdata['keywords'];
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];

    return $data;
}
/**
 * Cleans a fetched document and assembles the record stored in the search
 * index (variant driven by the Admin settings held in globals).
 *
 * Steps, in order: optional UTF-16 conversion, head parsing through
 * get_head_data(), removal of head/comments/scripts/styles and several tags,
 * optional div/element whitelists and blacklists, tag stripping, optional
 * punycode decoding, clean-up of repeated characters, optional truncation,
 * appending of host/path/title/description/keywords, entity and quote
 * normalisation, and word counting.
 *
 * @param string $file             Raw document contents.
 * @param string $url              Address the document was fetched from.
 * @param string $type             Document type, passed on to get_head_data().
 * @param string $charSet          Charset name; whitespace/comma collapsing runs
 *                                 only when it contains "8859" or "utf".
 * @param mixed  $use_nofollow     '1' removes sphider_noindex sections; also
 *                                 passed on to get_head_data().
 * @param mixed  $use_robot        Passed on to get_head_data().
 * @param mixed  $can_leave_domain Passed on to get_head_data().
 *
 * @return array Keys: fulltext, title, description, keywords (escaped with
 *               $db_con->real_escape_string), content (not escaped), host,
 *               path, nofollow, noindex, base, cano_link, count, refresh, wait.
 */
function clean_file($file, $url, $type, $charSet, $use_nofollow, $use_robot, $can_leave_domain)
{
    global $db_con, $entities, $index_host, $index_meta_keywords, $index_meta_description, $case_sensitive, $utf_16;
    global $home_charset, $chrSet, $del_secchars, $index_rss, $converter_dir, $div_all, $div_hyphen, $del_dups;
    global $bb_decode, $ent_decode, $cn_seg, $quotes, $dup_quotes, $clear, $only_links, $text_length, $strict_high;
    global $use_divs, $not_divs, $not_divlist, $use_divlist, $ignore_fulltxt, $index_meta_title, $js_reloc;
    global $use_elems, $not_elems, $use_elementslist, $not_elementslist, $del_elems, $conv_puny, $include_dir;

    $new = array();
    $data = array();
    $string = '';
    $home_charset = strtoupper($home_charset);

    if ($utf_16) {
        $file = utf16_to_utf8($file);
    }

    // kill useless blanks, under scores and line feeds
    $file = preg_replace("/[ |\r\n|\\_]+/i", " ", $file);

    $urlparts = parse_addr($url);
    $host = $urlparts['host'];
    // remove filename from path
    $path = preg_replace('/([^\\/]+)$/i', "", $urlparts['path']);

    if ($use_nofollow == '1') {
        $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    }

    // parse the HTML head
    $headdata = get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain, $type);
    $title = $headdata['title'];
    $description = $headdata['description'];
    $keywords = $headdata['keywords'];

    // remove the head and every tag that should not reach the full text
    $file = preg_replace("@<head>.*?</head>@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?<\\/script>@si", " ", $file);
    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<div style=(\"|')display\\:none(\"|').*?<\\/div>@si", " ", $file);
    $file = preg_replace("@<a.*?>@si", " ", $file);
    $file = preg_replace("@<(object|img|audio|video).*?>@si", " ", $file);
    $file = preg_replace("@<(align|alt|data|body|form|height|input|id|name|span|src|table|td|type|width|layer|span).*?>@si", " ", $file);
    $file = preg_replace("@\\{document\\..*?\\}@si", " ", $file);

    // if activated in Admin settings, ignore the full text
    if ($ignore_fulltxt == '1') {
        $file = '';
    }

    // if activated in Admin settings, remove all div contents as defined in common 'divs_not' list
    if ($not_divs == '1') {
        // JFIELD: parse the doc into a DOM tree so that certain divs can be
        // excluded; recurseNodes() writes the kept text into the global $myFile
        global $myFile;
        $myFile = "";
        $dom = new DOMDocument();
        $dom->loadHTML($file);

        foreach ($dom->childNodes as $item) {
            // skip doctype and other non-element nodes (they have no tagName)
            if (!($item instanceof DOMElement)) {
                continue;
            }
            recurseNodes($item);
        }
        $file = $myFile;

        // wikipedia - don't index content of image pages
        if (preg_match("/\\/images\\//", $url) && preg_match("/^File\\:/", $title)) {
            // image description stopwords
            $mystopwords = explode(" ", "wikipedia schools english featured article sos children file");
            $mydesc = str_replace($mystopwords, " ", strtolower($description));
            $description = ''; // otherwise the unstripped version gets used
            $title = str_replace("File:", "", $title); // remove this noise
            $file = "{$title} {$mydesc} picture image";
        }
    }

    // if activated in Admin settings, fetch all div contents as defined in common 'divs_use' list
    if ($use_divs == '1') {
        $divfile = '';
        foreach ($use_divlist as $thisid) {
            // regexp ?
            if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid) - 1) {
                $thisid = substr($thisid, 2, strlen($thisid) - 3); // remove the regex capsules
            } else {
                // for string input only
                if (strrpos($thisid, "*") == strlen($thisid) - 1) {
                    $thisid = str_replace("*", "(.*?)", $thisid); // replace wildcards at the end of string input
                }
            }

            if (preg_match_all("@(<div class|<div id)=(\"|')" . $thisid . "(\"|').*?(</div>)@si", $file, $found_divs, PREG_OFFSET_CAPTURE)) {
                foreach ($found_divs[0] as $another_div) {
                    $this_divstart = $another_div[1]; // start position of this div
                    $i = "end"; // becomes the nesting counter when nested divs are found
                    $nextstart = strpos($file, "<div", $this_divstart + 4); // start pos of next div
                    $nextend = strpos($file, "</div", $this_divstart + 4); // end pos of next div

                    // check for nested divs
                    $start1 = strpos($file, "<div", $nextstart + 4);
                    if ($start1 && $start1 < $nextend) {
                        $i = "0"; // yes, nested
                    }

                    while ($i != "end") { // loop for (multiple) nested divs
                        $i = '0';
                        while ($nextstart && $nextstart < $nextend) { // next div is a nested div?
                            $nextend1 = strpos($file, "</div", $nextstart + 4); // end pos of current div only
                            $nextend = strpos($file, "</div", $nextend1 + 6); // end pos of next div
                            $nextstart = strpos($file, "<div", $nextstart + 4); // start pos of next div
                            if ($nextstart && $nextstart < $nextend1) { // again nested in next layer?
                                $i++; // counter for next level nested divs
                            }
                        }

                        // if nested divs were found, correct the end pos of the div
                        while ($i > '1') {
                            $nextend = strpos($file, "</div", $nextend + 6);
                            $i--;
                        }
                        $nextend1 = strpos($file, "</div", $nextend + 6);
                        if ($nextend1) {
                            $nextend = $nextend1; // defines next endpos
                        }
                        if (!$nextstart || $nextend < $nextstart) {
                            $i = 'end'; // no longer nested divs
                        }
                    }

                    // Append each div exactly once. The previous shared list was never
                    // reset, so divs of earlier ids were appended again for every later id.
                    $divfile .= " " . substr($file, $this_divstart, $nextend + 6 - $this_divstart);
                }
            }
        }
        $file = $divfile; // now this will be used as the body part of the page content
    }

    // if activated in Admin settings, use only the content of the elements in the 'elements_use' list
    if ($use_elems == '1') {
        $kept_elements = array();
        foreach ($use_elementslist as $this_element) {
            // regexp ?
            if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element) - 1) {
                $this_element = substr($this_element, 2, strlen($this_element) - 3); // remove the regex capsules
            }
            if (preg_match_all("@<{$this_element}.*?>.*?<\\/{$this_element}>@si", $file, $found_elements, PREG_OFFSET_CAPTURE)) {
                foreach ($found_elements as $new_element) {
                    foreach ($new_element as $new) {
                        // build substring without the surrounding tags
                        $string = $new[0];
                        $string = substr($string, strpos($string, ">") + 1);
                        $string = substr($string, 0, strrpos($string, "<"));
                        $kept_elements[] = $string;
                    }
                }
            }
        }
        $file = '';
        foreach ($kept_elements as $use_thiselem) {
            $file .= " " . $use_thiselem;
        }
    }

    // if activated in Admin settings, delete the elements in the 'elements_not' list from the page
    if ($not_elems == '1') {
        // Separate list: sharing one array with the 'use' pass above made this
        // pass delete everything that pass had just kept.
        $dropped_elements = array();
        foreach ($not_elementslist as $this_element) {
            // regexp ?
            if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element) - 1) {
                $this_element = substr($this_element, 2, strlen($this_element) - 3); // remove the regex capsules
            }
            if (preg_match_all("@<{$this_element}.*?>.*?<\\/{$this_element}>@si", $file, $found_elements, PREG_OFFSET_CAPTURE)) {
                foreach ($found_elements as $new_element) {
                    foreach ($new_element as $new) {
                        $dropped_elements[] = $new[0];
                    }
                }
            }
        }
        foreach ($dropped_elements as $use_thiselem) {
            $file = str_replace($use_thiselem, " ", $file);
        }
    }

    // parse bbcode
    if ($bb_decode == '1') {
        $file = bbcode($file);
    }

    $file = preg_replace("@<div.*?>@si", " ", $file);
    $file = preg_replace("@<\\/.*?>@si", " ", $file);
    // create spaces between tags, so that removing tags doesn't concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("@<\\/a>@si", " ", $file); // remove lost end tag

    // Remove the remaining tags. strip_tags() is not used because it fails on
    // badly written and unclosed tags; very long matches are taken to be
    // unclosed tags and are left alone.
    $found_tags = array();
    if (preg_match_all("@<.*?>@s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
        foreach ($found_tags[0] as $another_tag) {
            if (strlen($another_tag[0]) < "500") {
                $file = str_replace($another_tag[0], " ", $file);
            }
        }
    }

    if ($del_elems) {
        // if activated in Admin backend, delete < element /> from full text
        $found_tags = array();
        if (preg_match_all("@\\<.*?\\>@s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
            foreach ($found_tags[0] as $another_tag) {
                $file = str_replace($another_tag[0], " ", $file);
            }
        }
    }

    if ($conv_puny) {
        // make punycode readable
        require_once "{$include_dir}/idna_converter.php";
        $IDN = new idna_convert(array('idn_version' => 2008));
        $found_tags = array();
        $file = str_replace("http", " http", $file); // place a blank in front of all http's
        if (preg_match_all("@http.*? @s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
            foreach ($found_tags[0] as $another_tag) {
                // decode the URL to readable format
                $this_tag = $IDN->decode(rawurldecode($another_tag[0]));
                $this_tag = rawurldecode($this_tag);
                $file = str_replace($another_tag[0], $this_tag, $file);
            }
        }
    }

    // NOTE(review): in this copy of the file the "special (long) blank" literal is
    // a plain blank, which made the replacement a no-op. It is taken to mean the
    // no-break space, as entity and as UTF-8 bytes - confirm against upstream.
    // \x26 is '&', spelled as an escape so that entity decoding of this source
    // file cannot collapse the literal.
    $special_blanks = array("\x26nbsp;", "\xC2\xA0");
    // soft hyphen, as entity and as UTF-8 bytes ("break character")
    $break_chars = array("\x26shy;", "\xC2\xAD");

    $file = str_replace($special_blanks, " ", $file);
    $file = str_replace("â€â€ÂÂ", "'", $file); // replace invalid coded quotations
    $file = str_replace("©", "—", $file); // replace invalid coded long dash with correct long dash
    // replace TABs with a standard blank and kill duplicate blanks; the TAB
    // pattern previously matched blanks only
    $file = preg_replace("/[ \t]+/", " ", $file);
    // kill duplicate underscores, hyphens, stars and hash signs
    $file = preg_replace(array("/__+/", "/--+/", "/\\*\\*+/", "/\\#\\#+/"), " ", $file);
    $file = str_replace($special_blanks, " ", $file);
    $file = preg_replace("/  +/", " ", $file); // kill duplicate blanks
    $file = str_replace($break_chars, "", $file); // kill break character
    // kill further duplicates
    $file = preg_replace(
        array("/\\☨\\☨+/", "/\\(\\(+/", "/\\<\\<+/", "/\\>\\>+/", "/\\*\\~+/", "/\\+\\++/", "/\\=\\=+/", "/\\~\\~+/"),
        " ",
        $file
    );

    // kill some other duplicates, already met on the Internet
    if ($del_dups) {
        $file = preg_replace(
            array(
                "/\\(\\(+/", "/\\)\\)+/", "/\\~\\~+/", "/\\=\\=+/", "/\\?\\?+/", "/\\!\\!+/", "/\\.\\.+/",
                "/\\<\\<+/", "/\\>\\>+/", "/\\:\\:+/", "/\\+\\++/", "/\\-\\-+/", "/\\*\\*+/",
            ),
            " ",
            $file
        );
    }

    $file = str_replace($special_blanks, " ", $file);
    $file = preg_replace("/  +/", " ", $file); // kill duplicate blanks
    $file = str_replace($break_chars, "", $file); // kill break character

    // kill some special cases; the quote entity literal was an unterminated
    // string (""") in this copy of the file
    $file = str_replace("\x26quot;", "\"", $file);
    $file = str_replace("…", " ", $file);

    if ($text_length != "0") {
        // build substring of full text until last space in front of $text_length
        $file = substr($file, 0, (int) strrpos(substr($file, 0, $text_length), " "));
    }

    if ($index_host == 1) {
        // separate words in host and path
        $host_sep = preg_replace("/\\.|\\/|\\\\/", " ", $host);
        $path_sep = preg_replace("/\\.|\\/|\\\\/", " ", $path);
        $file = $file . " " . $host . " " . $host_sep;
        $file = $file . " " . $path . " " . $path_sep;
    }
    if ($headdata['title'] && $index_meta_title) {
        $file = $file . " " . $title;
    }
    if ($index_meta_description == 1) {
        $file = $file . " " . $description;
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $keywords;
    }

    if ($ent_decode == '1') {
        // NOTE(review): the original comment describes a pre-pass that removed one
        // of two directly adjacent entities before decoding. In this copy of the
        // file that replacement is blank-for-blank (a no-op), so the entity pair
        // cannot be recovered here - restore it from upstream if it is needed.
        $file = html_entity_decode($file, ENT_QUOTES, 'UTF-8');
        $title = html_entity_decode($title, ENT_QUOTES, 'UTF-8');
    }

    // correct some other trash found on the Internet
    $file = str_replace("�", "fi", $file);
    // NOTE(review): the search string was identical to its replacement ("fl") in
    // this copy of the file; presumably the U+FB02 ligature - confirm.
    $file = str_replace("\xEF\xAC\x82", "fl", $file);

    // for URLs use entities, so that links become readable in full text; the
    // replacement literal was broken syntax (unescaped quote) in this copy
    $file = str_replace("<a href=\"http://www.", "\x26lt;a href=\x26quot;http://www.", $file);

    // replace .. with a standard blank
    $file = str_replace("...", " ", $file);

    // kill duplicate blanks " ", \r, \t, \n and \f
    if (preg_match("@8859|utf@", $charSet)) {
        $file = preg_replace("/[\\s,]+/", " ", $file);
    }

    if ($index_rss == '1') {
        // try to correct bad charset interpretation
        // NOTE(review): these patterns look damaged. The second one is empty, so it
        // matches at every position and inserts a quote between all characters;
        // '0b' and the "0E"/"0C"/"0I" entries look like remnants of control
        // characters. The intended bytes are not recoverable from this copy, so
        // the statements are kept as found - verify against upstream.
        $file = preg_replace('/0b/si', '.', $file);
        $file = preg_replace('//si', '\'', $file);
        $trash = array("\r\n", "\n", "\r", "0E", "0C", "0I"); // kill 'LF' and the others
    } else {
        $trash = array("\r\n", "\f", "\n", "\r", "\t");
    }
    $file = str_replace($trash, ' ', $file);

    // required for result listing as extract around the keywords and for PHRASE search
    $fulltext = $file;

    if ($del_secchars) {
        $file = del_secchars($file);
    }

    // use the cleaned $file to just highlight the pure query term in result listing
    if ($strict_high) {
        $fulltext = $file;
    }

    // convert all single quotes into standard quote
    if ($quotes == '1') {
        // the original table repeated several of these keys; duplicates are dropped
        $all_quotes = array("‘" => "'", "’" => "'", "′" => "'", "´" => "'", "`" => "'");
        foreach ($all_quotes as $quote => $plain) {
            $file = preg_replace("/" . $quote . "/si", $plain, $file);
            $title = preg_replace("/" . $quote . "/si", $plain, $title);
            $description = preg_replace("/" . $quote . "/si", $plain, $description);
            $keywords = preg_replace("/" . $quote . "/si", $plain, $keywords);
        }
    }

    // convert all double quotes into standard quotations
    if ($dup_quotes == '1') {
        $all_quotes = array("“" => "\"", "�" => "\"", "„" => "\"");
        foreach ($all_quotes as $quote => $plain) {
            $file = preg_replace("/" . $quote . "/i", $plain, $file);
            $title = preg_replace("/" . $quote . "/i", $plain, $title);
            $description = preg_replace("/" . $quote . "/i", $plain, $description);
            $keywords = preg_replace("/" . $quote . "/i", $plain, $keywords);
        }
    }

    // split words at hyphen, single quote, dot and comma into their basics
    if ($div_all || $div_hyphen) {
        $file = split_words($file);
        // jfield: the title is deliberately not split, so that its appearance
        // in the result listing stays unchanged
        $description = split_words($description);
        $keywords = split_words($keywords);
    }

    // $entities maps a pattern fragment to its replacement text
    // (foreach replaces each(), which was removed in PHP 8)
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
        $title = preg_replace("/" . $entity . "/i", $replacement, $title);
        $description = preg_replace("/" . $entity . "/i", $replacement, $description);
        $keywords = preg_replace("/" . $entity . "/i", $replacement, $keywords);
    }

    // replace special (long) blanks in title
    $title = str_replace($special_blanks, " ", $title);
    // remove all the fancy jokes some webmasters add
    $title = preg_replace("@<(.*?)>@si", "", $title);
    // replace TABs with a standard blank and collapse runs of blanks
    $title = preg_replace("@[ \t]+@si", " ", $title);

    $fulltext = preg_replace("/ +/", " ", $fulltext);
    $count = count(preg_split("/[\\s,]+/", $fulltext));

    $data['fulltext'] = $db_con->real_escape_string($fulltext);
    $data['content'] = $file;
    $data['title'] = $db_con->real_escape_string($title);
    $data['description'] = $db_con->real_escape_string($description);
    $data['keywords'] = $db_con->real_escape_string($keywords);
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    $data['cano_link'] = $headdata['cano_link'];
    $data['count'] = $count;
    $data['refresh'] = $headdata['refresh'];
    $data['wait'] = $headdata['wait'];

    if ($clear == 1) {
        unset($file, $fulltext, $path_sep, $headdata, $urlparts, $host);
    }

    return $data;
}
/**
 * Strips markup from a fetched document and assembles the record that is
 * stored in the search index (variant with optional UTF-8 database).
 *
 * When the global $utf8 is not 1 the text is lower-cased through
 * lower_case(); when it equals '0' whitespace is collapsed and the full text
 * is entity-decoded.
 *
 * @param string $file Raw document contents.
 * @param string $url  Address the document was fetched from.
 * @param string $type Document type; pdf, doc, ppt, rtf and xls documents
 *                     without a <title> are titled with the last URL segment.
 *
 * @return array Keys: fulltext, content, title (all addslashes-escaped),
 *               description, keywords, host, path, nofollow, noindex, base.
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords, $utf8;

    $urlparts = parse_url($url);
    $host = isset($urlparts['host']) ? $urlparts['host'] : '';
    // remove filename from path (preg_replace instead of the removed eregi_replace)
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : '');

    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);

    $headdata = get_head_data($file);

    // Default title so that pages without <title> no longer read an undefined variable.
    $title = '';
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } elseif (in_array($type, array('pdf', 'doc', 'ppt', 'rtf', 'xls'))) {
        // title of a non-html file: the document name, i.e. what follows the last slash
        $offset = strrpos($url, '/');
        $title = substr($url, $offset + 1);
    }

    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);

    // create spaces between tags, so that removing tags doesn't concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    // The pattern was a plain blank (a no-op); it is restored to the no-break-space
    // entity. \x26 is '&', spelled as an escape so entity decoding of this source
    // file cannot collapse the literal again.
    $file = preg_replace("/\x26nbsp;/", " ", $file);

    $fulltext = $file;
    $file .= " " . $title;

    if ($index_host == 1) {
        // separate words in host and path
        $host_sep = preg_replace("/\\.|\\/|\\\\/", " ", $host);
        $path_sep = preg_replace("/\\.|\\/|\\\\/", " ", $path);
        $file = $file . " " . $host . " " . $host_sep . " " . ucwords($host_sep);
        $file = $file . " " . $path . " " . $path_sep . " " . ucwords($path_sep);
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $headdata['keywords'];
    }

    // replace numeric character references with single-byte chars; callbacks
    // replace the /e modifier, which no longer exists
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);

    if ($utf8 != 1) {
        // a UTF-8 coded database also needs the upper-case characters,
        // so lower-casing happens only for the other case
        $file = lower_case($file);
    }

    // $entities maps a pattern fragment to its replacement text
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);

    // kill real line breaks first, then the literal backslash sequences
    $trash = array("\r\n", "\n", "\r", "\\r\\n", "\\n", "\\r");
    $file = str_replace($trash, ' ', $file);

    if ($utf8 == '0') {
        $file = preg_replace("/\\s+/", " ", $file); // kill whitespace character
        $fulltext = html_entity_decode($fulltext); // compatible with Suggest Framework
    }

    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $headdata['description'];
    $data['keywords'] = $headdata['keywords'];
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];

    return $data;
}
/**
 * Strips markup from a fetched document and assembles the record that is
 * stored in the search index (variant that converts the text to Latin-1 and
 * keeps decimal points).
 *
 * @param string $file Raw document contents, expected to be UTF-8 encoded.
 * @param string $url  Address the document was fetched from.
 * @param string $type Document type; for 'pdf' and 'doc' without a <title>
 *                     the title is taken from the first words of the text.
 *
 * @return array Keys: fulltext, content, title, description, language,
 *               keywords, host, path, nofollow, noindex, base. Values are not
 *               escaped; missing head data is returned as "".
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;

    $urlparts = parse_url($url);
    $host = isset($urlparts['host']) ? $urlparts['host'] : '';
    // remove filename from path
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : '');

    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);

    $headdata = get_head_data($file);

    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } elseif ($type == 'pdf' || $type == 'doc') {
        // the title of a non-html file is its first few words
        $title = substr($file, 0, (int) strrpos(substr($file, 0, 40), " "));
    }

    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);

    // create spaces between tags, so that removing tags does not concatenate strings
    $file = preg_replace("/<[\\w ]+\\/?>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    // The pattern was a plain blank (a no-op); it is restored to the no-break-space
    // entity. \x26 is '&', spelled as an escape so entity decoding of this source
    // file cannot collapse the literal again.
    $file = preg_replace("/\x26nbsp;/", " ", $file);

    // trim the contents, deleting leading & trailing spaces, plus duplicate spaces
    $file = preg_replace("/\\s\\s+/", " ", trim($file));

    $fulltext = $file;
    if (isset($title)) {
        $file .= " " . $title;
    }
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1 && isset($headdata['keywords'])) {
        $file = $file . " " . $headdata['keywords'];
    }

    // translate the page from UTF-8 to Latin-1
    // NOTE(review): utf8_decode() is deprecated as of PHP 8.2; kept so that the
    // byte-level behaviour stays exactly as before.
    $file = utf8_decode($file);

    // replace numeric character references with single-byte chars; callbacks
    // replace the /e modifier, which no longer exists
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);

    $file = strtolower($file);

    // $entities maps a pattern fragment to its replacement text
    // (foreach replaces each(), which was removed in PHP 8)
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }

    // remove remaining HTML entities (note that the text is already converted to Latin-1)
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    $file = preg_replace("/[\\*\\^\\+\\?\\\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\'\\/@#£\$%&=`´;><:,]+/", " ", $file);

    // periods are special: they are normally replaced by spaces, but a period
    // that serves as a decimal point must be preserved
    $file = preg_replace("/\\.\\.+/", " ", $file);
    $file = preg_replace("/(([^0-9])\\.)|(\\.([^0-9]))/", "\$2 \$4", $file);

    // replace multiple spaces by a single one (and replace TABs by spaces too)
    $file = preg_replace("/\\s+/", " ", $file);

    $data = array();
    $data['fulltext'] = $fulltext;
    $data['content'] = $file;
    $data['title'] = isset($title) ? $title : "";
    $data['description'] = isset($headdata['description']) ? $headdata['description'] : "";
    $data['language'] = isset($headdata['language']) ? $headdata['language'] : "";
    $data['keywords'] = isset($headdata['keywords']) ? $headdata['keywords'] : "";
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = isset($headdata['nofollow']) ? $headdata['nofollow'] : "";
    $data['noindex'] = isset($headdata['noindex']) ? $headdata['noindex'] : "";
    $data['base'] = isset($headdata['base']) ? $headdata['base'] : "";

    return $data;
}
/**
 * Strips markup from a fetched document and assembles the record that is
 * stored in the search index (multi-language variant: the result is converted
 * from the page charset to UTF-8).
 *
 * Side effect: mb_internal_encoding() is set to the page charset.
 *
 * @param string $file Raw document contents in the page's own charset.
 * @param string $url  Address the document was fetched from.
 * @param string $type Document type; for 'pdf' and 'doc' without a <title>
 *                     the title is taken from the first words of the text.
 *
 * @return array Keys: fulltext, content, title (all addslashes-escaped),
 *               description, keywords, host, path, nofollow, noindex, base,
 *               charset. Text values are UTF-8 when the page charset differs
 *               from "utf-8".
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;

    $urlparts = parse_url($url);
    $host = isset($urlparts['host']) ? $urlparts['host'] : '';
    // remove filename from path
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : '');

    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);

    $headdata = get_head_data($file);

    // if we have no charset for page we use utf-8 by default
    if (!isset($headdata['charset']) || trim($headdata['charset']) == "") {
        $headdata['charset'] = "utf-8";
    }
    $charset = $headdata['charset'];

    // Local copies of the head texts. The conversion below used to run on
    // undefined $description/$keywords variables and discard the result, so
    // the returned description and keywords were never converted.
    $description = isset($headdata['description']) ? $headdata['description'] : '';
    $keywords = isset($headdata['keywords']) ? $headdata['keywords'] : '';

    // Default title so that pages without <title> no longer read an undefined variable.
    $title = '';
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } elseif ($type == 'pdf' || $type == 'doc') {
        // the title of a non-html file is its first few words
        $title = substr($file, 0, (int) strrpos(substr($file, 0, 40), " "));
    }

    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);

    // create spaces between tags, so that removing tags doesn't concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);

    // The two patterns were a plain blank (a no-op) and a bare '&'. The bare '&'
    // removed every ampersand and so disabled all entity handling further down.
    // They are restored to the no-break-space and ampersand entities. \x26 is '&',
    // spelled as an escape so entity decoding of this source file cannot
    // collapse the literals again.
    $file = preg_replace("/\x26nbsp;/", " ", $file);
    $file = preg_replace("/\x26amp;/", " ", $file);

    $fulltext = $file;
    $file .= " " . $title;
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1) {
        // still in the page charset here; $file is converted as a whole below
        $file = $file . " " . $keywords;
    }

    // replace numeric character references with single-byte chars; callbacks
    // replace the /e modifier, which no longer exists
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);

    $file = mb_strtolower($file, $charset);

    // $entities maps a pattern fragment to its replacement text
    // (foreach replaces each(), which was removed in PHP 8)
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }

    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    $file = preg_replace("/[\\[\\*\\^\\+\\?\\\\.\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\/@#\$%&=;><:,=]+/", " ", $file);

    mb_internal_encoding($charset);

    $file = preg_replace("/\n/", " ", $file);
    $file = preg_replace("/\t/", " ", $file);

    // convert everything to utf-8 to prevent problems with multi-language sites
    if ($charset != "utf-8") {
        $file = iconv($charset, 'utf-8', $file);
        $fulltext = iconv($charset, 'utf-8', $fulltext);
        $title = iconv($charset, 'utf-8', $title);
        $description = iconv($charset, 'utf-8', $description);
        $keywords = iconv($charset, 'utf-8', $keywords);
    }

    // Russian: index the letter io as ie so that both spellings match
    $file = preg_replace("/Ё/", "Е", $file);
    $file = preg_replace("/ё/", "е", $file);

    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $description;
    $data['keywords'] = $keywords;
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    $data['charset'] = $charset;

    return $data;
}