Example #1
0
/**
 * Strip markup from a fetched document and build the record used for indexing.
 *
 * Removes <link>, comments, noindex sections, scripts, styles and all tags,
 * decodes numeric character references to single bytes, lower-cases the text,
 * applies the global $entities replacement table and strips punctuation.
 *
 * Side effect: overwrites the globals $index_meta_keywords (1) and $index_host (0).
 *
 * @param string $file Raw document content.
 * @param string $url  URL the document was fetched from.
 * @param string $type Document type; 'pdf' and 'doc' get a title built from their first words.
 *
 * @return array Keys: fulltext, content, title (all passed through addslashes()),
 *               description, keywords, host, path, nofollow, noindex, base.
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;
    // These assignments deliberately override whatever the configuration set.
    $index_meta_keywords = 1;
    $index_host = 0;
    $urlparts = parse_url($url);
    // parse_url() omits missing components (and returns false on malformed URLs)
    $host = isset($urlparts['host']) ? $urlparts['host'] : null;
    //remove filename from path (eregi_replace() no longer exists since PHP 7)
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : "");
    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);
    $headdata = get_head_data($file);
    // default so that documents without any title do not leave $title undefined
    $title = '';
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } else {
        if ($type == 'pdf' || $type == 'doc') {
            //the title of a non-html file is its first few words
            $title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
        }
    }
    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    //create spaces between tags, so that removing tags doesnt concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    $file = preg_replace("/&nbsp;/", " ", $file);
    // the extract shown in result listings is taken before title/keywords are appended
    $fulltext = $file;
    $file .= " " . $title;
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $headdata['keywords'];
    }
    //replace codes with ascii chars
    // (the /e modifier was removed in PHP 7; callbacks do the same job without eval)
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);
    $file = strtolower($file);
    // each() was removed in PHP 8; keys of $entities are used as regex fragments
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }
    // drop any named entity that is still left
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    // replace runs of punctuation with a single blank
    $file = preg_replace("/[\\*\\^\\+\\?\\\\.\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\/@#?%&=`?><:,]+/", " ", $file);
    $file = preg_replace("/\\s+/", " ", $file);
    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $headdata['description'];
    $data['keywords'] = $headdata['keywords'];
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    return $data;
}
/**
 * Clean a fetched document and build the record used for indexing (Sphider-plus variant).
 *
 * Works in several stages, most of them switched by global admin settings:
 * head parsing via get_head_data(), removal of scripts/styles/tags, optional
 * div/element inclusion and exclusion, punctuation de-duplication, optional
 * entity decoding, quote normalisation and word splitting.
 *
 * @param string $file             Raw document content.
 * @param string $url              URL of the document.
 * @param string $type             Document type, forwarded to get_head_data().
 * @param string $charSet          Charset name; whitespace/comma collapsing only runs when it contains "8859" or "utf".
 * @param mixed  $use_nofollow     '1' removes <!--sphider_noindex--> sections; also forwarded to get_head_data().
 * @param mixed  $use_robot        Forwarded to get_head_data().
 * @param mixed  $can_leave_domain Forwarded to get_head_data().
 *
 * @return array Keys: fulltext, content, title, description, keywords, host, path,
 *               nofollow, noindex, base, cano_link, count, refresh, wait.
 *               fulltext/title/description/keywords are escaped with $db_con->real_escape_string().
 */
function clean_file($file, $url, $type, $charSet, $use_nofollow, $use_robot, $can_leave_domain)
{
    global $db_con, $entities, $index_host, $index_meta_keywords, $index_meta_description, $case_sensitive, $utf_16;
    global $home_charset, $chrSet, $del_secchars, $index_rss, $converter_dir, $div_all, $div_hyphen, $del_dups;
    global $bb_decode, $ent_decode, $cn_seg, $quotes, $dup_quotes, $clear, $only_links, $text_length, $strict_high;
    global $use_divs, $not_divs, $not_divlist, $use_divlist, $ignore_fulltxt, $index_meta_title, $js_reloc;
    global $use_elems, $not_elems, $use_elementslist, $not_elementslist, $del_elems, $conv_puny, $include_dir;
    $new = array();
    $data = array();
    $string = '';
    // note: this upper-cases the global setting itself, not a local copy
    $home_charset = strtoupper($home_charset);
    if ($utf_16) {
        //$file = mb_ereg_replace("\\0", "", $file);
        $file = utf16_to_utf8($file);
    }
    //      kill useless blanks, under scores and line feeds
    $file = preg_replace("/[  |\r\n|\\_]+/i", " ", $file);
    $urlparts = parse_addr($url);
    $host = $urlparts['host'];
    //remove filename from path and all tags which should be ignored
    $path = preg_replace('/([^\\/]+)$/i', "", $urlparts['path']);
    if ($use_nofollow == '1') {
        $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    }
    //  parse the HTML head
    $headdata = get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain, $type);
    $title = $headdata['title'];
    $description = $headdata['description'];
    $keywords = $headdata['keywords'];
    //  remove HTML head from file
    $file = preg_replace("@<head>.*?</head>@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?<\\/script>@si", " ", $file);
    //$file = str_replace("window.location.replace", " ", $file);
    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<div style=(\"|')display\\:none(\"|').*?<\\/div>@si", " ", $file);
    $file = preg_replace("@<a.*?>@si", " ", $file);
    $file = preg_replace("@<(object|img|audio|video).*?>@si", " ", $file);
    $file = preg_replace("@<(align|alt|data|body|form|height|input|id|name|span|src|table|td|type|width|layer|span).*?>@si", " ", $file);
    $file = preg_replace("@\\{document\\..*?\\}@si", " ", $file);
    //  if activated in Admin settings, ignore the full text
    if ($ignore_fulltxt == '1') {
        $file = '';
    }
    // if activated in Admin settings, remove all div contents as defined in common 'divs_not' list
    if ($not_divs == '1') {
        // JFIELD parse the doc into a DOM tree so we can
        // do cool stuff like exclude certain divs
        // recurseNodes() is expected to accumulate the kept text in the global $myFile
        global $myFile;
        $myFile = "";
        $myDepth = 0;
        $dom = new DOMDocument();
        // loadHTML('') throws a ValueError on PHP 8; an empty document simply has no child nodes
        if ($file != '') {
            $dom->loadHTML($file);
        }
        // probably a better way to get the doc
        // than skipping over the non-doc like this
        foreach ($dom->childNodes as $item) {
            if (!$item->tagName) {
                continue;
            }
            recurseNodes($item);
        }
        $file = $myFile;
        // wikipedia - don't index content of image pages
        if (preg_match("/\\/images\\//", $url) && preg_match("/^File\\:/", $title)) {
            // image description stopwords
            $mystopwords = explode(" ", "wikipedia schools english featured article sos children file");
            $mydesc = str_replace($mystopwords, " ", strtolower($description));
            // otherwise the unstripped version gets used
            $description = '';
            // remove this noise
            $title = str_replace("File:", "", $title);
            $file = "{$title} {$mydesc} picture image";
        }
        // END JFIELD
    }
    // if activated in Admin settings, fetch all div contents as defined in common 'divs_use' list
    if ($use_divs == '1') {
        //  collects the text of all selected divs; initialised so "nothing found" yields an empty body
        $divfile = '';
        foreach ($use_divlist as $thisid) {
            //    try to find divs with id as specified in common 'divs' list
            //  regexp ?
            if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid) - 1) {
                //  remove the regex capsules
                $thisid = substr($thisid, 2, strlen($thisid) - 3);
            } else {
                //  for string input only
                if (strrpos($thisid, "*") == strlen($thisid) - 1) {
                    //  replace wildcards at the end of string input
                    $thisid = str_replace("*", "(.*?)", $thisid);
                }
            }
            if (preg_match_all("@(<div class|<div id)=(\"|')" . $thisid . "(\"|').*?(</div>)@si", $file, $found_divs, PREG_OFFSET_CAPTURE)) {
                //  start a fresh list for every id; otherwise divs found for earlier
                //  ids would be appended to $divfile again on each later id
                $all_divs = array();
                foreach ($found_divs[0] as $another_div) {
                    //  walk through all found divs. Usually W3C does not allow more than one div with this id. But who knows . . . .
                    //  get actual startpos from div-array
                    $this_divstart = $another_div[1];
                    //  if required $i will become the loop counter for nested divs
                    $i = "end";
                    //  find start pos of next div
                    $nextstart = strpos($file, "<div", $this_divstart + 4);
                    //  find end pos of next div
                    $nextend = strpos($file, "</div", $this_divstart + 4);
                    //check for nested divs
                    // find start pos of next div
                    $start1 = strpos($file, "<div", $nextstart + 4);
                    if ($start1 && $start1 < $nextend) {
                        //  yes, nested
                        $i = "0";
                    }
                    while ($i != "end") {
                        //  loop for (multiple) 'nested divs'
                        $i = '0';
                        while ($nextstart && $nextstart < $nextend) {
                            // next div is a nested div?
                            //  this is only the endpos of current div
                            $nextend1 = strpos($file, "</div", $nextstart + 4);
                            //  find end pos of next div
                            $nextend = strpos($file, "</div", $nextend1 + 6);
                            // find start pos of next div
                            $nextstart = strpos($file, "<div", $nextstart + 4);
                            if ($nextstart && $nextstart < $nextend1) {
                                //  again nested in next layer?
                                //  counter for next level nested divs
                                $i++;
                            }
                        }
                        //  if nested divs were found, correct end pos of div to be deleted
                        while ($i > '1') {
                            $nextend = strpos($file, "</div", $nextend + 6);
                            $i--;
                        }
                        //  $nextend from former div (might have been nested)
                        $nextend1 = strpos($file, "</div", $nextend + 6);
                        if ($nextend1) {
                            //  defines next endpos
                            $nextend = $nextend1;
                        }
                        if (!$nextstart || $nextend < $nextstart) {
                            //  no longer nested divs
                            $i = 'end';
                        }
                    }
                    //  collect all divs to be indexed
                    $all_divs[] = substr($file, $this_divstart, $nextend + 6 - $this_divstart);
                }
                //  add content of all found divs to full text
                foreach ($all_divs as $use_thisdiv) {
                    $divfile .= " " . $use_thisdiv;
                }
            }
        }
        //  now this will be used as the body part of the page content
        $file = $divfile;
    }
    // if activated in Admin settings, fetch the content of all elements as defined in common 'elements_use' list and use the content of these elements as page content
    if ($use_elems == '1') {
        $all_elements = array();
        foreach ($use_elementslist as $this_element) {
            //    try to find elements with id as specified in common 'elements_use' list
            //  regexp ?
            if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element) - 1) {
                //  remove the regex capsules
                $this_element = substr($this_element, 2, strlen($this_element) - 3);
            }
            if (preg_match_all("@<{$this_element}.*?>.*?<\\/{$this_element}>@si", $file, $found_elements, PREG_OFFSET_CAPTURE)) {
                foreach ($found_elements as $new_element) {
                    //  walk through all found elements.
                    foreach ($new_element as $new) {
                        //  build substring without content tags
                        $string = $new[0];
                        $string = substr($string, strpos($string, ">") + 1);
                        $string = substr($string, 0, strrpos($string, "<"));
                        //  collect all elements to be indexed
                        $all_elements[] = $string;
                    }
                }
            }
        }
        $file = '';
        //  add content of all found elements to full text
        foreach ($all_elements as $use_thiselem) {
            //  now all this will be used as the body part of the page content
            $file .= " " . $use_thiselem;
        }
    }
    // if activated in Admin settings, fetch the content of all elements as defined in common 'elements_not' list and delete that part of the page
    if ($not_elems == '1') {
        //  must start empty: re-using the list filled by the 'elements_use' pass
        //  would delete the content that was just selected for indexing
        $all_elements = array();
        foreach ($not_elementslist as $this_element) {
            //    try to find elements with id as specified in common 'elements_not' list
            //  regexp ?
            if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element) - 1) {
                //  remove the regex capsules
                $this_element = substr($this_element, 2, strlen($this_element) - 3);
            }
            if (preg_match_all("@<{$this_element}.*?>.*?<\\/{$this_element}>@si", $file, $found_elements, PREG_OFFSET_CAPTURE)) {
                foreach ($found_elements as $new_element) {
                    //  walk through all found elements.
                    foreach ($new_element as $new) {
                        //  collect all elements to be ignored
                        $all_elements[] = $new[0];
                    }
                }
            }
        }
        //  remove the content of all found elements from full text
        foreach ($all_elements as $use_thiselem) {
            $file = str_replace($use_thiselem, " ", $file);
        }
    }
    //  parse bbcode
    if ($bb_decode == '1') {
        $file = bbcode($file);
    }
    $file = preg_replace("@<div.*?>@si", " ", $file);
    $file = preg_replace("@<\\/.*?>@si", " ", $file);
    //create spaces between tags, so that removing tags doesnt concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    //  remove lost end tag
    $file = preg_replace("@<\\/a>@si", " ", $file);
    //$file = strip_tags($file);  //  remove the content of HTML tags from $file (does not work for invalid written and unclosed tags)
    //  replaced since Sphider-plus version 2.7
    //  remove the content of HTML tags from $file
    $found_tags = array();
    $another_tag = array();
    if (preg_match_all("@<.*?>@s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
        foreach ($found_tags[0] as $another_tag) {
            //  walk through all found tags.
            if (strlen($another_tag[0]) < "500") {
                //  delete this tag from full text if not too long (unclosed)
                $file = str_replace($another_tag[0], " ", $file);
            }
        }
    }
    if ($del_elems) {
        //  if activated in Admin backend, delete  &lt; element /&gt; from full text
        $found_tags = array();
        $another_tag = array();
        if (preg_match_all("@\\&lt;.*?\\&gt;@s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
            foreach ($found_tags[0] as $another_tag) {
                //  walk through all found tags.
                $file = str_replace($another_tag[0], " ", $file);
            }
        }
    }
    if ($conv_puny) {
        //  make punycode readable
        require_once "{$include_dir}/idna_converter.php";
        // Initialize the converter class
        $IDN = new idna_convert(array('idn_version' => 2008));
        $found_tags = array();
        $another_tag = array();
        $this_tag = '';
        //place a blank in front of all http's
        $file = str_replace("http", " http", $file);
        if (preg_match_all("@http.*? @s", $file, $found_tags, PREG_OFFSET_CAPTURE)) {
            foreach ($found_tags[0] as $another_tag) {
                //  walk through all found tags.
                // Decode the URL to readable format
                $this_tag = $IDN->decode(rawurldecode($another_tag[0]));
                $this_tag = rawurldecode($this_tag);
                $file = str_replace($another_tag[0], $this_tag, $file);
            }
        }
    }
    //  replace special (long) blanks with standard blank
    $file = str_replace(" ", " ", $file);
    //  replace  invalid coded quotations
    $file = str_replace("—", "'", $file);
    //  replace  invalid coded long dash with correct long dash
    $file = str_replace("©", "&#151;", $file);
    //  replace TABs with a standard blank
    $file = preg_replace("/   +/", " ", $file);
    //  kill duplicate blanks
    $file = preg_replace("/  +/", " ", $file);
    //  kill duplicate underscore
    $file = preg_replace("/__+/", " ", $file);
    //  kill duplicate hyphens
    $file = preg_replace("/--+/", " ", $file);
    //  kill duplicate stars
    $file = preg_replace("/\\*\\*+/", " ", $file);
    //  kill duplicate hash tags
    $file = preg_replace("/\\#\\#+/", " ", $file);
    //  kill duplicate &nbsp; blanks
    $file = str_replace(" &nbsp;", " ", $file);
    $file = str_replace("&nbsp;&nbsp;", " ", $file);
    //  kill  break character
    $file = str_replace("&shy;", "", $file);
    //  kill duplicates. . .  Yes, I've met something
    $file = preg_replace("/\\☨\\☨+/", " ", $file);
    //  kill duplicates.  . .  no comment
    $file = preg_replace("/\\(\\(+/", " ", $file);
    //  kill further duplicated punctuation
    $file = preg_replace("/\\<\\<+/", " ", $file);
    $file = preg_replace("/\\>\\>+/", " ", $file);
    $file = preg_replace("/\\*\\~+/", " ", $file);
    $file = preg_replace("/\\+\\++/", " ", $file);
    $file = preg_replace("/\\=\\=+/", " ", $file);
    $file = preg_replace("/\\~\\~+/", " ", $file);
    //  kill some other duplicates, already met on the Internet
    if ($del_dups) {
        $file = preg_replace("/\\(\\(+/", " ", $file);
        $file = preg_replace("/\\)\\)+/", " ", $file);
        $file = preg_replace("/\\~\\~+/", " ", $file);
        $file = preg_replace("/\\=\\=+/", " ", $file);
        $file = preg_replace("/\\?\\?+/", " ", $file);
        $file = preg_replace("/\\!\\!+/", " ", $file);
        $file = preg_replace("/\\.\\.+/", " ", $file);
        $file = preg_replace("/\\<\\<+/", " ", $file);
        $file = preg_replace("/\\>\\>+/", " ", $file);
        $file = preg_replace("/\\:\\:+/", " ", $file);
        $file = preg_replace("/\\+\\++/", " ", $file);
        $file = preg_replace("/\\-\\-+/", " ", $file);
        $file = preg_replace("/\\*\\*+/", " ", $file);
    }
    //  kill duplicate &nbsp; blanks
    $file = str_replace(" &nbsp;", " ", $file);
    $file = str_replace("&nbsp;&nbsp;", " ", $file);
    //  kill  break character
    $file = str_replace("&shy;", "", $file);
    //  kill some special cases
    $file = str_replace("&quot;", "\"", $file);
    $file = str_replace("…", " ", $file);
    if ($text_length != "0") {
        //  build substring of full text until last space in front of $text_length
        $file = substr($file, 0, strrpos(substr($file, 0, $text_length), " "));
    }
    if ($index_host == 1) {
        //  separate words in host and path
        $host_sep = preg_replace("/\\.|\\/|\\\\/", " ", $host);
        $path_sep = preg_replace("/\\.|\\/|\\\\/", " ", $path);
        $file = $file . " " . $host . " " . $host_sep;
        $file = $file . " " . $path . " " . $path_sep;
    }
    if ($headdata['title'] && $index_meta_title) {
        $file = $file . " " . $title;
    }
    if ($index_meta_description == 1) {
        $file = $file . " " . $description;
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $keywords;
    }
    if ($ent_decode == '1') {
        //  as it seems, the PHP function html_entity_decode() has some problems.
        //  In case that 2 entities are placed directly together like: &mdash;&nbsp;
        //  we are obliged to be helpful by eliminating one of them
        $file = str_replace("&nbsp;", " ", $file);
        //  now PHP does not get confused
        $file = html_entity_decode($file, ENT_QUOTES, 'UTF-8');
        $title = str_replace("&nbsp;", " ", $title);
        $title = html_entity_decode($title, ENT_QUOTES, 'UTF-8');
    }
    //  correct some other trash found on the Internet
    $file = str_replace("�", "fi", $file);
    $file = str_replace("fl", "fl", $file);
    //  for URLs use entities, so that links become readable in full text
    $file = str_replace("<a href=\"http://www.", "&lt;a href=&quot;http://www.", $file);
    //  replace ... with a standard blank
    $file = str_replace("...", " ", $file);
    //  kill duplicate blanks  " ", \r, \t, \n and \f
    if (preg_match("@8859|utf@", $charSet)) {
        $file = preg_replace("/[\\s,]+/", " ", $file);
    }
    if ($index_rss == '1') {
        // try to correct bad charset interpretation
        // NOTE(review): the patterns and $trash entries below look like they lost
        // control characters somewhere; in particular the empty pattern '//si'
        // matches between every character. Verify against the upstream source.
        $file = preg_replace('/0b/si', '.', $file);
        $file = preg_replace('//si', '\'', $file);
        // kill 'LF' and the others
        $trash = array("\r\n", "\n", "\r", "0E", "0C", "0I");
    } else {
        $trash = array("\r\n", "\f", "\n", "\r", "\t");
    }
    $replace = ' ';
    $file = str_replace($trash, $replace, $file);
    //  required for result listing as extract around the keywords and for PHRASE search
    $fulltext = $file;
    if ($del_secchars) {
        $file = del_secchars($file);
    }
    //  use the cleaned $file to just highlight the pure query term in result listing
    if ($strict_high) {
        $fulltext = $file;
    }
    //  convert all single quotes into standard quote
    if ($quotes == '1') {
        $all_quotes = array("&#8216;" => "'", "&lsquo;" => "'", "&#8217;" => "'", "&rsquo;" => "'", "&#8242;" => "'", "&prime;" => "'", "‘" => "'", "‘" => "'", "´" => "'", "`" => "'", "’" => "'", "‘" => "'", "’" => "'", "’" => "'");
        //  each() was removed in PHP 8, so iterate with foreach
        foreach ($all_quotes as $quote => $plain) {
            $file = preg_replace("/" . $quote . "/si", $plain, $file);
            $title = preg_replace("/" . $quote . "/si", $plain, $title);
            $description = preg_replace("/" . $quote . "/si", $plain, $description);
            $keywords = preg_replace("/" . $quote . "/si", $plain, $keywords);
        }
    }
    //  convert all double quotes into standard quotations
    if ($dup_quotes == '1') {
        $all_quotes = array("“" => "\"", "�" => "\"", "„" => "\"");
        foreach ($all_quotes as $quote => $plain) {
            $file = preg_replace("/" . $quote . "/i", $plain, $file);
            $title = preg_replace("/" . $quote . "/i", $plain, $title);
            $description = preg_replace("/" . $quote . "/i", $plain, $description);
            $keywords = preg_replace("/" . $quote . "/i", $plain, $keywords);
        }
    }
    //  split words at hyphen, single quote, dot and comma into their basics
    if ($div_all || $div_hyphen) {
        $file = split_words($file);
        // jfield: yes, we want to index words split on hyphens,
        // but what does that have to do with changing the appearance
        // of the title? sphider_plus, you so crazy
        // $title          = split_words($title);
        $description = split_words($description);
        $keywords = split_words($keywords);
    }
    //  apply the global entity table; its keys are used as regex fragments
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
        $title = preg_replace("/" . $entity . "/i", $replacement, $title);
        $description = preg_replace("/" . $entity . "/i", $replacement, $description);
        $keywords = preg_replace("/" . $entity . "/i", $replacement, $keywords);
    }
    //  replace special (long) blanks in title
    $title = str_replace(" ", " ", $title);
    //remove all the fancy jokes some webmasters add
    $title = preg_replace("@<(.*?)>@si", "", $title);
    $title = preg_replace("@ +@si", " ", $title);
    //  replace TABs with a standard blank
    $fulltext = preg_replace("/   +/", " ", $fulltext);
    //  number of whitespace/comma separated tokens in the full text
    $count = count(preg_split("/[\\s,]+/", $fulltext));
    $data['fulltext'] = $db_con->real_escape_string($fulltext);
    $data['content'] = $file;
    $data['title'] = $db_con->real_escape_string($title);
    $data['description'] = $db_con->real_escape_string($description);
    $data['keywords'] = $db_con->real_escape_string($keywords);
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    $data['cano_link'] = $headdata['cano_link'];
    $data['count'] = $count;
    $data['refresh'] = $headdata['refresh'];
    $data['wait'] = $headdata['wait'];
    if ($clear == 1) {
        unset($file, $fulltext, $path_sep, $headdata, $regs, $urlparts, $host);
    }
    return $data;
}
Example #3
0
/**
 * Strip markup from a fetched document and build the record used for indexing.
 *
 * Variant that keeps upper-case characters when the global $utf8 is 1 and
 * names non-HTML documents after the last segment of their URL.
 *
 * @param string $file Raw document content.
 * @param string $url  URL the document was fetched from.
 * @param string $type Document type; pdf/doc/ppt/rtf/xls without a <title> use the file name as title.
 *
 * @return array Keys: fulltext, content, title (all passed through addslashes()),
 *               description, keywords, host, path, nofollow, noindex, base.
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords, $utf8, $case_sensitive;
    $urlparts = parse_url($url);
    // parse_url() omits missing components (and returns false on malformed URLs)
    $host = isset($urlparts['host']) ? $urlparts['host'] : null;
    //remove filename from path (eregi_replace() no longer exists since PHP 7)
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : "");
    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);
    $headdata = get_head_data($file);
    // default so that documents without any title do not leave $title undefined
    $title = '';
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } else {
        if ($type == 'pdf' || $type == 'doc' || $type == 'ppt' || $type == 'rtf' || $type == 'xls') {
            //create title for a non-html files: the document name, i.e. everything after the last slash
            $offset = strrpos($url, '/');
            $title = substr($url, $offset + 1);
        }
    }
    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    //create spaces between tags, so that removing tags doesnt concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    $file = preg_replace("/&nbsp;/", " ", $file);
    // the extract shown in result listings is taken before title/host/keywords are appended
    $fulltext = $file;
    $file .= " " . $title;
    if ($index_host == 1) {
        //  separate words in host and path
        $host_sep = preg_replace("/\\.|\\/|\\\\/", " ", $host);
        $path_sep = preg_replace("/\\.|\\/|\\\\/", " ", $path);
        $file = $file . " " . $host . " " . $host_sep . " " . ucwords($host_sep);
        $file = $file . " " . $path . " " . $path_sep . " " . ucwords($path_sep);
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $headdata['keywords'];
    }
    //replace codes with ascii chars
    // (the /e modifier was removed in PHP 7; callbacks do the same job without eval)
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);
    if ($utf8 != 1) {
        // if we want to build a UTF8 coded database, we also need the upper-case characters
        $file = lower_case($file);
    }
    // each() was removed in PHP 8; keys of $entities are used as regex fragments
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }
    // drop any named entity that is still left
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    // kill 'LF' and the others
    $replace = ' ';
    $file = str_replace(array("\r\n", "\n", "\r"), $replace, $file);
    // ... and the same sequences when they appear as literal backslash escapes
    $file = str_replace(array("\\r\\n", "\\n", "\\r"), $replace, $file);
    if ($utf8 == '0') {
        //  kill whitespace character
        $file = preg_replace("/\\s+/", " ", $file);
        //  compatible with Suggest Framework
        $fulltext = html_entity_decode($fulltext);
    }
    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $headdata['description'];
    $data['keywords'] = $headdata['keywords'];
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = $headdata['nofollow'];
    $data['noindex'] = $headdata['noindex'];
    $data['base'] = $headdata['base'];
    return $data;
}
/**
 * Strip markup from a fetched document and build the record used for indexing.
 *
 * Variant that converts the indexed text from UTF-8 to Latin-1, keeps periods
 * that act as decimal points and returns unescaped values.
 *
 * @param string $file Raw document content.
 * @param string $url  URL the document was fetched from.
 * @param string $type Document type; 'pdf' and 'doc' get a title built from their first words.
 *
 * @return array Keys: fulltext, content, title, description, language, keywords,
 *               host, path, nofollow, noindex, base. Head fields missing from
 *               get_head_data() are returned as empty strings.
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;
    // declared by the original code but not used in this function
    global $db;
    $urlparts = parse_url($url);
    // parse_url() omits missing components (and returns false on malformed URLs)
    $host = isset($urlparts['host']) ? $urlparts['host'] : null;
    //remove filename from path
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : "");
    $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);
    $headdata = get_head_data($file);
    $regs = array();
    // $title is intentionally left unset when no title can be determined;
    // the isset() checks below turn that into an empty string
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } else {
        if ($type == 'pdf' || $type == 'doc') {
            //the title of a non-html file is its first few words
            $title = substr($file, 0, strrpos(substr($file, 0, 40), " "));
        }
    }
    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    //create spaces between tags, so that removing tags does not concatenate strings
    $file = preg_replace("/<[\\w ]+\\/?>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    $file = preg_replace("/&nbsp;/", " ", $file);
    // trim the contents, deleting leading & trailing spaces, plus duplicate spaces
    $file = preg_replace("/\\s\\s+/", " ", trim($file));
    // the extract shown in result listings is taken before title/host/keywords are appended
    $fulltext = $file;
    if (isset($title)) {
        $file .= " " . $title;
    }
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1 && isset($headdata['keywords'])) {
        $file = $file . " " . $headdata['keywords'];
    }
    //translate the page from UTF-8 to Latin-1 (redundant if all pages use UTF-8 encoding)
    $file = utf8_decode($file);
    //replace codes with ascii chars
    // (the /e modifier was removed in PHP 7; callbacks do the same job without eval)
    $file = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $file);
    $file = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $file);
    $file = strtolower($file);
    // each() was removed in PHP 8; keys of $entities are used as regex fragments
    foreach ((array) $entities as $entity => $replacement) {
        $file = preg_replace("/" . $entity . "/i", $replacement, $file);
    }
    // drop any named entity that is still left
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    /* remove remaining punctuation (note that the text is already converted to Latin-1) */
    $file = preg_replace("/[\\*\\^\\+\\?\\\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\'\\/@#£\$%&=`´;><:,]+/", " ", $file);
    //periods are special: they are normally replaced by spaces, but a period
    //that serves as a decimal point must be preserved
    $file = preg_replace("/\\.\\.+/", " ", $file);
    $file = preg_replace("/(([^0-9])\\.)|(\\.([^0-9]))/", "\$2 \$4", $file);
    //replace multiple spaces by a single one (and replace TABs by spaces too)
    $file = preg_replace("/\\s+/", " ", $file);
    $data = array();
    $data['fulltext'] = $fulltext;
    $data['content'] = $file;
    $data['title'] = isset($title) ? $title : "";
    $data['description'] = isset($headdata['description']) ? $headdata['description'] : "";
    $data['language'] = isset($headdata['language']) ? $headdata['language'] : "";
    $data['keywords'] = isset($headdata['keywords']) ? $headdata['keywords'] : "";
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = isset($headdata['nofollow']) ? $headdata['nofollow'] : "";
    $data['noindex'] = isset($headdata['noindex']) ? $headdata['noindex'] : "";
    $data['base'] = isset($headdata['base']) ? $headdata['base'] : "";
    return $data;
}
Example #5
0
/**
 * Strips markup from a fetched document and builds the text that gets indexed.
 *
 * Steps, in order: remove sphider_noindex sections, comments and scripts; read
 * the head data via get_head_data(); extract the title; drop style blocks and
 * all remaining tags; append title (and optionally host/path and meta
 * keywords) to the indexable text; decode numeric entities and the entities
 * listed in the global $entities map; lower-case; replace punctuation with
 * spaces; finally convert every returned text field to UTF-8 when the page
 * declares another charset.
 *
 * Uses the globals $entities (pattern => replacement map), $index_host and
 * $index_meta_keywords (features are enabled when the value equals 1).
 *
 * NOTE: 'fulltext', 'content' and 'title' are returned through addslashes(),
 * exactly as before; callers that build SQL from them should prefer
 * parameterised queries over relying on this escaping.
 *
 * @param string $file Raw document contents.
 * @param string $url  URL the document was fetched from.
 * @param string $type Document type; 'pdf' and 'doc' get a title derived from
 *                     their first words when no <title> element is present.
 *
 * @return array Keys: fulltext, content, title, description, keywords, host,
 *               path, nofollow, noindex, base, charset. Head fields that
 *               get_head_data() did not provide are returned as "".
 */
function clean_file($file, $url, $type)
{
    global $entities, $index_host, $index_meta_keywords;

    $urlparts = parse_url($url);
    $host = isset($urlparts['host']) ? $urlparts['host'] : "";
    //remove filename from path
    $path = preg_replace('/([^\\/]+)$/i', "", isset($urlparts['path']) ? $urlparts['path'] : "");

    $file = preg_replace("@<!--sphider_noindex-->.*?<!--\\/sphider_noindex-->@si", " ", $file);
    $file = preg_replace("@<!--.*?-->@si", " ", $file);
    $file = preg_replace("@<script[^>]*?>.*?</script>@si", " ", $file);
    $headdata = get_head_data($file);

    //if we have no charset for the page we use utf-8 by default
    $charset = isset($headdata['charset']) ? trim($headdata['charset']) : "";
    if ($charset === "") {
        $charset = "utf-8";
    }
    $description = isset($headdata['description']) ? $headdata['description'] : "";
    $keywords = isset($headdata['keywords']) ? $headdata['keywords'] : "";

    //always defined, so the concatenation and escaping below never see an unset variable
    $title = "";
    $regs = array();
    if (preg_match("@<title *>(.*?)<\\/title*>@si", $file, $regs)) {
        $title = trim($regs[1]);
        $file = str_replace($regs[0], "", $file);
    } elseif ($type == 'pdf' || $type == 'doc') {
        //the title of a non-html file is its first few words: cut at the last
        //space found within the first 40 bytes
        $cut = strrpos(substr($file, 0, 40), " ");
        $title = $cut === false ? "" : substr($file, 0, $cut);
    }

    $file = preg_replace("@<style[^>]*>.*?<\\/style>@si", " ", $file);
    //create spaces between tags, so that removing tags doesnt concatenate strings
    $file = preg_replace("/<[\\w ]+>/", "\\0 ", $file);
    $file = preg_replace("/<\\/[\\w ]+>/", "\\0 ", $file);
    $file = strip_tags($file);
    $file = preg_replace("/&nbsp;/", " ", $file);
    $file = preg_replace("/&amp;/", " ", $file);

    //fulltext is the tag-free text before title/host/keywords are appended
    $fulltext = $file;
    $file .= " " . $title;
    if ($index_host == 1) {
        $file = $file . " " . $host . " " . $path;
    }
    if ($index_meta_keywords == 1) {
        $file = $file . " " . $keywords;
    }

    //replace numeric character references with single bytes; callbacks replace
    //the /e modifier, which no longer exists in PHP 7+
    //NOTE(review): chr() keeps only the low byte, so code points above 255 are
    //not decoded faithfully - unchanged from the previous behaviour
    $file = preg_replace_callback(
        '~&#x([0-9a-f]+);~i',
        function ($match) {
            return chr(hexdec($match[1]));
        },
        $file
    );
    $file = preg_replace_callback(
        '~&#([0-9]+);~',
        function ($match) {
            return chr((int) $match[1]);
        },
        $file
    );

    $file = mb_strtolower($file, $charset);

    //keys of $entities are used as (unquoted) regex fragments, values as replacements
    if (is_array($entities)) {
        foreach ($entities as $entity => $replacement) {
            $file = preg_replace("/" . $entity . "/i", $replacement, $file);
        }
    }
    //any named entity still left is dropped
    $file = preg_replace("/&[a-z]{1,6};/", " ", $file);
    //punctuation becomes whitespace
    $file = preg_replace("/[\\[\\*\\^\\+\\?\\\\.\\[\\]\\^\$\\|\\{\\)\\(\\}~!\"\\/@#\$%&=;><:,=]+/", " ", $file);
    //side effect kept for callers: the mbstring internal encoding is switched to the page charset
    mb_internal_encoding($charset);
    $file = preg_replace("/\n/", " ", $file);
    $file = preg_replace("/\t/", " ", $file);

    //we must convert the page to utf-8 to prevent bugs on multi-language sites
    if (strcasecmp($charset, "utf-8") !== 0) {
        //iconv() returns false on failure; keep the unconverted text in that case
        //instead of replacing the field with an empty value
        $toUtf8 = function ($text) use ($charset) {
            $converted = iconv($charset, 'utf-8', (string) $text);
            return $converted === false ? $text : $converted;
        };
        $file = $toUtf8($file);
        $fulltext = $toUtf8($fulltext);
        $title = $toUtf8($title);
        $description = $toUtf8($description);
        $keywords = $toUtf8($keywords);
    }

    //the original author reported problems with Russian text unless the
    //letters 'Ё' and 'ё' are replaced with 'Е' and 'е'
    $file = preg_replace("/Ё/", "Е", $file);
    $file = preg_replace("/ё/", "е", $file);

    $data = array();
    $data['fulltext'] = addslashes($fulltext);
    $data['content'] = addslashes($file);
    $data['title'] = addslashes($title);
    $data['description'] = $description;
    $data['keywords'] = $keywords;
    $data['host'] = $host;
    $data['path'] = $path;
    $data['nofollow'] = isset($headdata['nofollow']) ? $headdata['nofollow'] : "";
    $data['noindex'] = isset($headdata['noindex']) ? $headdata['noindex'] : "";
    $data['base'] = isset($headdata['base']) ? $headdata['base'] : "";
    $data['charset'] = $charset;
    return $data;
}