// NOTE: this excerpt starts inside two enclosing blocks that were opened
// earlier in the script (they are closed further down). $mime, $content,
// $title, $url, $rank, $count, $toindex and $indexer are set up there.

        // Read the page's <meta> tags by handing the raw content to
        // get_meta_tags() through a base64 data: URI.
        $tmp = get_meta_tags("data://{$mime};base64," . base64_encode($content));
        if (isset($tmp['description'])) {
            // Keep only bytes in the range 0x20-0x7F. preg_replace() returns
            // null on a PCRE error, hence the cast.
            $desc = (string) preg_replace('/[^\\x20-\\x7F]+/', '', trim($tmp['description']));
        } else {
            $desc = '';
        }

        // This is the rest of the content. We try to clean it somewhat using
        // the custom function html2txt, which works about 90% of the time.
        $content = (string) preg_replace('/[^\\x20-\\x7F]+/', '', trim(strip_tags(html2txt($content))));

        // If values aren't set, try to derive them here: the description from
        // the content, then the title from the description. Both strings are
        // ASCII-only at this point, so byte-wise strlen()/substr() are safe.
        // The ellipsis is appended only when text was actually cut off.
        if ($desc === '' && $content !== '') {
            $desc = strlen($content) > 200 ? substr($content, 0, 200) . '...' : $content;
        }
        if ($title == '' && $desc !== '') {
            // $title is assigned outside this excerpt, so the loose comparison
            // is kept in case it is null/false rather than ''.
            $title = strlen($desc) > 50 ? substr($desc, 0, 50) . '...' : $desc;
        }

        $count++;

        // If we don't have a title, then we don't have a description or
        // content either, so don't add the page to the index.
        if ($title != '') {
            $toindex[] = array($url, $title, $desc, $rank);
            echo 'INDEXING ' . $count . "\r\n";
        } else {
            echo 'SKIP ' . $count . "\r\n";
        }
    }
}

// Hand every collected (url, title, description, rank) tuple to the indexer.
echo "Starting Index\r\n";
$indexer->index($toindex);
// NOTE: this excerpt starts inside four enclosing blocks that were opened
// earlier in the script (they are closed further down). $line, $offset,
// $counter, $document, $documents, $fp and $indexer are set up there.

                // Drop the first character of the line and split the rest on
                // the first " = " into a key and a value. The limit of 2 keeps
                // values that themselves contain " = " intact.
                $arr = explode(" = ", substr($line, 1), 2);

                // Set the document data; lines without a " = " separator are
                // ignored instead of reading a missing array element.
                if (count($arr) === 2) {
                    $document[$arr[0]] = $arr[1];
                }

                // Track how many bytes have been consumed so far.
                $offset += strlen($line);
            }

            // Stop scanning once the counter has passed 3.
            if ($counter > 3) {
                break;
            }
            $counter++;
        }

        // Everything from $offset to the end of the stream is the body.
        $document["content"] = stream_get_contents($fp, -1, $offset);
        $documents[] = $document;
    }
}

$indexer->index($documents);

// Remove every regular file from the source directory.
// NOTE(review): SOURCE_DIR is concatenated directly with the file name, so it
// presumably ends with a directory separator - confirm where it is defined.
$fp = opendir(SOURCE_DIR);
if ($fp !== false) {
    // opendir() returns false on failure; readdir() must not be called then.
    while (false !== ($file = readdir($fp))) {
        if (is_file(SOURCE_DIR . $file)) {
            unlink(SOURCE_DIR . $file);
        }
    }
    // Release the directory handle once the sweep is finished.
    closedir($fp);
}
?>
<h1> documents added successfully! </h1>
<a href="admin.php">back</a>