/**
 * Build a short, single-line description for an HTML document.
 *
 * The content of the first <meta name="description" content="..."> tag is
 * preferred: tags are stripped from it, it is trimmed, and every run of
 * CR/LF/tab/space characters is collapsed to a single space. When no such
 * tag is found, the text returned by docreader_get_body() is used instead,
 * cut to at most 255 bytes with '...' appended only if it was actually cut.
 *
 * Limitation: the meta tag is recognised only when `name` comes directly
 * before `content` and no other attributes are present.
 *
 * @param string $doc Raw HTML source of the document.
 *
 * @return string The description text (may be empty).
 */
function docreader_get_description($doc)
{
    // Accepts double- or single-quoted attribute values, flexible whitespace,
    // and both the XHTML (" />") and HTML5 (">") ways of closing the tag.
    // (?|...) is a PCRE branch-reset group: whichever quoting style matches,
    // the content is always captured as group 2.
    $pattern = '~<meta\s+name\s*=\s*(["\'])description\1\s+content\s*=\s*'
        . '(?|"([^"]*)"|\'([^\']*)\')\s*/?>~is';

    if (preg_match($pattern, $doc, $regs)) {
        return preg_replace('|[\\r\\n\\t ]+|s', ' ', trim(strip_tags($regs[2])));
    }

    // No usable meta description: fall back to an excerpt of the body text.
    $body = (string) docreader_get_body($doc);
    if (strlen($body) <= 255) {
        // Nothing was cut off, so an ellipsis would be misleading.
        return $body;
    }

    // NOTE: substr() counts bytes, so the cut may land inside a multibyte
    // character if the body text is in a multibyte encoding.
    return substr($body, 0, 255) . '...';
}
// get content type (ie. if pdf, don't parse as an html file) $finfo = pathinfo($f); $fext = strtolower($finfo['extension']); if (in_array($fext, array('html', 'htm'))) { // parse file $doc = docreader_get_data($f); if (!$doc) { continue; } $title = docreader_get_title($doc); if (!$title) { $title = 'Untitled'; } $description = docreader_get_description($doc); $keywords = docreader_get_keywords($doc); $body = extractor_run(docreader_get_body($doc), 'HTML'); unset($doc); } else { $body = extractor_run($f); if (!$body) { $body = ''; } $description = ''; $keywords = ''; $title = basename($f); } $data = array('title' => $title, 'url' => $url, 'description' => $description, 'keywords' => $keywords, 'body' => $body, 'access' => 'public', 'status' => 'approved', 'team' => 'none', 'ctype' => $ctype, 'mtime' => (string) $mtime, 'domain' => $domain); // add file to index $counts[$ctype]++; $res = $search->addDocument($data); if (!$res) {