/**
 * Registers custom URL filter rules on top of the inherited defaults,
 * mainly via the following hooks:
 *   followExternal()
 *   allowDomain(), disallowDomain()
 *   allow(), disallow(), disallowExt()
 */
public function defaultFilter()
{
    parent::defaultFilter();
    /// --- custom filter BEGIN ---
    // restrict the crawl to the current site and skip query-style php urls
    $this->followExternal(false);
    $this->disallow('.php?q=');
    /// --- custom filter END ---
}
/**
 * {@inheritDoc}
 *
 * @param string $page the image represented as a character string
 * @param string $url the url where the image was downloaded from
 * @return array summary information including a thumbnail and a
 *     description (where the description is just the url); null when
 *     $page is not a string
 */
function process($page, $url)
{
    // Fix: previously $summary was never initialized, so a non-string
    // $page caused an undefined-variable notice at the return below.
    $summary = null;
    if (is_string($page)) {
        $image = $this->imagecreatefrombmp($page);
        $thumb_string = self::createThumb($image);
        $summary[self::TITLE] = "";
        // description is derived from the file name part of the url
        $summary[self::DESCRIPTION] = "Image of " .
            UrlParser::getDocumentFilename($url);
        $summary[self::LINKS] = array();
        // wrap the raw bmp bytes in a minimal html page as a data uri
        $summary[self::PAGE] =
            "<html><body><div><img src='data:image/bmp;base64," .
            base64_encode($page) . "' alt='" .
            $summary[self::DESCRIPTION] . "' /></div></body></html>";
        // thumbnail is re-encoded as jpeg by createThumb
        $summary[self::THUMB] = 'data:image/jpeg;base64,' .
            base64_encode($thumb_string);
    }
    return $summary;
}
/**
 * Used to recompute both the index shards and the dictionary
 * of an index archive. The first step involves re-extracting the
 * word into an inverted index from the summaries' web_archives.
 * Then a reindex is done.
 *
 * @param string $archive_path file path to a IndexArchiveBundle
 */
function rebuildIndexArchive($archive_path)
{
    $archive_type = $this->getArchiveKind($archive_path);
    if ($archive_type != "IndexArchiveBundle") {
        // exits the process; only IndexArchiveBundle can be rebuilt here
        $this->badFormatMessageAndExit($archive_path);
    }
    $info = $archive_type::getArchiveInfo($archive_path);
    $tmp = unserialize($info["DESCRIPTION"]);
    $video_sources = $tmp[self::VIDEO_SOURCES];
    // generation.txt records which generation is currently active;
    // generations are numbered 0..ACTIVE
    $generation_info = unserialize(
        file_get_contents("{$archive_path}/generation.txt"));
    $num_generations = $generation_info['ACTIVE'] + 1;
    $archive = new WebArchiveBundle($archive_path . "/summaries");
    $seen = 0;          // total objects processed across all generations
    $generation = 0;
    $keypad = "";       // NOTE(review): appended as 8-byte padding below but
                        // is empty here — confirm intended padding value
    while ($generation < $num_generations) {
        $partition = $archive->getPartition($generation, false);
        $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
        crawlLog("Processing partition {$generation}");
        if (file_exists($shard_name)) {
            // rebuild from scratch: discard any stale shard file
            crawlLog("..Unlinking old shard {$generation}");
            @unlink($shard_name);
        }
        $shard = new IndexShard($shard_name, $generation,
            NUM_DOCS_PER_GENERATION, true);
        $seen_partition = 0;
        while ($seen_partition < $partition->count) {
            // read summaries in batches of at most 8000 objects
            $num_to_get = min($partition->count - $seen_partition, 8000);
            $offset = $partition->iterator_pos;
            $objects = $partition->nextObjects($num_to_get);
            $cnt = 0;
            foreach ($objects as $object) {
                $cnt++;
                $site = $object[1];
                if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                    // link records store their doc keys / url in
                    // HTTP_CODE / TITLE fields respectively
                    $is_link = true;
                    $doc_keys = $site[self::HTTP_CODE];
                    $site_url = $site[self::TITLE];
                    $host = UrlParser::getHost($site_url);
                    $link_parts = explode('|', $site[self::HASH]);
                    if (isset($link_parts[5])) {
                        $link_origin = $link_parts[5];
                    } else {
                        $link_origin = $site_url;
                    }
                    $meta_ids = PhraseParser::calculateLinkMetas($site_url,
                        $host, $site[self::DESCRIPTION], $link_origin);
                    $link_to = "LINK TO:";
                } else {
                    // document record: build doc keys from url/host hashes
                    $is_link = false;
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $doc_keys = crawlHash($site_url, true) .
                        $site[self::HASH] . "d" .
                        substr(crawlHash($host . "/", true), 1);
                    $meta_ids = PhraseParser::calculateMetas($site,
                        $video_sources);
                    $link_to = "";
                }
                $so_far_cnt = $seen_partition + $cnt;
                $time_out_message = "..still processing {$so_far_cnt} " .
                    "of {$partition->count} in partition {$generation}." .
                    "\n..Last processed was: " .
                    ($seen + 1) . ". {$link_to}{$site_url}. ";
                crawlTimeoutLog($time_out_message);
                $seen++;
                $word_lists = array();
                /* self::JUST_METAS check to avoid getting sitemaps in
                   results for popular words */
                $lang = NULL;
                if (!isset($site[self::JUST_METAS])) {
                    $host_words = UrlParser::getWordsIfHostUrl($site_url);
                    $path_words =
                        UrlParser::getWordsLastPathPartUrl($site_url);
                    if ($is_link) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        // index host/path words along with title/description
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words . " " .
                            $site[self::DESCRIPTION];
                    }
                    if (isset($site[self::LANG])) {
                        $lang = guessLocaleFromString(
                            mb_substr($site[self::DESCRIPTION], 0,
                            AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                    }
                    $word_lists = PhraseParser::extractPhrasesInLists(
                        $phrase_string, $lang);
                    $len = strlen($phrase_string);
                    // threshold 0.012 classifies the page for safe search
                    if (PhraseParser::computeSafeSearchScore($word_lists,
                        $len) < 0.012) {
                        $meta_ids[] = "safe:true";
                        $safe = true;
                    } else {
                        $meta_ids[] = "safe:false";
                        $safe = false;
                    }
                }
                if (isset($site[self::USER_RANKS]) &&
                    count($site[self::USER_RANKS]) > 0) {
                    // append packed user-rank scores, padded to an
                    // 8-byte multiple, onto the doc keys
                    $score_keys = "";
                    foreach ($site[self::USER_RANKS] as $label => $score) {
                        $score_keys .= packInt($score);
                    }
                    if (strlen($score_keys) % 8 != 0) {
                        $score_keys .= $keypad;
                    }
                    $doc_keys .= $score_keys;
                }
                $shard->addDocumentWords($doc_keys, $offset, $word_lists,
                    $meta_ids, PhraseParser::$materialized_metas,
                    true, false);
                // next object's offset in the web archive
                $offset = $object[0];
            }
            $seen_partition += $num_to_get;
        }
        $shard->save(false, true);
        $generation++;
    }
    // second pass: rebuild the dictionary from the fresh shards
    $this->reindexIndexArchive($archive_path);
}
/**
 * Checks if getScheme is working okay
 *
 * Runs UrlParser::getScheme over a table of (url, expected scheme, label)
 * triples and asserts each result matches.
 */
function getSchemeTestCase()
{
    $cases = array(
        array("http://www.example.com/", "http", "Simple HTTP 1"),
        array("https://www.example.com/", "https", "Simple HTTPS 1"),
        array("gopher://www.example.com/", "gopher", "Simple GOPHER 1"),
        array("./", "http", "Simple HTTP 2"),
    );
    foreach ($cases as $case) {
        list($url, $expected_scheme, $label) = $case;
        $this->assertEqual(UrlParser::getScheme($url), $expected_scheme,
            $label);
    }
}
/**
 * Responsible for handling admin request related to the configure activity
 *
 * The configure activity allows a user to set the work directory for
 * storing data local to this SeekQuarry/Yioop instance. It also allows one
 * to set the default language of the installation, dbms info, robot info,
 * test info, as well as which machine acts as the queue server.
 *
 * @return array $data fields for available language, dbms, etc as well as
 *     results of processing sub activity if any
 */
function configure()
{
    $parent = $this->parent;
    $profile_model = $parent->model("profile");
    $group_model = $parent->model("group");
    $data = array();
    $profile = array();
    $data['SYSTEM_CHECK'] = $this->systemCheck();
    // build the list of installed locales for the language dropdown
    $languages = $parent->model("locale")->getLocaleList();
    foreach ($languages as $language) {
        $data['LANGUAGES'][$language['LOCALE_TAG']] =
            $language['LOCALE_NAME'];
    }
    if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
        $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
        $profile['DEFAULT_LOCALE'] = $data['lang'];
        setLocaleObject($data['lang']);
    }
    $data["ELEMENT"] = "configure";
    $data['SCRIPT'] = "";
    $data['PROFILE'] = false;
    // determine/validate the work directory, either from the request or
    // from an already-defined WORK_DIRECTORY constant
    if (isset($_REQUEST['WORK_DIRECTORY']) || defined('WORK_DIRECTORY') &&
        defined('FIX_NAME_SERVER') && FIX_NAME_SERVER) {
        if (defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') &&
            FIX_NAME_SERVER && !isset($_REQUEST['WORK_DIRECTORY'])) {
            // FIX_NAME_SERVER mode: force a re-run of the "directory"
            // sub-activity against the compiled-in work directory
            $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
            $_REQUEST['arg'] = "directory";
            @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
        }
        $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
        $data['PROFILE'] = true;
        if (strstr(PHP_OS, "WIN")) {
            //convert to forward slashes so consistent with rest of code
            $dir = str_replace("\\", "/", $dir);
            if ($dir[0] != "/" && $dir[1] != ":") {
                $data['PROFILE'] = false;
            }
        } else {
            // on unix an absolute path must start with /
            if ($dir[0] != "/") {
                $data['PROFILE'] = false;
            }
        }
        if ($data['PROFILE'] == false) {
            $data["MESSAGE"] =
                tl('system_component_configure_use_absolute_path');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        // the work directory must not live inside the code base dir
        if (strstr($dir . "/", BASE_DIR . "/")) {
            $data['PROFILE'] = false;
            $data["MESSAGE"] =
                tl('system_component_configure_configure_diff_base_dir');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        $data['WORK_DIRECTORY'] = $dir;
    } else {
        if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 &&
            strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 &&
            (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
            $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
            $data['PROFILE'] = true;
        }
    }
    $arg = "";
    if (isset($_REQUEST['arg'])) {
        $arg = $_REQUEST['arg'];
    }
    switch ($arg) {
        case "directory":
            // sub-activity: set up / switch the work directory
            if (!isset($data['WORK_DIRECTORY'])) {
                break;
            }
            if ($data['PROFILE'] && file_exists(
                $data['WORK_DIRECTORY'] . "/profile.php")) {
                // existing profile found: just point config at it
                $data = array_merge($data, $profile_model->getProfile(
                    $data['WORK_DIRECTORY']));
                $profile_model->setWorkDirectoryConfigFile(
                    $data['WORK_DIRECTORY']);
                $data["MESSAGE"] =
                    tl('system_component_configure_work_dir_set');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');setTimeout(" .
                    "'window.location.href=window.location.href', 3000);";
            } else {
                if ($data['PROFILE'] &&
                    strlen($data['WORK_DIRECTORY']) > 0) {
                    if ($profile_model->makeWorkDirectory(
                        $data['WORK_DIRECTORY'])) {
                        // fresh install: seed a default profile
                        $profile['DBMS'] = 'sqlite3';
                        $data['DBMS'] = 'sqlite3';
                        $profile['DB_NAME'] = 'default';
                        $data['DB_NAME'] = 'default';
                        $profile['USER_AGENT_SHORT'] =
                            tl('system_component_name_your_bot');
                        $data['USER_AGENT_SHORT'] =
                            $profile['USER_AGENT_SHORT'];
                        $uri = UrlParser::getPath(
                            $_SERVER['REQUEST_URI']);
                        $http = isset($_SERVER['HTTPS']) ?
                            "https://" : "http://";
                        $profile['NAME_SERVER'] =
                            $http . $_SERVER['SERVER_NAME'] . $uri;
                        $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                        $profile['AUTH_KEY'] = crawlHash(
                            $data['WORK_DIRECTORY'] . time());
                        $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                        $profile['FIAT_SHAMIR_MODULUS'] =
                            generateFiatShamirModulus();
                        $robot_instance = str_replace(".", "_",
                            $_SERVER['SERVER_NAME']) . "-" . time();
                        $profile['ROBOT_INSTANCE'] = $robot_instance;
                        $data['ROBOT_INSTANCE'] =
                            $profile['ROBOT_INSTANCE'];
                        if ($profile_model->updateProfile(
                            $data['WORK_DIRECTORY'], array(), $profile)) {
                            if (defined('WORK_DIRECTORY') &&
                                $data['WORK_DIRECTORY'] == WORK_DIRECTORY
                                || $profile_model->
                                setWorkDirectoryConfigFile(
                                $data['WORK_DIRECTORY'])) {
                                $data["MESSAGE"] = tl(
                            'system_component_configure_work_profile_made');
                                $data['SCRIPT'] .=
                                    "doMessage('<h1 class=\"red\" >" .
                                    $data["MESSAGE"] . "</h1>');" .
                                    "setTimeout('window.location.href= " .
                                    "window.location.href', 3000);";
                                $data = array_merge($data,
                                    $profile_model->getProfile(
                                    $data['WORK_DIRECTORY']));
                                $data['PROFILE'] = true;
                            } else {
                                $data['PROFILE'] = false;
                                $data["MESSAGE"] = tl(
                               'system_component_configure_no_set_config');
                                $data['SCRIPT'] .=
                                    "doMessage('<h1 class=\"red\" >" .
                                    $data["MESSAGE"] . "</h1>');" .
                                    "setTimeout('window.location.href= " .
                                    "window.location.href', 3000);";
                            }
                        } else {
                            $profile_model->setWorkDirectoryConfigFile(
                                $data['WORK_DIRECTORY']);
                            $data['PROFILE'] = false;
                            $data["MESSAGE"] = tl(
                            'system_component_configure_no_create_profile');
                            $data['SCRIPT'] .=
                                "doMessage('<h1 class=\"red\" >" .
                                $data["MESSAGE"] .
                                "</h1>'); setTimeout('window.location.href=" .
                                "window.location.href', 3000);";
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile(
                            $data['WORK_DIRECTORY']);
                        $data["MESSAGE"] = tl(
                            'system_component_configure_work_dir_invalid');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] . "</h1>');" .
                            "setTimeout('window.location.href=" .
                            "window.location.href', 3000);";
                        $data['PROFILE'] = false;
                    }
                } else {
                    $profile_model->setWorkDirectoryConfigFile(
                        $data['WORK_DIRECTORY']);
                    $data["MESSAGE"] = tl(
                        'system_component_configure_work_dir_invalid');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>');" .
                        "setTimeout('window.location.href=" .
                        "window.location.href', 3000);";
                    $data['PROFILE'] = false;
                }
            }
            break;
        case "profile":
            // sub-activity: save submitted profile fields and uploads
            $parent->updateProfileFields($data, $profile,
                array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS',
                'LANDING_PAGE'));
            // DEBUG_LEVEL is a bitmask of the three *_INFO flags
            $data['DEBUG_LEVEL'] = 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
            $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            $folder = APP_DIR . "/resources";
            if (!file_exists(APP_DIR) && !mkdir(APP_DIR) ||
                !file_exists($folder) && !mkdir($folder)) {
                $data["MESSAGE"] =
                    tl('system_component_no_resource_folder');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>')";
                return $data;
            }
            // validate each uploaded branding file (type and size)
            foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO',
                'FAVICON', 'SEARCHBAR_PATH') as $field) {
                if (isset($_FILES[$field]['name']) &&
                    $_FILES[$field]['name'] != "") {
                    // images must be png/gif/jpeg/ico;
                    // SEARCHBAR_PATH must be xml
                    if (!in_array($_FILES[$field]['type'],
                        array('image/png', 'image/gif', 'image/jpeg',
                        'image/x-icon')) && $field != 'SEARCHBAR_PATH' ||
                        $_FILES[$field]['type'] != 'text/xml' &&
                        $field == 'SEARCHBAR_PATH') {
                        $data["MESSAGE"] =
                            tl('system_component_invalid_filetype');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] . "</h1>')";
                        return $data;
                    }
                    if ($_FILES[$field]['size'] > THUMB_SIZE) {
                        $data["MESSAGE"] =
                            tl('system_component_file_too_big');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] . "</h1>')";
                        return $data;
                    }
                    $profile[$field] = array();
                    $profile[$field]['name'] = $_FILES[$field]['name'];
                    $profile[$field]['tmp_name'] =
                        $_FILES[$field]['tmp_name'];
                    $data[$field] = "./?c=resource&a=get&" .
                        "f=resources&n=" . $profile[$field]['name'];
                }
            }
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile)) {
                $data['MESSAGE'] =
                    tl('system_component_configure_profile_change');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
                if ($old_profile['DEBUG_LEVEL'] !=
                    $profile['DEBUG_LEVEL']) {
                    // debug level changed: force a reload so new level
                    // takes effect
                    $data['SCRIPT'] .=
                        "setTimeout('window.location.href=\"" .
                        "?c=admin&a=configure&" . CSRF_TOKEN . "=" .
                        $_REQUEST[CSRF_TOKEN] . "\"', 3*sec);";
                }
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
                break;
            }
            break;
        case "reset":
            // sub-activity: restore look-and-feel settings to defaults
            $base_url = NAME_SERVER;
            if (defined("BASE_URL")) {
                $base_url = BASE_URL;
            }
            $profile = array('LANDING_PAGE' => false,
                'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "",
                'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4",
                'TOPBAR_COLOR' => "#EEF",
                'LOGO' => "resources/yioop.png",
                'M_LOGO' => "resources/m-yioop.png",
                'FAVICON' => $base_url . "favicon.ico",
                'TIMEZONE' => 'America/Los_Angeles',
                'SESSION_NAME' => "yioopbiscuit",
                'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            foreach ($old_profile as $key => $value) {
                $data[$key] = $value;
            }
            // temporarily blank BACKGROUND_IMAGE so updateProfile does
            // not treat the old image as a new upload
            $tmp_image = $old_profile['BACKGROUND_IMAGE'];
            $old_profile['BACKGROUND_IMAGE'] = "";
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile, true)) {
                $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                foreach ($profile as $key => $value) {
                    $data[$key] = $value;
                    if (in_array($key, array('BACKGROUND_IMAGE', 'LOGO',
                        'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH')) &&
                        $old_profile[$key] != "") {
                        // delete the previously uploaded resource file
                        $resource_name = APP_DIR . "/resources/" .
                            $old_profile[$key];
                        if (file_exists($resource_name)) {
                            unlink($resource_name);
                        }
                    }
                }
                $data['MESSAGE'] =
                    tl('system_component_configure_reset_completed');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
                break;
            }
            break;
        default:
            // no sub-activity: just load the current profile if present
            if (isset($data['WORK_DIRECTORY']) && file_exists(
                $data['WORK_DIRECTORY'] . "/profile.php")) {
                $data = array_merge($data, $profile_model->getProfile(
                    $data['WORK_DIRECTORY']));
            } else {
                $data['WORK_DIRECTORY'] = "";
                $data['PROFILE'] = false;
            }
    }
    $data['advanced'] = "false";
    if ($data['PROFILE']) {
        $locale_tag = getLocaleTag();
        // fall back to defaults for any blank display fields
        $not_null_fields = array('LOGO' => "resources/yioop.png",
            'M_LOGO' => "resources/m-yioop.png",
            'FAVICON' => BASE_URL . "favicon.ico",
            'TIMEZONE' => 'America/Los_Angeles',
            'SESSION_NAME' => "yioopbiscuit",
            'CSRF_TOKEN' => "YIOOP_TOKEN");
        foreach ($not_null_fields as $field => $default) {
            if (!$data[$field]) {
                $data[$field] = $default;
            }
        }
        if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
            // robot description is stored as the public group's "bot" page
            $robot_description = substr($parent->clean(
                $_REQUEST['ROBOT_DESCRIPTION'], "string"), 0,
                MAX_GROUP_PAGE_LEN);
            $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot",
                $robot_description, $locale_tag, "", "", "", "");
        }
        $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID,
            "bot", $locale_tag, "edit");
        $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ?
            $robot_info["PAGE"] : tl('system_component_describe_robot');
        if (isset($_REQUEST['advanced']) &&
            $_REQUEST['advanced'] == 'true') {
            $data['advanced'] = "true";
        }
        // client-side toggle for the advanced configuration panels
        $data['SCRIPT'] .= <<<EOD
setDisplay('advance-configure', {$data['advanced']});
setDisplay('advance-robot', {$data['advanced']});
function toggleAdvance() {
    var advanced = elt('a-settings');
    advanced.value = (advanced.value =='true') ? 'false' : 'true';
    var value = (advanced.value == 'true') ? true : false;
    setDisplay('advance-configure', value);
    setDisplay('advance-robot', value);
}
EOD;
    }
    // submit the form whenever the locale dropdown changes
    $data['SCRIPT'] .= "\nelt('locale').onchange = " .
        "function () { elt('configureProfileForm').submit();};\n";
    return $data;
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    // honor a <base href> if present: all links resolve against it
    $base_refs = $xpath->evaluate("/html//base");
    if ($base_refs->item(0)) {
        $tmp_site = $base_refs->item(0)->getAttribute('href');
        if (strlen($tmp_site) > 0) {
            $site = UrlParser::canonicalLink($tmp_site, $site);
        }
    }
    $i = 0;
    $hrefs = $xpath->evaluate("/html/body//a");
    foreach ($hrefs as $href) {
        if ($i < MAX_LINKS_TO_EXTRACT) {
            // skip rel=nofollow anchors
            $rel = $href->getAttribute("rel");
            if ($rel == "" || !stristr($rel, "nofollow")) {
                $url = UrlParser::canonicalLink(
                    $href->getAttribute('href'), $site);
                $len = strlen($url);
                if (!UrlParser::checkRecursiveUrl($url) &&
                    $len < MAX_URL_LEN && $len > 4) {
                    $text = $href->nodeValue;
                    if (isset($sites[$url])) {
                        // same url seen again: append its anchor text,
                        // capped at 2 * MAX_LINKS_WORD_TEXT chars
                        $sites[$url] .= " .. " .
                            preg_replace("/\\s+/", " ", strip_tags($text));
                        $sites[$url] = mb_substr($sites[$url], 0,
                            2 * MAX_LINKS_WORD_TEXT);
                    } else {
                        $sites[$url] = preg_replace("/\\s+/", " ",
                            strip_tags($text));
                        $sites[$url] = mb_substr($sites[$url], 0,
                            2 * MAX_LINKS_WORD_TEXT);
                    }
                    $i++;
                }
            }
        }
    }
    // frame/iframe sources count toward the same counter as anchors
    $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
    foreach ($frames as $frame) {
        if ($i < MAX_LINKS_TO_EXTRACT) {
            $url = UrlParser::canonicalLink(
                $frame->getAttribute('src'), $site);
            $len = strlen($url);
            if (!UrlParser::checkRecursiveUrl($url) &&
                $len < MAX_URL_LEN && $len > 4) {
                if (isset($sites[$url])) {
                    $sites[$url] .= " .. HTMLframe";
                } else {
                    $sites[$url] = "HTMLframe";
                }
                $i++;
            }
        }
    }
    // image links use a fresh counter — NOTE(review): this allows up to
    // 2 * MAX_LINKS_TO_EXTRACT links in total; confirm intended
    $imgs = $xpath->evaluate("/html/body//img[@alt]");
    $i = 0;
    foreach ($imgs as $img) {
        if ($i < MAX_LINKS_TO_EXTRACT) {
            $alt = $img->getAttribute('alt');
            // only index images with non-empty alt text
            if (strlen($alt) < 1) {
                continue;
            }
            $url = UrlParser::canonicalLink(
                $img->getAttribute('src'), $site);
            $len = strlen($url);
            if (!UrlParser::checkRecursiveUrl($url) &&
                $len < MAX_URL_LEN && $len > 4) {
                if (isset($sites[$url])) {
                    $sites[$url] .= " .. " . $alt;
                    $sites[$url] = mb_substr($sites[$url], 0,
                        2 * MAX_LINKS_WORD_TEXT);
                } else {
                    $sites[$url] = $alt;
                    $sites[$url] = mb_substr($sites[$url], 0,
                        2 * MAX_LINKS_WORD_TEXT);
                }
                $i++;
            }
        }
    }
    return $sites;
}
/**
 * Outputs a profile.php file in the given directory containing profile
 * data based on new and old data sources
 *
 * This function creates a profile.php file if it doesn't exist. A given
 * field is output in the profile
 * according to the precedence that a new value is preferred to an old
 * value is prefered to the value that comes from a currently defined
 * constant. It might be the case that a new value for a given field
 * doesn't exist, etc.
 *
 * @param string $directory the work directory to output the profile.php
 *     file
 * @param array $new_profile_data fields and values containing at least
 *     some profile information (only $this->profile_fields
 *     fields of $new_profile_data will be considered).
 * @param array $old_profile_data fields and values that come from
 *     presumably a previously existing profile
 * @param bool $reset whether the new profile data is coming from a reset
 *     to factory settings or not
 * @return bool true on successful write of profile.php, false otherwise
 */
function updateProfile($directory, $new_profile_data, $old_profile_data,
    $reset = false)
{
    // $n accumulates the lines of the generated profile.php file
    $n = array();
    $n[] = <<<EOT
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009-2012 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * Computer generated file giving the key defines of directory locations
 * as well as database settings used to run the SeekQuarry/Yioop search engine
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage config
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009-2012
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
EOT;
    $base_url = NAME_SERVER;
    if (defined("BASE_URL")) {
        $base_url = BASE_URL;
    }
    //make sure certain fields are not null
    $not_null_fields = array('BACKGROUND_COLOR' => "#FFF",
        'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4",
        'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png",
        'M_LOGO' => "resources/m-yioop.png",
        'FAVICON' => $base_url . "favicon.ico",
        'TIMEZONE' => 'America/Los_Angeles',
        'SESSION_NAME' => "yioopbiscuit",
        'CSRF_TOKEN' => "YIOOP_TOKEN", 'AD_LOCATION' => 'none');
    $not_null_keys = array_keys($not_null_fields);
    $file_fields = $this->file_fields;
    //now integrate the different profiles
    foreach ($this->profile_fields as $field) {
        if (isset($new_profile_data[$field])) {
            if (!$reset && in_array($field, array('LOGO', 'M_LOGO',
                'FAVICON', 'SEARCHBAR_PATH', 'BACKGROUND_IMAGE'))) {
                // these fields may carry an uploaded file: move it into
                // the app resources folder and store a resource url
                if (isset($new_profile_data[$field]['name']) &&
                    isset($new_profile_data[$field]['tmp_name'])) {
                    move_uploaded_file(
                        $new_profile_data[$field]['tmp_name'],
                        APP_DIR . "/resources/" .
                        $new_profile_data[$field]['name']);
                    $profile[$field] = "./?c=resource&a=get&" .
                        "f=resources&n=" .
                        $new_profile_data[$field]['name'];
                } else {
                    // no upload: fall back to old value, then constant
                    if (isset($old_profile_data[$field])) {
                        $profile[$field] = $old_profile_data[$field];
                    } else {
                        if (defined($field)) {
                            $profile[$field] = constant($field);
                        } else {
                            $profile[$field] = "";
                        }
                    }
                }
            } else {
                $profile[$field] = $new_profile_data[$field];
            }
        } else {
            // precedence: old profile value, then defined constant
            if (isset($old_profile_data[$field])) {
                $profile[$field] = $old_profile_data[$field];
            } else {
                if (defined($field)) {
                    $profile[$field] = constant($field);
                } else {
                    $profile[$field] = "";
                }
            }
        }
        if (!$profile[$field] && isset($not_null_fields[$field])) {
            $profile[$field] = $not_null_fields[$field];
        }
        if ($field == "NEWS_MODE" && $profile[$field] == "") {
            $profile[$field] = "news_off";
        }
        if ($field == "WEB_URI") {
            // WEB_URI is always recomputed from the current request
            if (isset($_SERVER['REQUEST_URI'])) {
                $profile[$field] =
                    UrlParser::getPath($_SERVER['REQUEST_URI']);
            } else {
                $profile[$field] = UrlParser::getPath(NAME_SERVER);
            }
        }
        if (in_array($field, $file_fields)) {
            // file fields are not written as defines
            continue;
        }
        if ($field != "DEBUG_LEVEL") {
            // everything but DEBUG_LEVEL is emitted as a quoted string
            $profile[$field] = "\"{$profile[$field]}\"";
        }
        $n[] = "define('{$field}', {$profile[$field]});";
    }
    $out = implode("\n", $n);
    if (file_put_contents($directory . PROFILE_FILE_NAME, $out)
        !== false) {
        // temporarily disable the custom handler so chmod warnings
        // don't surface
        restore_error_handler();
        @chmod($directory . PROFILE_FILE_NAME, 0777);
        if (isset($new_profile_data['AUXILIARY_CSS'])) {
            if (!file_exists(APP_DIR . "/css")) {
                @mkdir(APP_DIR . "/css");
                @chmod(APP_DIR . "/css", 0777);
            }
            $css_file = APP_DIR . "/css/auxiliary.css";
            file_put_contents($css_file,
                $new_profile_data['AUXILIARY_CSS']);
            @chmod($css_file, 0777);
        }
        set_error_handler("yioop_error_handler");
        return true;
    }
    return false;
}
/**
 * Handles admin request related to the search sources activity
 *
 * The search sources activity allows a user to add/delete search sources
 * for video and news, it also allows a user to control which subsearches
 * appear on the SearchView page
 *
 * @return array $data info about current search sources, and current
 *     sub-searches
 */
function searchSources()
{
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    $source_model = $parent->model("source");
    $possible_arguments = array("addsource", "deletesource",
        "addsubsearch", "deletesubsearch", "editsource", "editsubsearch");
    $data = array();
    $data["ELEMENT"] = "searchsources";
    $data['SCRIPT'] = "";
    $data['SOURCE_TYPES'] = array(-1 => tl('crawl_component_media_kind'),
        "video" => tl('crawl_component_video'),
        "rss" => tl('crawl_component_rss_feed'),
        "html" => tl('crawl_component_html_feed'));
    $source_type_flag = false;
    if (isset($_REQUEST['type']) && in_array($_REQUEST['type'],
        array_keys($data['SOURCE_TYPES']))) {
        $data['SOURCE_TYPE'] = $_REQUEST['type'];
        $source_type_flag = true;
    } else {
        $data['SOURCE_TYPE'] = -1;
    }
    // index choices: crawl indexes prefixed "i:", mixes prefixed "m:"
    $machine_urls = $parent->model("machine")->getQueueServerUrls();
    $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls);
    $data["SEARCH_LISTS"] = array(-1 =>
        tl('crawl_component_sources_indexes'));
    foreach ($search_lists as $item) {
        $data["SEARCH_LISTS"]["i:" . $item["CRAWL_TIME"]] =
            $item["DESCRIPTION"];
    }
    if (isset($_SESSION['USER_ID'])) {
        $user = $_SESSION['USER_ID'];
    } else {
        $user = $_SERVER['REMOTE_ADDR'];
    }
    $search_lists = $crawl_model->getMixList($user);
    foreach ($search_lists as $item) {
        $data["SEARCH_LISTS"]["m:" . $item["TIMESTAMP"]] = $item["NAME"];
    }
    $n = NUM_RESULTS_PER_PAGE;
    $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n,
        5 * $n => 5 * $n, 10 * $n => 10 * $n);
    if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'],
        array_keys($data['PER_PAGE']))) {
        $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page'];
    } else {
        $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
    }
    $locales = $parent->model("locale")->getLocaleList();
    $data["LANGUAGES"] = array();
    foreach ($locales as $locale) {
        $data["LANGUAGES"][$locale['LOCALE_TAG']] =
            $locale['LOCALE_NAME'];
    }
    if (isset($_REQUEST['language']) && in_array($_REQUEST['language'],
        array_keys($data["LANGUAGES"]))) {
        $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language'];
    } else {
        $data['SOURCE_LOCALE_TAG'] = DEFAULT_LOCALE;
    }
    // defaults for the add/edit forms
    $data["CURRENT_SOURCE"] = array("name" => "",
        "type" => $data['SOURCE_TYPE'], "source_url" => "",
        "aux_info" => "", 'channel_path' => "", 'item_path' => "",
        'title_path' => "", 'description_path' => "", 'link_path' => "",
        "language" => $data['SOURCE_LOCALE_TAG']);
    $data["CURRENT_SUBSEARCH"] = array("locale_string" => "",
        "folder_name" => "", "index_identifier" => "",
        "per_page" => $data['PER_PAGE_SELECTED']);
    $data['SOURCE_FORM_TYPE'] = "addsource";
    $data["SEARCH_FORM_TYPE"] = "addsubsearch";
    if (isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        switch ($_REQUEST['arg']) {
            case "addsource":
                if (!$source_type_flag) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_source_type') .
                        "</h1>');";
                    break;
                }
                $must_have = array("name", "type", 'source_url');
                $is_html_feed = false;
                if (isset($_REQUEST['type']) &&
                    $_REQUEST['type'] == 'html') {
                    // html feeds additionally require the xpath fields
                    $is_html_feed = true;
                    $must_have = array_merge($must_have,
                        array('channel_path', 'item_path', 'title_path',
                        'description_path', 'link_path'));
                }
                $to_clean = array_merge($must_have,
                    array('aux_info', 'language'));
                foreach ($to_clean as $clean_me) {
                    $r[$clean_me] = isset($_REQUEST[$clean_me]) ?
                        trim($parent->clean($_REQUEST[$clean_me],
                        "string")) : "";
                    if ($clean_me == "source_url") {
                        $r[$clean_me] = UrlParser::canonicalLink(
                            $r[$clean_me], NAME_SERVER);
                        // Fix: removed a leftover debug
                        // `echo $r[$clean_me]` that leaked raw output
                        // into the admin response
                        if (!$r[$clean_me]) {
                            $data['SCRIPT'] .=
                                "doMessage('<h1 class=\"red\" >" .
                                tl('crawl_component_invalid_url') .
                                "</h1>');";
                            break 2;
                        }
                    }
                    if (in_array($clean_me, $must_have) &&
                        $r[$clean_me] == "") {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('crawl_component_missing_fields') .
                            "</h1>');";
                        break 2;
                    }
                }
                if ($is_html_feed) {
                    // xpaths are packed into aux_info separated by ###
                    $r['aux_info'] = $r['channel_path'] . "###" .
                        $r['item_path'] . "###" . $r['title_path'] .
                        "###" . $r['description_path'] . "###" .
                        $r['link_path'];
                }
                $source_model->addMediaSource($r['name'], $r['type'],
                    $r['source_url'], $r['aux_info'], $r['language']);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_media_source_added') .
                    "</h1>');";
                break;
            case "addsubsearch":
                $to_clean = array("folder_name", 'index_identifier');
                $must_have = $to_clean;
                foreach ($to_clean as $clean_me) {
                    $r[$clean_me] = isset($_REQUEST[$clean_me]) ?
                        trim($parent->clean($_REQUEST[$clean_me],
                        "string")) : "";
                    if (in_array($clean_me, $must_have) &&
                        ($r[$clean_me] == "" || $r[$clean_me] == -1)) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('crawl_component_missing_fields') .
                            "</h1>');";
                        break 2;
                    }
                }
                $source_model->addSubsearch($r['folder_name'],
                    $r['index_identifier'], $data['PER_PAGE_SELECTED']);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_subsearch_added') . "</h1>');";
                break;
            case "deletesource":
                if (!isset($_REQUEST['ts'])) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_delete_source') .
                        "</h1>');";
                    break;
                }
                $timestamp = $parent->clean($_REQUEST['ts'], "string");
                $source_model->deleteMediaSource($timestamp);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_media_source_deleted') .
                    "</h1>');";
                break;
            case "deletesubsearch":
                if (!isset($_REQUEST['fn'])) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_delete_source') .
                        "</h1>');";
                    break;
                }
                $folder_name = $parent->clean($_REQUEST['fn'], "string");
                $source_model->deleteSubsearch($folder_name);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_subsearch_deleted') .
                    "</h1>');";
                break;
            case "editsubsearch":
                $data['SEARCH_FORM_TYPE'] = "editsubsearch";
                $subsearch = false;
                $folder_name = isset($_REQUEST['fn']) ?
                    $parent->clean($_REQUEST['fn'], "string") : "";
                if ($folder_name) {
                    $subsearch =
                        $source_model->getSubsearch($folder_name);
                }
                if (!$subsearch) {
                    // NOTE(review): sets SOURCE_FORM_TYPE rather than
                    // SEARCH_FORM_TYPE here — confirm this is intended
                    $data['SOURCE_FORM_TYPE'] = "addsubsearch";
                    break;
                }
                $data['fn'] = $folder_name;
                $update = false;
                foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) {
                    $upper_field = strtoupper($field);
                    if (isset($_REQUEST[$field]) && $field != 'name') {
                        $subsearch[$upper_field] =
                            $parent->clean($_REQUEST[$field], "string");
                        $data['CURRENT_SUBSEARCH'][$field] =
                            $subsearch[$upper_field];
                        $update = true;
                    } else {
                        if (isset($subsearch[$upper_field])) {
                            $data['CURRENT_SUBSEARCH'][$field] =
                                $subsearch[$upper_field];
                        }
                    }
                }
                if ($update) {
                    $source_model->updateSubsearch($subsearch);
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_subsearch_updated') .
                        "</h1>');";
                }
                break;
            case "editsource":
                $data['SOURCE_FORM_TYPE'] = "editsource";
                $source = false;
                $timestamp = isset($_REQUEST['ts']) ?
                    $parent->clean($_REQUEST['ts'], "string") : "";
                if ($timestamp) {
                    $source = $source_model->getMediaSource($timestamp);
                }
                if (!$source) {
                    $data['SOURCE_FORM_TYPE'] = "addsource";
                    break;
                }
                $data['ts'] = $timestamp;
                $update = false;
                $is_html_feed = false;
                if ($source['TYPE'] == 'html') {
                    // unpack the ### separated xpaths for the edit form
                    $is_html_feed = true;
                    list($source['CHANNEL_PATH'], $source['ITEM_PATH'],
                        $source['TITLE_PATH'],
                        $source['DESCRIPTION_PATH'],
                        $source['LINK_PATH']) =
                        explode("###", $source['AUX_INFO']);
                }
                foreach ($data['CURRENT_SOURCE'] as $field => $value) {
                    $upper_field = strtoupper($field);
                    if (isset($_REQUEST[$field]) && $field != 'name') {
                        $source[$upper_field] =
                            $parent->clean($_REQUEST[$field], "string");
                        $data['CURRENT_SOURCE'][$field] =
                            $source[$upper_field];
                        $update = true;
                    } else {
                        if (isset($source[$upper_field])) {
                            $data['CURRENT_SOURCE'][$field] =
                                $source[$upper_field];
                        }
                    }
                }
                if ($update) {
                    if ($is_html_feed) {
                        $source['AUX_INFO'] =
                            $source['CHANNEL_PATH'] . "###" .
                            $source['ITEM_PATH'] . "###" .
                            $source['TITLE_PATH'] . "###" .
                            $source['DESCRIPTION_PATH'] . "###" .
                            $source['LINK_PATH'];
                    }
                    // the unpacked path fields are not db columns
                    unset($source['CHANNEL_PATH']);
                    unset($source['ITEM_PATH']);
                    unset($source['TITLE_PATH']);
                    unset($source['DESCRIPTION_PATH']);
                    unset($source['LINK_PATH']);
                    $source_model->updateMediaSource($source);
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_media_source_updated') .
                        "</h1>');";
                }
                break;
        }
    }
    $data['CAN_LOCALIZE'] = $parent->model("user")->
        isAllowedUserActivity($_SESSION['USER_ID'], "manageLocales");
    $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES",
        DEFAULT_ADMIN_PAGING_NUM / 5,
        array(array("NAME", "", "", "ASC")));
    $parent->pagingLogic($data, $source_model, "SUBSEARCHES",
        DEFAULT_ADMIN_PAGING_NUM / 5,
        array(array("FOLDER_NAME", "", "", "ASC")), "SUB", "SUBSEARCH");
    // drop subsearches whose backing index no longer exists
    foreach ($data["SUBSEARCHES"] as $search) {
        if (!isset($data["SEARCH_LISTS"]
            [trim($search['INDEX_IDENTIFIER'])])) {
            $source_model->deleteSubsearch($search["FOLDER_NAME"]);
        }
    }
    $data['SCRIPT'] .= "source_type = elt('source-type');" .
        "source_type.onchange = switchSourceType;" .
        "switchSourceType()";
    return $data;
}
/**
 * Checks if the $url is from a site which has an hourly quota to download.
 * If so, it bumps the quota count and return true; false otherwise.
 * This method also resets the per-site quota counters once an hour.
 *
 * @param string $url to check if within quota
 * @return bool whether $url exceeds the hourly quota of the site it is from
 */
function withinQuota($url)
{
    $site = UrlParser::urlMemberSiteArray($url, $this->quota_sites_keys,
        "q" . $this->allow_disallow_cache_time, true);
    if (!$site) {
        // url is not governed by any quota'd site
        return true;
    }
    list($quota, $current_count) = $this->quota_sites[$site];
    $under_quota = ($current_count < $quota);
    if ($under_quota) {
        // consume one unit of this site's hourly quota
        $this->quota_sites[$site] = array($quota, $current_count + 1);
    }
    // once an hour, zero every site's counter (the quota limits are kept)
    if (time() > $this->quota_clear_time + ONE_HOUR) {
        $this->quota_clear_time = time();
        foreach ($this->quota_sites as $quota_site => $info) {
            list($site_quota, ) = $info;
            $this->quota_sites[$quota_site] = array($site_quota, 0);
        }
    }
    return $under_quota;
}
/**
 * Used to handle data from the suggest-a-url to crawl form
 * (suggest_view.php). Basically, it saves any data submitted to
 * a file which can then be imported in manageCrawls
 *
 * @return array $data contains fields with the current value for
 *     the url (if set but not submitted) as well as for a captcha
 */
function suggestUrl()
{
    $data["REFRESH"] = "suggest";
    $visitor_model = $this->model("visitor");
    $clear = false;
    // clear out session state belonging to the captcha modes not in use
    if (CAPTCHA_MODE != IMAGE_CAPTCHA) {
        unset($_SESSION["captcha_text"]);
    }
    if (CAPTCHA_MODE != TEXT_CAPTCHA) {
        unset($_SESSION['CAPTCHA']);
        unset($_SESSION['CAPTCHA_ANSWERS']);
    }
    if (CAPTCHA_MODE != HASH_CAPTCHA) {
        // $num_captchas only needed by the text-captcha branches below
        $num_captchas = self::NUM_CAPTCHA_QUESTIONS;
        unset($_SESSION["request_time"]);
        unset($_SESSION["level"]);
        unset($_SESSION["random_string"]);
    } else {
        $data['INCLUDE_SCRIPTS'] = array("sha1", "hash_captcha");
    }
    /* if the form's build_time token is missing, stale, or invalid,
       (re)initialize captcha state and discard any submitted url/arg so
       the form must be filled out fresh */
    if (!isset($_SESSION['BUILD_TIME']) || !isset($_REQUEST['build_time']) ||
        $_SESSION['BUILD_TIME'] != $_REQUEST['build_time'] ||
        $this->clean($_REQUEST['build_time'], "int") <= 0) {
        if (CAPTCHA_MODE == HASH_CAPTCHA) {
            $time = time();
            $_SESSION["request_time"] = $time;
            $_SESSION["level"] = self::HASH_CAPTCHA_LEVEL;
            $_SESSION["random_string"] = md5($time . AUTH_KEY);
        }
        $clear = true;
        if (isset($_REQUEST['url'])) {
            unset($_REQUEST['url']);
        }
        if (isset($_REQUEST['arg'])) {
            unset($_REQUEST['arg']);
        }
        $data['build_time'] = time();
        $_SESSION['BUILD_TIME'] = $data['build_time'];
    } else {
        $data['build_time'] = $_SESSION['BUILD_TIME'];
    }
    $data['url'] = "";
    if (isset($_REQUEST['url'])) {
        $data['url'] = $this->clean($_REQUEST['url'], "string");
    }
    $missing = array();
    // 'arg' present and truthy means the form was actually submitted
    $save = isset($_REQUEST['arg']) && $_REQUEST['arg'];
    if (CAPTCHA_MODE == TEXT_CAPTCHA) {
        // reset the displayed answers; drop submitted ones if token was stale
        for ($i = 0; $i < $num_captchas; $i++) {
            $data["question_{$i}"] = "-1";
            if ($clear && isset($_REQUEST["question_{$i}"])) {
                unset($_REQUEST["question_{$i}"]);
            }
        }
        if (!isset($_SESSION['CAPTCHA']) ||
            !isset($_SESSION['CAPTCHA_ANSWERS'])) {
            // no captcha in session yet: pick fresh questions and answers
            list($captchas, $answers) = $this->selectQuestionsAnswers(
                $this->captchas_qa, $num_captchas,
                self::NUM_CAPTCHA_CHOICES);
            $data['CAPTCHA'] = $captchas;
            $data['build_time'] = time();
            $_SESSION['BUILD_TIME'] = $data['build_time'];
            $_SESSION['CAPTCHA_ANSWERS'] = $answers;
            $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
        } else {
            $data['CAPTCHA'] = $_SESSION['CAPTCHA'];
        }
        // on submit, record which questions were answered vs. missing
        for ($i = 0; $i < $num_captchas; $i++) {
            $field = "question_{$i}";
            $captchas = isset($_SESSION['CAPTCHA'][$i]) ?
                $_SESSION['CAPTCHA'][$i] : array();
            if ($save) {
                if (!isset($_REQUEST[$field]) ||
                    $_REQUEST[$field] == "-1" ||
                    !in_array($_REQUEST[$field], $captchas)) {
                    $missing[] = $field;
                } else {
                    $data[$field] = $_REQUEST[$field];
                }
            }
        }
    }
    $data['MISSING'] = $missing;
    $fail = false;
    if (CAPTCHA_MODE == IMAGE_CAPTCHA && !$save) {
        $this->setupGraphicalCaptchaViewData($data);
    }
    if ($save && isset($_REQUEST['url'])) {
        $url = $this->clean($_REQUEST['url'], "string");
        $url_parts = @parse_url($url);
        if (!isset($url_parts['scheme'])) {
            // default to http when no scheme was supplied
            $url = "http://" . $url;
        }
        $suggest_host = UrlParser::getHost($url);
        $scheme = UrlParser::getScheme($url);
        // reject hosts that are too short or non-http(s) schemes
        if (strlen($suggest_host) < 12 || !$suggest_host ||
            !in_array($scheme, array("http", "https"))) {
            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                tl('register_controller_invalid_url') . "</h1>');";
            $fail = true;
        } else {
            if ($missing != array()) {
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    tl('register_controller_error_fields') . "</h1>');";
                $fail = true;
            }
        }
        if (CAPTCHA_MODE == IMAGE_CAPTCHA && $fail) {
            // redraw a fresh graphical captcha alongside the error
            $this->setupGraphicalCaptchaViewData($data);
        }
        if ($fail) {
            return $data;
        }
        // validate whichever captcha mode is active
        switch (CAPTCHA_MODE) {
            case HASH_CAPTCHA:
                if (!$this->validateHashCode()) {
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                        tl('register_controller_failed_hashcode') .
                        "</h1>');";
                    $visitor_model->updateVisitor(
                        $_SERVER['REMOTE_ADDR'], "captcha_time_out");
                    return $data;
                }
                break;
            case TEXT_CAPTCHA:
                $fail = false;
                if (!$this->checkCaptchaAnswers()) {
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                        tl('register_controller_failed_human') .
                        "</h1>');";
                    $visitor_model->updateVisitor(
                        $_SERVER['REMOTE_ADDR'], "captcha_time_out");
                    $data['build_time'] = time();
                    $_SESSION['BUILD_TIME'] = $data['build_time'];
                    $fail = true;
                }
                // always regenerate the questions, pass or fail
                for ($i = 0; $i < $num_captchas; $i++) {
                    $data["question_{$i}"] = "-1";
                }
                list($captchas, $answers) =
                    $this->selectQuestionsAnswers($this->captchas_qa,
                    $num_captchas, self::NUM_CAPTCHA_CHOICES);
                $data['CAPTCHA'] = $captchas;
                $_SESSION['CAPTCHA_ANSWERS'] = $answers;
                $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
                if ($fail) {
                    return $data;
                }
                break;
            case IMAGE_CAPTCHA:
                $user_captcha_text = isset($_REQUEST['user_captcha_text']) ?
                    $this->clean($_REQUEST['user_captcha_text'], "string")
                    : "";
                if (isset($_SESSION['captcha_text']) &&
                    $_SESSION['captcha_text'] != trim($user_captcha_text)) {
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                        tl('register_controller_failed_graphical_human') .
                        "</h1>');";
                    unset($_SESSION['captcha_text']);
                    $this->setupGraphicalCaptchaViewData($data);
                    $visitor_model->updateVisitor(
                        $_SERVER['REMOTE_ADDR'], "captcha_time_out");
                    return $data;
                }
                $this->setupGraphicalCaptchaViewData($data);
                break;
        }
        // Handle cases where captcha was okay
        if (!$this->model("crawl")->appendSuggestSites($url)) {
            // suggest file is full; tell the user to try later
            $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                tl('register_controller_suggest_full') . "</h1>');";
            return $data;
        }
        $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
            tl('register_controller_url_submitted') . "</h1>');";
        // rate-limit suggestions per visitor ip per day
        $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'],
            "suggest_day_exceeded", ONE_DAY, ONE_DAY,
            MAX_SUGGEST_URLS_ONE_DAY);
        $data['build_time'] = time();
        $_SESSION['BUILD_TIME'] = $data['build_time'];
        $data['url'] = "";
    }
    return $data;
}
/**
 * Checks if the given $url is allowed to be crawled based on stored
 * robots.txt info. Robot rules for the url's host are fetched from the
 * robot archive, fronted by a small in-memory cache of recent hosts.
 *
 * @param string $url to check
 * @return bool whether it was allowed or not
 */
function checkRobotOkay($url)
{
    // local cache of recent robot.txt stuff (oldest entry evicted first)
    static $robot_cache = array();
    $max_cache_entries = 2000;
    list($host, $path) = UrlParser::getHostAndPath($url, true, true);
    $path = urldecode($path);
    $key = crawlHash($host, true);
    if (!isset($robot_cache[$key])) {
        $packed_offset = $this->robot_table->lookup($key);
        $archive_offset = unpackInt($packed_offset);
        $robot_cache[$key] =
            $this->robot_archive->getObjects($archive_offset, 1);
        if (count($robot_cache) > $max_cache_entries) {
            // drop the oldest cached host
            array_shift($robot_cache);
        }
    }
    $robot_object = $robot_cache[$key];
    // these should have been urldecoded in RobotProcessor
    $robot_paths = isset($robot_object[0][1]) ? $robot_object[0][1] :
        array();
    $allowed = true;
    $disallowed = false;
    if (isset($robot_paths[CrawlConstants::DISALLOWED_SITES])) {
        $disallowed = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::DISALLOWED_SITES]);
        $allowed = !$disallowed;
    }
    if (isset($robot_paths[CrawlConstants::ALLOWED_SITES])) {
        // an explicit Allow match overrides the disallow verdict
        $allowed = UrlParser::isPathMemberRegexPaths($path,
            $robot_paths[CrawlConstants::ALLOWED_SITES]);
    }
    return $allowed || !$disallowed;
}
/**
 * This method adds robots metas to or removes entirely a summary
 * produced by a text page processor or its subclasses depending on
 * whether the summary title and description satisfy various rules
 * in $this->filter_rules
 *
 * @param array& $summary the summary data produced by the relevant page
 *     processor's handle method; modified in-place (set to false when a
 *     NOPROCESS rule fires)
 * @param string $url the url where the summary contents came from
 */
function pageSummaryProcessing(&$summary, $url)
{
    $sites = array_keys($this->filter_rules);
    $filter_rules = $this->filter_rules;
    /* bug fix: previously $filter_rules['default'] was read without an
       isset() check, producing an undefined-index notice whenever no
       default rule set was configured */
    $rules = (isset($filter_rules['default']) && $filter_rules['default'])
        ? $filter_rules['default'] : array();
    foreach ($sites as $site) {
        if ($site == "default") {
            continue;
        }
        // a leading '-' negates the site pattern; a leading '+' affirms it
        $sign = $site[0] == '-' ? false : true;
        if (!$sign || $site[0] == '+') {
            $check_url = substr($site, 1);
        } else {
            $check_url = $site;
        }
        // collect rules whose (possibly negated) site pattern matches $url
        if (($sign && UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url)) ||
            (!$sign && !UrlParser::urlMemberSiteArray($url,
            array($check_url), $url . $check_url))) {
            $rules = array_merge($rules, $filter_rules[$site]);
        }
    }
    foreach ($rules as $rule) {
        $preconditions = $rule["PRECONDITIONS"];
        $actions = $rule["ACTIONS"];
        $filter_flag = $this->checkFilter($preconditions,
            $summary[self::TITLE], $summary[self::DESCRIPTION]);
        if ($filter_flag) {
            if (in_array("NOPROCESS", $actions)) {
                // drop the page from the index altogether
                crawlLog("  Word filter plugin removed page.");
                $summary = false;
                break;
            } else {
                if (!isset($summary[self::ROBOT_METAS])) {
                    $summary[self::ROBOT_METAS] = array();
                }
                // union semantics: existing keys win over new actions
                $summary[self::ROBOT_METAS] += $actions;
            }
        }
    }
}
/**
 * Used to determine if an action involves just one yioop instance on
 * the current local machine or not
 *
 * @param array $machine_urls urls of yioop instances to which the action
 *     applies
 * @param string $index_timestamp if timestamp exists checks if the index
 *     has declared itself to be a no network index.
 * @return bool whether it involves a single local yioop instance (true)
 *     or not (false)
 */
function isSingleLocalhost($machine_urls, $index_timestamp = -1)
{
    if ($index_timestamp >= 0) {
        // an index can opt out of network operation via a marker file
        $index_archive_name = self::index_data_base_name . $index_timestamp;
        if (file_exists(
            CRAWL_DIR . "/cache/{$index_archive_name}/no_network.txt")) {
            return true;
        }
    }
    /* bug fix: previously $machine_urls[0] was dereferenced even when the
       array was empty, triggering an undefined-offset notice; an empty
       machine list now cleanly yields false */
    return count($machine_urls) == 1 &&
        UrlParser::isLocalhostUrl($machine_urls[0]);
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object
 */
static function links($dom, $site)
{
    $sites = array();
    $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" .
        "relationships/hyperlink";
    $num_extracted = 0;
    $relationships = $dom->getElementsByTagName("Relationships");
    foreach ($relationships as $relationship) {
        $relations = $relationship->getElementsByTagName("Relationship");
        foreach ($relations as $relation) {
            // only Relationship nodes typed as hyperlinks carry urls
            if (strcmp($relation->getAttribute('Type'), $hyperlink) != 0) {
                continue;
            }
            if ($num_extracted >= MAX_LINKS_TO_EXTRACT) {
                continue;
            }
            $link = $relation->getAttribute('Target');
            $url = UrlParser::canonicalLink($link, $site);
            if (UrlParser::checkRecursiveUrl($url) ||
                strlen($url) >= MAX_URL_LEN) {
                continue;
            }
            // accumulate anchor text for urls seen more than once
            if (isset($sites[$url])) {
                $sites[$url] .= " " . $link;
            } else {
                $sites[$url] = $link;
            }
            $num_extracted++;
        }
    }
    return $sites;
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $hlink_nodes = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
    $num_seen = 0;
    foreach ($hlink_nodes as $hlink_node) {
        // note: counter advances per node, so the cap limits nodes
        // examined rather than links kept (matches original behavior)
        if ($num_seen < MAX_LINKS_TO_EXTRACT) {
            // the run's <t> element holds the link's display text/target
            $run_node = $hlink_node->parentNode->parentNode;
            $hlink = $run_node->getElementsByTagName("t")
                ->item(0)->nodeValue;
            $url = UrlParser::canonicalLink($hlink, $site);
            $url_len = strlen($url);
            if (!UrlParser::checkRecursiveUrl($url) &&
                $url_len < MAX_URL_LEN && $url_len > 0) {
                // accumulate anchor text for urls seen more than once
                if (isset($sites[$url])) {
                    $sites[$url] .= " " . $hlink;
                } else {
                    $sites[$url] = $hlink;
                }
            }
        }
        $num_seen++;
    }
    return $sites;
}
/**
 * Tries to determine the language of the document by looking at the
 * $sample_text and $url provided
 *
 * @param string $sample_text sample text to try guess the language from
 *     (unused by this implementation)
 * @param string $url url of web-page used to derive the language tag
 *
 * @return string language tag for guessed language, or NULL if no url
 *     was supplied
 */
static function calculateLang($sample_text = NULL, $url = NULL)
{
    // bug fix: $lang was previously uninitialized (an undefined-variable
    // notice and a garbage return) whenever $url was NULL
    $lang = NULL;
    if ($url != NULL) {
        /* NOTE(review): uses the document type (file extension) as the
           language tag — presumably intentional for source-code style
           processors where "language" means the programming language;
           confirm against UrlParser::getDocumentType and this
           processor's callers */
        $lang = UrlParser::getDocumentType($url);
    }
    return $lang;
}
/**
 * Returns links from the supplied dom object of a sitemap
 * where links have been canonicalized according to
 * the supplied $site information. We allow more links from a sitemap
 * than from other kinds of documents. For now we are ignoring weighting
 * info
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $xpath->registerNamespace('s',
        "http://www.sitemaps.org/schemas/sitemap/0.9");
    // <loc> entries of both plain sitemaps and sitemap indexes
    $loc_paths = array("/s:urlset/s:url/s:loc",
        "/s:sitemapindex/s:sitemap/s:loc");
    $num_links = 0;
    foreach ($loc_paths as $loc_path) {
        $loc_nodes = @$xpath->evaluate($loc_path);
        foreach ($loc_nodes as $loc_node) {
            $url = UrlParser::canonicalLink($loc_node->textContent, $site);
            //at this point we can't handle gzip'd sitemaps
            $unusable = ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                UrlParser::getDocumentType($url) == "gz" ||
                strlen($url) >= MAX_URL_LEN);
            if ($unusable) {
                continue;
            }
            $sites[$url] = "From sitemap of " . $site;
            $num_links++;
            if ($num_links > MAX_LINKS_PER_SITEMAP) {
                break 2;
            }
        }
    }
    return $sites;
}
/**
 * Parses argument data
 *
 * Validates the command-line style option map (url/source/slug/action/
 * output-dir/chapter ids/...), normalizes values, and stores the results
 * in this object's private fields. Exits the process, after printing a
 * console message, on any invalid input.
 *
 * @param array $data map of raw option name => value pairs
 */
private function parseData($data = array())
{
    // show help if requested and exit!
    if (isset($data['help'])) {
        require_once MANGA_ROOT_DIR . 'includes/templates/help/index.php';
        exit;
    }
    $data = is_array($data) ? $data : array();
    // image delay
    $this->setImageDelay(Input::array_value($data, 'image-delay', '',
        'trim'));
    // chapter delay
    $this->setChapterDelay(Input::array_value($data, 'chapter-delay', '',
        'trim'));
    // url: when given, it implies source + slug (and maybe a chapter)
    if (isset($data['url'])) {
        $url = trim($data['url']);
        if ($url == '') {
            consoleLineError("Url parameter cannot be empty!");
            exit;
        }
        $parsedData = UrlParser::parseUrl($url);
        if (!$parsedData) {
            consoleLineError("Provided url is not is not valid!");
            exit;
        } else {
            $data['source'] = $parsedData['source'];
            $data['slug'] = $parsedData['slug'];
            $chapter = trim($parsedData['chapter']);
            if ($chapter != '') {
                // a chapter in the url switches the action automatically
                $data['chapter-ids'] = $chapter;
                $data['action'] = self::ACTION_SPECIFIC_CHAPTERS;
            }
        }
    }
    // check for valid params
    $dataKeys = array_keys($data);
    $diff = array_diff($dataKeys, $this->_allowed_param_names);
    if (count($diff) > 0) {
        consoleLineError("Invalid params: " . join(',', $diff), 2);
        exit;
    }
    $this->_argumentsList = $data;
    // action (defaults to fetching new chapters)
    $action = Input::array_value($data, 'action', '', 'trim');
    if ($action == '') {
        $action = self::ACTION_NEW_CHAPTERS;
    }
    if (!$this->isValidAction($action)) {
        $this->displayInvalidActionMessage(TRUE);
    } else {
        $this->_action = $action;
        if ($this->_action == self::ACTION_SPECIFIC_CHAPTERS) {
            // specific-chapter mode requires at least one chapter id
            $chapterIds = Input::array_value($data, 'chapter-ids', '',
                'trim');
            if ($chapterIds == '') {
                consoleLineError('One or more chapter ids are required when action is "' . self::ACTION_SPECIFIC_CHAPTERS . '"');
                Console::emptyLines();
                exit;
            }
        }
    }
    // source
    $source = Input::array_value($data, 'source',
        MangaSourceList::SOUCE_MANGAPANDA, 'trim');
    if (MangaSourceList::getInstance()->isValidSource($source)) {
        $this->_source = $source;
    } else {
        MangaSourceList::getInstance()->displayInvalidMangaSourceMessage(
            TRUE);
    }
    // slug (mandatory; part of the chapters-list url)
    $slug = Input::array_value($data, 'slug', '', 'trim');
    if ($slug == '') {
        consoleLineError('Manga slug is required!', 2);
        consoleLinePurple('Example: --slug=nisekoi', 2);
        Console::writeMultiline('Slug usualy means the SEO friendly name of the manga. But it can be different for different manga sources.The slug is part of the manga chapters list url.');
        consoleLineInfo('');
        exit;
    }
    $this->_mangaSlug = $slug;
    // name (falls back to the slug)
    $name = Input::array_value($data, 'name', '', 'trim');
    if ($name == '') {
        $name = $this->_mangaSlug;
    }
    $this->_mangaName = $name;
    // Output dir: create it if absent, else probe writability with a
    // throw-away temp file
    $output_dir = Input::array_value($data, 'output-dir', '', 'trim');
    if ($output_dir == '') {
        $output_dir = './manga/' . $this->_source . '/' .
            $this->_mangaSlug . '/';
    }
    if (!is_dir($output_dir)) {
        if (!mkdir($output_dir, 0777, TRUE)) {
            consoleLineError("Unable to create output dir: " . $output_dir,
                2);
            consoleLineInfo('');
            exit;
        }
    } else {
        $tmpFile = tempnam($output_dir, 'mst-');
        if (!fopen($tmpFile, 'w')) {
            consoleLineError("Output dir is not writeable!" . $output_dir,
                2);
            consoleLineInfo('');
            exit;
        } else {
            @unlink($tmpFile);
        }
    }
    $this->_output_dir = $output_dir;
    # chapters count (negative values clamp to zero)
    $chaptersCount = Input::array_value_as_int($data, 'chapters-count', 0);
    if ($chaptersCount < 0) {
        $chaptersCount = 0;
    }
    $this->_chapters_count = $chaptersCount;
    # chapter ids: comma list, a readable file of ids, or ranges like 3-7
    $chapterIds = Input::array_value($data, 'chapter-ids', '', 'trim');
    if ($chapterIds == '') {
        $this->_chapter_ids = array();
    } else {
        // is it a file?
        if (is_readable($chapterIds)) {
            $chapterIds = trim(file_get_contents($chapterIds));
        }
        $chapterIds = explode(',', $chapterIds);
        $chapterIds = array_map('trim', $chapterIds);
        // check for ranges
        $chapterRangesIds = array();
        foreach ($chapterIds as $k => $v) {
            $cid = $chapterIds[$k];
            if (preg_match('/([0-9.]+)\\s*-\\s*([0-9.]+)/im', $cid,
                $regs)) {
                $chapterRangesIds[$k] = array('start' => $regs[1],
                    'end' => $regs[2]);
            }
        }
        if (count($chapterRangesIds) > 0) {
            // unset the range format entries first, as we are gonna get
            // real chapter ids from that range next
            foreach ($chapterRangesIds as $k => $rangeData) {
                unset($chapterIds[$k]);
            }
            // get available chapters from ranges
            foreach ($chapterRangesIds as $k => $rangeData) {
                $start = $rangeData['start'];
                $end = $rangeData['end'];
                for ($i = $start; $i <= $end; $i += 1) {
                    $chapterIds[] = $i;
                }
            }
        }
        // NOTE(review): asort() sorts ids as strings here; verify numeric
        // ordering is acceptable for decimal chapter ids
        asort($chapterIds);
        $chapterIds = array_unique($chapterIds);
        $this->_chapter_ids = $chapterIds;
    }
    # create cbr
    $createCbr = isset($data['create-cbr']) ? $data['create-cbr'] : TRUE;
    // NOTE(review): `type -p rar` prints nothing when rar is absent on
    // most shells, so this strpos() check may never fire — confirm the
    // target environment's shell behavior
    $result = strtolower(exec('type -p rar'));
    if (strpos($result, 'not found')) {
        consoleLineError('rar doesnt seem to be installed in the system!');
        $createCbr = FALSE;
    }
    $this->_create_cbr = $createCbr;
    if (!$this->_create_cbr) {
        consoleLineError('.cbr files will not be created!');
    }
    # no cbr backup (only meaningful when re-creating cbr archives)
    if ($this->_action == self::ACTION_RECREATE_CBR) {
        $this->_no_cbr_backup = isset($data['no-cbr-backup']) &&
            $data['no-cbr-backup'];
    }
}
/**
 * Tries to determine the language of the document by looking at the
 * $sample_text and $url provided
 *
 * @param string $sample_text sample text to try guess the language from
 * @param string $url url of web-page as a fallback look at the country
 *     to figure out language
 *
 * @return string language tag for guessed language ('en' or NULL)
 */
static function calculateLang($sample_text = NULL, $url = NULL)
{
    if ($url != NULL) {
        // a language embedded in the url wins outright
        $lang = UrlParser::getLang($url);
        if ($lang != NULL) {
            return $lang;
        }
    }
    $lang = NULL;
    if ($sample_text != NULL) {
        $words = mb_split("[[:space:]]|" . PUNCT, $sample_text);
        $num_words = count($words);
        $ascii_count = 0;
        foreach ($words as $word) {
            // byte length == char length means the word is pure ASCII
            if (strlen($word) == mb_strlen($word)) {
                $ascii_count++;
            }
        }
        // crude, but let's guess ASCII == english
        // bug fix: guard the division so an empty word list cannot
        // trigger a division-by-zero warning
        if ($num_words > 0 && $ascii_count / $num_words > EN_RATIO) {
            $lang = 'en';
        }
    }
    return $lang;
}
<?php
// Front controller: load path definitions, then the shared helpers
require_once './define.php';
require_once DIREC . '/common/common.php';
// Break the request's extra path info into routing components
$parse = new UrlParser();
$url = $parse->urlParse($_SERVER['PATH_INFO']);
// Hand the parsed route to the main controller for dispatch
$main = new MainController();
$main->run($url);
/**
 * Implements post processing of recipes. recipes are extracted
 * ingredients are scrubbed and recipes are clustered. The clustered
 * recipes are added back to the index.
 *
 * @param string $index_name index name of the current crawl.
 */
function postProcessing($index_name)
{
    global $INDEXING_PLUGINS;
    // clustering depends on SPL's heap classes; bail out when unavailable
    if (!class_exists("SplHeap")) {
        crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
        crawlLog("...Aborting plugin");
        return;
    }
    $locale_tag = guessLocale();
    setLocaleObject($locale_tag);
    $search_controller = new SearchController($INDEXING_PLUGINS);
    $query = "recipe:all i:{$index_name}";
    crawlLog("...Running Recipe Plugin!");
    crawlLog("...Finding docs tagged as recipes.");
    $more_docs = true;
    $raw_recipes = array();
    $limit = 0;
    $num = 100;
    // page through every doc tagged recipe:all, $num results at a time
    while ($more_docs) {
        $results = @$search_controller->queryRequest($query, $num, $limit,
            1, $index_name);
        if (isset($results["PAGES"]) &&
            ($num_results = count($results["PAGES"])) > 0) {
            $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
        }
        crawlLog("Scanning recipes {$limit} through " .
            ($limit + $num_results) . ".");
        $limit += $num_results;
        // all save points at -1 signals the result stream is exhausted
        if (isset($results["SAVE_POINT"])) {
            $end = true;
            foreach ($results["SAVE_POINT"] as $save_point) {
                if ($save_point != -1) {
                    $end = false;
                }
            }
            if ($end) {
                $more_docs = false;
            }
        } else {
            $more_docs = false;
        }
    }
    crawlLog("...Clustering.");
    // only cluster if would make more than one cluster
    if (count($raw_recipes) * CLUSTER_RATIO > 1) {
        $recipes = array();
        $i = 0;
        // keep only docs whose description splits into >1 '||' ingredient
        foreach ($raw_recipes as $raw_recipe) {
            $description = $raw_recipe[self::DESCRIPTION];
            $ingredients = explode("||", $description);
            if (is_array($ingredients) && count($ingredients) > 1) {
                $recipes[$i][0] = $raw_recipe[self::TITLE];
                $recipes[$i][1] = $ingredients;
                $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                $recipes[$i][3] = $raw_recipe;
                $i++;
            }
        }
        $recipes_ingredients = array();
        $count = count($recipes);
        // scrub each ingredient line down to its main ingredient name;
        // drop empty lines and section headers (lines ending in ':')
        foreach ($recipes as $key => $recipe) {
            foreach ($recipe[1] as $index => $ingredient) {
                if (strlen($ingredient) != 0 &&
                    substr($ingredient, strlen($ingredient) - 1) != ":") {
                    $mainIngredient =
                        $this->getIngredientName((string) $ingredient);
                    if (strlen($mainIngredient) != 0) {
                        $recipe[1][$index] = $mainIngredient;
                    } else {
                        unset($recipe[1][$index]);
                    }
                } else {
                    unset($recipe[1][$index]);
                }
            }
            $recipes[$key] = $recipe;
        }
        $count = count($recipes);
        $k = 0;
        // ingredients too common to distinguish recipes from one another
        $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
            'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
            'cream', 'soda', 'honey', 'powder', 'sauce', 'water',
            'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract',
            'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin',
            'ginger', 'basil', 'oregano', 'cinammon', 'cumin',
            'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens',
            'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix',
            'coffee', 'cookies', 'lime', 'chillies', 'cilantro',
            'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine',
            'cornmeal', 'nonstickspray');
        /* build an edge weight for every recipe pair: euclidean distance
           between 0/1 ingredient vectors, scaled up 1000x when the two
           recipes' main (title) ingredients differ */
        for ($i = 0; $i < $count; $i++) {
            $recipe1_main_ingredient = "";
            $recipe1 = $recipes[$i][1];
            $recipe_name = $recipes[$i][0];
            $recipe1_title = strtolower($recipes[$i][0]);
            $distinct_ingredients[$recipe_name] = $recipes[$i][1];
            $doc_keys[$recipe_name] = $recipes[$i][2];
            $recipes_summary[$recipe_name] = $recipes[$i][3];
            for ($j = $i + 1; $j < $count; $j++) {
                $recipe2_main_ingredient = "";
                $recipe2 = $recipes[$j][1];
                $recipe2_title = strtolower($recipes[$j][0]);
                $weights[$k][0] = $recipes[$i][0];
                $weights[$k][1] = $recipes[$j][0];
                $merge_array = array_merge($recipe1, $recipe2);
                $vector_array = array_unique($merge_array);
                sort($vector_array);
                $recipe1_vector = array_fill_keys($vector_array, 0);
                $recipe2_vector = array_fill_keys($vector_array, 0);
                foreach ($recipe1 as $ingredient) {
                    if ($ingredient != "" && !in_array($ingredient,
                        $basic_ingredients)) {
                        // an ingredient named in the title is "main"
                        if (strstr($recipe1_title, $ingredient)) {
                            $recipe1_main_ingredient = $ingredient;
                        }
                    }
                    $recipe1_vector[$ingredient] = 1;
                }
                foreach ($recipe2 as $ingredient) {
                    if ($ingredient != "" && !in_array($ingredient,
                        $basic_ingredients)) {
                        if (strstr($recipe2_title, $ingredient)) {
                            $recipe2_main_ingredient = $ingredient;
                        }
                    }
                    $recipe2_vector[$ingredient] = 1;
                }
                $edge_weight = 0;
                $matches = 1;
                foreach ($vector_array as $vector) {
                    $diff = $recipe1_vector[$vector] -
                        $recipe2_vector[$vector];
                    $vector_diff[$vector] = pow($diff, 2);
                    if (abs($diff) == 1) {
                        $matches += 1;
                    }
                    $edge_weight += $vector_diff[$vector];
                }
                $main_ingredient_match = 1;
                if ($recipe1_main_ingredient !=
                    $recipe2_main_ingredient) {
                    $main_ingredient_match = 1000;
                }
                $edge_weight = sqrt($edge_weight) * $matches *
                    $main_ingredient_match;
                $weights[$k][2] = $edge_weight;
                $k++;
            }
        }
        crawlLog("...Making new shard with clustered recipes as docs.");
        $clusters = kruskalClustering($weights, $count,
            $distinct_ingredients);
        $index_shard = new IndexShard("cluster_shard");
        $word_lists = array();
        $recipe_sites = array();
        // each cluster's last slot holds its 'ingredient' label, so only
        // the first count-1 entries are recipes
        foreach ($clusters as $cluster) {
            $count = count($cluster);
            for ($i = 0; $i < $count - 1; $i++) {
                $meta_ids = array();
                $summary = array();
                $recipe = $cluster[$i];
                $summary[self::URL] =
                    $recipes_summary[$recipe][self::URL];
                $summary[self::TITLE] =
                    $recipes_summary[$recipe][self::TITLE];
                $summary[self::DESCRIPTION] =
                    $recipes_summary[$recipe][self::DESCRIPTION];
                $summary[self::TIMESTAMP] =
                    $recipes_summary[$recipe][self::TIMESTAMP];
                $summary[self::ENCODING] =
                    $recipes_summary[$recipe][self::ENCODING];
                $summary[self::HASH] =
                    $recipes_summary[$recipe][self::HASH];
                // doc key = url hash + summary hash + "r"-prefixed host hash
                $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                $hash_rhost = "r" . substr(crawlHash(
                    UrlParser::getHost($summary[self::URL]) . "/", true),
                    1);
                $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                $summary[self::TYPE] =
                    $recipes_summary[$recipe][self::TYPE];
                $summary[self::HTTP_CODE] =
                    $recipes_summary[$recipe][self::HTTP_CODE];
                $recipe_sites[] = $summary;
                $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                crawlLog("ingredient:" . $cluster["ingredient"]);
                if (!$index_shard->addDocumentWords($doc_keys[$recipe],
                    self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids,
                    true, false)) {
                    crawlLog("Problem inserting recipe: " .
                        $summary[self::TITLE]);
                }
            }
        }
        // round-trip through save/load to normalize the shard's packing
        $shard_string = $index_shard->save(true);
        $index_shard = IndexShard::load("cluster_shard", $shard_string);
        unset($shard_string);
        crawlLog("...Adding recipe shard to index archive bundle");
        $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name .
            $index_name;
        $index_archive = new IndexArchiveBundle($dir, false);
        if ($index_shard->word_docs_packed) {
            $index_shard->unpackWordDocs();
        }
        $generation = $index_archive->initGenerationToAdd($index_shard);
        if (isset($recipe_sites)) {
            crawlLog("... Adding " . count($recipe_sites) .
                " recipe docs.");
            $index_archive->addPages($generation, self::SUMMARY_OFFSET,
                $recipe_sites, 0);
        }
        $k = 0;
        // map each doc key back to its stored summary offset
        foreach ($recipe_sites as $site) {
            $recipe = $site[self::TITLE];
            $hash = crawlHash($site[self::URL], true) .
                $site[self::HASH] . "r" . substr(crawlHash(
                UrlParser::getHost($site[self::URL]) . "/", true), 1);
            $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
        }
        $index_shard->changeDocumentOffsets($summary_offsets);
        $index_archive->addIndexData($index_shard);
        $index_archive->saveAndAddCurrentShardDictionary();
        $index_archive->dictionary->mergeAllTiers();
        $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' .
            self::index_data_base_name . $index_name);
        crawlLog("...Recipe plugin finished.");
    }
}
require_once(AT_INCLUDE_PATH.'lib/constants.inc.php');
require_once(AT_INCLUDE_PATH.'lib/mysql_connect.inc.php');
//require_once(AT_INCLUDE_PATH.'vitals.inc.php');
//mimic config variables, vitals.inc.php 135-140
/* get config variables. if they're not in the db then it uses the
   installation default value in constants.inc.php */
$sql = "SELECT * FROM ".TABLE_PREFIX."config";
$result = mysql_query($sql, $db);
while ($row = mysql_fetch_assoc($result)) {
    $_config[$row['name']] = $row['value'];
}
//Get path info
$pathinfo = getPathInfo();
// split the pretty url: element 0 is the course id, element 1 the
// remainder-of-path object used for dispatch below
$url_parser = new UrlParser($pathinfo);
$path_array = $url_parser->getPathArray();
$_pretty_url_course_id = $path_array[0];
$obj = $path_array[1];
// NOTE(review): this if-block is not closed within this chunk — it
// continues past the end of the excerpt
if (!$obj->isEmpty()){
    /*
     * Addresses the issue for relative uri
     * @refer to constants.inc.php $_rel_link
     */
    $_rel_url = $obj->redirect();
    $var_query = $obj->parsePrettyQuery();
    save2Get($var_query); //remake all the _GET and _REQUEST variables so that the vitals can use it
    $_user_location = ''; //reset user_location so that the vital file in each page would validate
    $pretty_current_page = $obj->getPage();
/**
 * Decides whether the links found inside the content at $url should be
 * analyzed and processed; this override simply defers to the parent
 * implementation.
 *
 * @param string $url url whose in-page links may be followed
 * @return boolean true if the url's links should be processed
 */
protected function isFollowUrl($url)
{
    $should_follow = parent::isFollowUrl($url);
    return $should_follow;
}
/**
 * Parses the contents of a robots.txt page extracting allowed,
 * disallowed paths, crawl-delay, and sitemaps. We also extract a
 * list of all user agent strings seen.
 *
 * @param string $page text string of a document
 * @param string $url location the document came from, not used by
 *     TextProcessor at this point. Some of its subclasses override
 *     this method and use url to produce complete links for
 *     relative links within a document
 *
 * @return array a summary of (title, description, links, and content) of
 *     the information in $page
 */
function process($page, $url)
{
    $summary = NULL;
    $summary[self::TITLE] = "";
    $summary[self::DESCRIPTION] = "";
    $summary[self::LANG] = NULL;
    $summary[self::ROBOT_PATHS] = array();
    $summary[self::AGENT_LIST] = array();
    $summary[self::LINKS] = array();
    $host_url = UrlParser::getHost($url);
    $lines = explode("\n", $page);
    $add_rule_state = false;
    $rule_added_flag = false;
    $delay_flag = false;
    $delay = 0;
    /* bug fix: $specificness was previously reset to 0 on every line of
       the robots.txt file, so the precedence an exact USER_AGENT_SHORT
       match should confer never persisted across lines and a later
       wildcard User-agent group could clobber the exact group's rules.
       It must live across the whole parse. */
    $specificness = 0;
    foreach ($lines as $pre_line) {
        // strip trailing comments
        $pre_line_parts = explode("#", $pre_line);
        $line = $pre_line_parts[0];
        $line_parts = explode(":", $line);
        if (!isset($line_parts[1])) {
            continue;
        }
        $field = array_shift($line_parts);
        // value may itself contain ':' (e.g. sitemap urls); re-join it
        $value = implode(":", $line_parts);
        //notice we lower case field, so switch below is case insensitive
        $field = strtolower(trim($field));
        $value = trim($value);
        if (strlen($value) == 0) {
            continue;
        }
        switch ($field) {
            case "user-agent":
                //we allow * in user agent string
                $summary[self::AGENT_LIST][] = $value;
                $current_specificness =
                    strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                if ($current_specificness < $specificness) {
                    // already saw an exact match; ignore wildcard groups
                    break;
                }
                if ($specificness < $current_specificness) {
                    //Give precedence to exact match on agent string
                    $specificness = $current_specificness;
                    $add_rule_state = true;
                    $summary[self::ROBOT_PATHS] = array();
                    break;
                }
                // otherwise, wildcard-match the agent pattern piecewise
                $agent_parts = explode("*", $value);
                $offset = 0;
                $add_rule_state = true;
                foreach ($agent_parts as $part) {
                    if ($part == "") {
                        continue;
                    }
                    $new_offset =
                        stripos(USER_AGENT_SHORT, $part, $offset);
                    if ($new_offset === false) {
                        $add_rule_state = false;
                        break;
                    }
                    $offset = $new_offset;
                }
                break;
            case "sitemap":
                $tmp_url = UrlParser::canonicalLink($value, $host_url);
                if (!UrlParser::checkRecursiveUrl($tmp_url) &&
                    strlen($tmp_url) < MAX_URL_LEN) {
                    $summary[self::LINKS][] = $tmp_url;
                }
                break;
            case "allow":
                if ($add_rule_state) {
                    $rule_added_flag = true;
                    $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] =
                        $this->makeCanonicalRobotPath($value);
                }
                break;
            case "disallow":
                if ($add_rule_state) {
                    $rule_added_flag = true;
                    $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][]
                        = $this->makeCanonicalRobotPath($value);
                }
                break;
            case "crawl-delay":
                if ($add_rule_state) {
                    $delay_flag = true;
                    // keep the largest delay any applicable group asks for
                    $delay = max($delay, intval($value));
                }
                break;
        }
    }
    if ($delay_flag) {
        if ($delay > MAXIMUM_CRAWL_DELAY) {
            // delay too large to honor: treat the whole site as disallowed
            $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
        } else {
            $summary[self::CRAWL_DELAY] = $delay;
        }
    }
    $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) .
        "</pre></body></html>";
    return $summary;
}
/**
 * Builds an inverted index shard (word --> {docs it appears in})
 * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
 * This inverted index shard is then merged by a queue_server
 * into the inverted index of the current generation of the crawl.
 * The complete inverted index for the whole crawl is built out of these
 * inverted indexes for generations. The point of computing a partial
 * inverted index on the fetcher is to reduce some of the computational
 * burden on the queue server. The resulting mini index computed by
 * buildMiniInvertedIndex() is stored in
 * $this->found_sites[self::INVERTED_INDEX]
 *
 */
function buildMiniInvertedIndex()
{
    $start_time = microtime();
    /* padding appended to USER_RANKS score keys to reach an 8-byte
       boundary.
       NOTE(review): $keypad is the empty string here, so the append
       below is a no-op -- presumably it should contain pad bytes;
       confirm against the queue_server's expectations */
    $keypad = "";
    crawlLog(" Start building mini inverted index ... Current Memory:" .
        memory_get_usage());
    $num_seen = count($this->found_sites[self::SEEN_URLS]);
    $this->num_seen_sites += $num_seen;
    /*
        for the fetcher we are not saving the index shards so name
        doesn't matter.
    */
    if (!isset($this->found_sites[self::INVERTED_INDEX][
        $this->current_server])) {
        $this->found_sites[self::INVERTED_INDEX][$this->current_server] =
            new IndexShard("fetcher_shard_{$this->current_server}");
    }
    for ($i = 0; $i < $num_seen; $i++) {
        $interim_time = microtime();
        $site = $this->found_sites[self::SEEN_URLS][$i];
        // skip summaries without a hash and pages flagged JUSTFOLLOW
        // (links only, page itself should not be indexed)
        if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) &&
            in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
            continue;
        }
        $doc_rank = false;
        if ($this->crawl_type == self::ARCHIVE_CRAWL &&
            isset($this->archive_iterator)) {
            // archive recrawls take the document's weight from the iterator
            $doc_rank = $this->archive_iterator->weight($site);
        }
        if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
            /* link pseudo-documents repurpose summary fields:
               HTTP_CODE holds the doc keys, TITLE holds the url,
               HASH holds a |-separated link id */
            $is_link = true;
            $doc_keys = $site[self::HTTP_CODE];
            $site_url = $site[self::TITLE];
            $host = UrlParser::getHost($site_url);
            $link_parts = explode('|', $site[self::HASH]);
            if (isset($link_parts[5])) {
                $link_origin = $link_parts[5];
            } else {
                $link_origin = $site_url;
            }
            $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host,
                $site[self::DESCRIPTION], $link_origin);
        } else {
            $is_link = false;
            // '|' is reserved as a field separator in link ids
            $site_url = str_replace('|', "%7C", $site[self::URL]);
            $host = UrlParser::getHost($site_url);
            $doc_keys = crawlHash($site_url, true) .
                $site[self::HASH] . "d" .
                substr(crawlHash($host . "/", true), 1);
            $meta_ids = PhraseParser::calculateMetas($site,
                $this->video_sources);
        }
        $word_lists = array();
        /*
            self::JUST_METAS check to avoid getting sitemaps in results
            for popular words
        */
        $lang = NULL;
        if (!isset($site[self::JUST_METAS])) {
            $host_words = UrlParser::getWordsIfHostUrl($site_url);
            $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
            if ($is_link) {
                $phrase_string = $site[self::DESCRIPTION];
            } else {
                if (isset($site[self::LANG])) {
                    // source code pages: index only the raw description,
                    // not url-derived words
                    if (isset($this->programming_language_extension[
                        $site[self::LANG]])) {
                        $phrase_string = $site[self::DESCRIPTION];
                    } else {
                        $phrase_string = $host_words . " " .
                            $site[self::TITLE] . " " . $path_words .
                            " " . $site[self::DESCRIPTION];
                    }
                } else {
                    $phrase_string = $host_words . " " .
                        $site[self::TITLE] . " " . $path_words . " " .
                        $site[self::DESCRIPTION];
                }
            }
            if (isset($site[self::LANG])) {
                $lang = guessLocaleFromString(mb_substr(
                    $site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH),
                    $site[self::LANG]);
            }
            $word_lists = PhraseParser::extractPhrasesInLists(
                $phrase_string, $lang);
            $len = strlen($phrase_string);
            // source code is always treated as safe; otherwise use the
            // statistical safe-search score with threshold 0.012
            if (isset($this->programming_language_extension[$lang]) ||
                PhraseParser::computeSafeSearchScore($word_lists, $len) <
                0.012) {
                $meta_ids[] = "safe:true";
                $safe = true;
            } else {
                $meta_ids[] = "safe:false";
                $safe = false;
            }
        }
        if (!$is_link) {
            //store inlinks so they can be searched by
            $num_links = count($site[self::LINKS]);
            if ($num_links > 0) {
                $link_rank = false;
                if ($doc_rank !== false) {
                    // outgoing links rank one below the doc, floor of 1
                    $link_rank = max($doc_rank - 1, 1);
                }
            } else {
                $link_rank = false;
            }
        }
        /* NOTE(review): when $is_link is true, $link_rank is not set in
           this iteration; the link loop below would then read a stale or
           unset value -- presumably link-type summaries never carry
           self::LINKS; confirm */
        $num_queue_servers = count($this->queue_servers);
        if (isset($site[self::USER_RANKS]) &&
            count($site[self::USER_RANKS]) > 0) {
            // serialize user-supplied scores and append to the doc keys
            $score_keys = "";
            foreach ($site[self::USER_RANKS] as $label => $score) {
                $score_keys .= packInt($score);
            }
            if (strlen($score_keys) % 8 != 0) {
                $score_keys .= $keypad;
            }
            $doc_keys .= $score_keys;
        }
        $this->found_sites[self::INVERTED_INDEX][$this->current_server
            ]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
            $word_lists, $meta_ids, PhraseParser::$materialized_metas,
            true, $doc_rank);
        /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already will appear
            in what indexing, so don't index again. $site[self::JUST_META]
            is set when have a sitemap or robots.txt (this case set
            later). In this case link info is not particularly useful
            for indexing and can greatly slow building inverted index.
        */
        if (!$this->no_process_links && !isset($site[self::JUST_METAS]) &&
            !isset($this->programming_language_extension[$lang])) {
            foreach ($site[self::LINKS] as $url => $link_text) {
                /* this mysterious check means won't index links from
                    robots.txt. Sitemap will still be in TO_CRAWL, but
                    that's done elsewhere
                */
                if (strlen($url) == 0 || is_numeric($url)) {
                    continue;
                }
                $link_host = UrlParser::getHost($url);
                if (strlen($link_host) == 0) {
                    continue;
                }
                // links are sharded by host across queue servers
                $part_num = calculatePartition($link_host,
                    $num_queue_servers);
                $summary = array();
                if (!isset($this->found_sites[
                    self::LINK_SEEN_URLS][$part_num])) {
                    $this->found_sites[self::LINK_SEEN_URLS][$part_num] =
                        array();
                }
                // external link iff the link's host differs from the
                // page's host
                $elink_flag = $link_host != $host ? true : false;
                $link_text = strip_tags($link_text);
                $ref = $elink_flag ? "eref" : "iref";
                $url = str_replace('|', "%7C", $url);
                $link_id = "url|" . $url . "|text|" .
                    urlencode($link_text) . "|{$ref}|" . $site_url;
                $elink_flag_string = $elink_flag ? "e" : "i";
                $link_keys = crawlHash($url, true) .
                    crawlHash($link_id, true) . $elink_flag_string .
                    substr(crawlHash($host . "/", true), 1);
                $summary[self::URL] = $link_id;
                $summary[self::TITLE] = $url;
                // stripping html to be on the safe side
                $summary[self::DESCRIPTION] = $link_text;
                $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                $summary[self::ENCODING] = $site[self::ENCODING];
                $summary[self::HASH] = $link_id;
                $summary[self::TYPE] = "link";
                $summary[self::HTTP_CODE] = $link_keys;
                $summary[self::LANG] = $lang;
                $this->found_sites[self::LINK_SEEN_URLS][$part_num][] =
                    $summary;
                $link_word_lists = PhraseParser::extractPhrasesInLists(
                    $link_text, $lang);
                $link_meta_ids = PhraseParser::calculateLinkMetas($url,
                    $link_host, $link_text, $site_url);
                if (!isset($this->found_sites[
                    self::INVERTED_INDEX][$part_num])) {
                    $this->found_sites[self::INVERTED_INDEX][$part_num] =
                        new IndexShard("fetcher_shard_{$part_num}");
                }
                $this->found_sites[self::INVERTED_INDEX][
                    $part_num]->addDocumentWords($link_keys,
                    self::NEEDS_OFFSET_FLAG, $link_word_lists,
                    $link_meta_ids, PhraseParser::$materialized_metas,
                    false, $link_rank);
            }
        }
        // warn about slow documents and emit periodic progress messages
        $iterim_elapse = changeInMicrotime($interim_time);
        if ($iterim_elapse > 5) {
            crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
        }
        crawlTimeoutLog("..Still building inverted index. Have processed " .
            "%s of %s documents.\nLast url processed was %s.",
            $i, $num_seen, $site[self::URL]);
    }
    if ($this->crawl_type == self::ARCHIVE_CRAWL) {
        $this->recrawl_check_scheduler = true;
    }
    crawlLog(" Build mini inverted index time " .
        changeInMicrotime($start_time));
}
/** * Used to compute all the meta ids for a given link with $url * and $link_text that was on a site with $site_url. * * @param string $url url of the link * @param string $link_host url of the host name of the link * @param string $link_text text of the anchor tag link came from * @param string $site_url url of the page link was on */ static function calculateLinkMetas($url, $link_host, $link_text, $site_url) { global $IMAGE_TYPES; $link_meta_ids = array(); if (strlen($link_host) == 0) { continue; } if (substr($link_text, 0, 9) == "location:") { $location_link = true; $link_meta_ids[] = $link_text; $link_meta_ids[] = "location:all"; $link_meta_ids[] = "location:" . crawlHash($site_url); } $link_type = UrlParser::getDocumentType($url); $link_meta_ids[] = "media:all"; $link_meta_ids[] = "safe:all"; if (in_array($link_type, $IMAGE_TYPES)) { $link_meta_ids[] = "media:image"; if (isset($safe) && !$safe) { $link_meta_ids[] = "safe:false"; } } else { $link_meta_ids[] = "media:text"; } $link_meta_ids[] = "link:all"; return $link_meta_ids; }
/**
 * If pretty URL within admin config is switched on, applies pretty URL
 * to all the links in ATutor. This function authenticates itself against
 * the current page: admin, login, and registration pages shouldn't have
 * pretty url applied. To use url_rewrite on those pages anyway, force it
 * via the third parameter. Note: if the system config has this feature
 * turned off, $force has no effect.
 *
 * @param string $url the url to rewrite; should be a relative link
 * @param boolean $is_rewriting_header AT_PRETTY_URL_IS_HEADER when the
 *     result is used in a php header('Location:..') call (an absolute
 *     path is then needed); AT_PRETTY_URL_NOT_HEADER (default) otherwise
 * @param boolean $force true to force the url_rewrite, false (default)
 *     otherwise
 * @return string the (possibly prettified) url
 * @author Harris Wong
 */
function url_rewrite($url, $is_rewriting_header = AT_PRETTY_URL_NOT_HEADER,
    $force = false)
{
    global $_config, $db;
    $url_parser = new UrlParser();
    $pathinfo = $url_parser->getPathArray();
    /* If this is any kind of admins, don't prettify the url
     * $_SESSION['is_guest'] is used to check against login/register/browse
     * page, the links on this page will only be prettified when a user has
     * logged in. Had used $_SESSION[valid_user] before but it created this
     * problem: http://www.atutor.ca/atutor/mantis/view.php?id=3426
     */
    if ($force || isset($_SESSION['course_id']) &&
        $_SESSION['course_id'] > 0) {
        //if course id is defined, apply pretty url.
    } else {
        if (admin_authenticate(AT_ADMIN_PRIV_ADMIN, AT_PRIV_RETURN) ||
            isset($_SESSION['privileges']) &&
            admin_authenticate($_SESSION['privileges'], AT_PRIV_RETURN) ||
            isset($_SESSION['is_guest']) && $_SESSION['is_guest'] == 1) {
            return $url;
        }
    }
    //if we allow pretty url in the system
    if ($_config['pretty_url'] > 0) {
        $course_id = 0;
        //If we allow course dir name from sys perf
        if ($_config['course_dir_name'] > 0) {
            if (preg_match('/bounce.php\\?course=([\\d]+)$/', $url,
                $matches) == 1) {
                // bounce has the highest priority, even if session is set,
                // work on bounce first.
                $course_id = $url_parser->getCourseDirName($matches[1]);
            } elseif (isset($_REQUEST['course'])) {
                //jump menu
                $course_id = $url_parser->getCourseDirName(
                    $_REQUEST['course']);
            } elseif (isset($_REQUEST['p_course'])) {
                // is set when guests access public course. @see bounce.php
                $course_id = $url_parser->getCourseDirName(
                    $_REQUEST['p_course']);
            } elseif (isset($_SESSION['course_id']) &&
                $_SESSION['course_id'] > 0) {
                $course_id = $url_parser->getCourseDirName(
                    $_SESSION['course_id']);
            }
        } else {
            if (isset($_SESSION['course_id'])) {
                $course_id = $_SESSION['course_id'];
            }
        }
        $url = $pathinfo[1]->convertToPrettyUrl($course_id, $url);
    } elseif ($_config['course_dir_name'] > 0) {
        //enabled course directory name, disabled pretty url
        if (preg_match('/bounce.php\\?course=([\\d]+)$/', $url,
            $matches) == 1) {
            // bounce has the highest priority, even if session is set,
            // work on bounce first.
            $course_id = $url_parser->getCourseDirName($matches[1]);
        } elseif (isset($_REQUEST['course'])) {
            $course_id = $url_parser->getCourseDirName($_REQUEST['course']);
        } elseif (isset($_REQUEST['p_course'])) {
            /* is set when guests access public course. @see bounce.php
               FIX: this comment used to be split across a physical line
               break, leaving "@see bounce.php" in code position -- a
               parse error */
            $course_id = $url_parser->getCourseDirName(
                $_REQUEST['p_course']);
        } elseif (isset($_SESSION['course_id']) &&
            $_SESSION['course_id'] > 0) {
            $course_id = $url_parser->getCourseDirName(
                $_SESSION['course_id']);
        }
        $url = $pathinfo[1]->convertToPrettyUrl($course_id, $url);
    }
    //instead of putting AT_BASE_HREF in all the headers location, we will
    //put it here.
    //Abs paths are required for pretty url because otherwise the url
    //location will be appeneded.
    //ie. ATutor_161/blogs/CoURSe_rOAd/blogs/view.php/ot/1/oid/1/ instead of
    // ATutor_161/CoURSe_rOAd/blogs/view.php/ot/1/oid/1/
    if ($is_rewriting_header == true) {
        return AT_BASE_HREF . $url;
    }
    return $url;
}
/**
 * Takes page summaries for RSS pages and the current query
 * and draws list of news links and a link to the news link subsearch
 * page if applicable.
 *
 * @param array $feed_pages page data from news feeds
 * @param string $base_query the query_string prefix
 * @param string $query the current search query
 * @param string $subsearch name of subsearch page this image group on
 * @param boolean $open_in_tabs whether new links should be opened in
 *     tabs
 */
function render($feed_pages, $base_query, $query, $subsearch,
    $open_in_tabs = false)
{
    // when not already on the news subsearch, draw a heading linking to it
    // and render compact "blockquote" entries instead of full results
    if ($subsearch != 'news') {
        $not_news = true;
        ?>
        <h2><a href="<?php e("{$base_query}&q={$query}&s=news"); ?>
        " ><?php e(tl('feeds_helper_view_feed_results', $query)); ?>
        </a></h2>
        <?php
    } else {
        $not_news = false;
    }
    ?>
    <div class="feed-list">
    <?php
    $time = time();
    foreach ($feed_pages as $page) {
        // publication timestamp stored in the summary offset structure
        $pub_date = $page[self::SUMMARY_OFFSET][0][4];
        /* double urlencode: the source name is embedded in a query string
           NOTE(review): presumably it is decoded twice downstream --
           confirm against the search controller */
        $encode_source = urlencode(urlencode($page[self::SOURCE_NAME]));
        if (isset($page[self::URL])) {
            if (strncmp($page[self::URL], "url|", 4) == 0) {
                // URL field holds a link id of the form url|<url>|text|...
                $url_parts = explode("|", $page[self::URL]);
                $url = $url_parts[1];
                $title = UrlParser::simplifyUrl($url, 60);
                $subtitle = "title='" . $page[self::URL] . "'";
            } else {
                $url = $page[self::URL];
                $title = $page[self::TITLE];
                if (strlen(trim($title)) == 0) {
                    // fall back to a shortened url when title is blank
                    $title = UrlParser::simplifyUrl($url, 60);
                }
                $subtitle = "";
            }
        } else {
            $url = "";
            $title = isset($page[self::TITLE]) ? $page[self::TITLE] : "";
            $subtitle = "";
        }
        // convert the raw timestamp to a human-readable relative date
        $pub_date = $this->getPubdateString($time, $pub_date);
        if ($not_news) {
            /* compact entry; the </a> and <span> are emitted from within
               the echoed string.
               NOTE(review): the bare </span> after the ?> below appears
               unmatched in the output -- confirm before changing markup */
            ?>
            <div class="blockquote">
            <a href="<?php e($page[self::URL]); ?>
            " rel="nofollow" <?php if ($open_in_tabs) { ?>
            target="_blank" <?php } ?>
            ><?php e($page[self::TITLE]); ?>
            </a> <a class="gray-link" rel='nofollow'
            href="<?php e($base_query . "&q=media:news:" .
                $encode_source . "&s=news"); ?>
            " ><?php e($page[self::SOURCE_NAME] . "</a>" .
                "<span class='gray'> - {$pub_date}</span>"); ?>
            </span>
            </div>
            <?php
        } else {
            // full search-result style entry with link, source, and
            // description paragraph
            ?>
            <div class="results">
            <h2><a href="<?php e($page[self::URL]); ?>
            " rel="nofollow" <?php if ($open_in_tabs) { ?>
            target="_blank" <?php } ?>
            ><?php e($page[self::TITLE]); ?>
            </a>. <a class="gray-link" rel='nofollow'
            href="<?php e($base_query . "&q=media:news:" .
                $encode_source . "&s=news"); ?>
            " ><?php e($page[self::SOURCE_NAME] . "</a>" .
                "<span class='gray'> - {$pub_date}</span>"); ?>
            </h2>
            <p class="echo-link" <?php e($subtitle); ?>
            ><?php e(UrlParser::simplifyUrl($url, 100) . " "); ?>
            </p>
            <?php
            $description = isset($page[self::DESCRIPTION]) ?
                $page[self::DESCRIPTION] : "";
            e("<p>{$description}</p>");
            ?>
            </div>
            <?php
        }
    }
    ?>
    </div>
    <?php
}
/** * Guess mime type based on extension of the file * * @param string $file_name name of the file * @return string $mime_type for the given file name */ static function guessMimeTypeFromFileName($file_name) { $mime_type_map = array("bmp" => 'image/bmp', "doc" => 'application/msword', "epub" => 'application/epub+zip', "gif" => 'image/gif', "asp" => 'text/asp', "aspx" => 'text/asp', 'cgi' => 'text/html', "cfm" => 'text/html', "cfml" => 'text/html', "do" => 'text/html', "htm" => 'text/html', "html" => 'text/html', "jsp" => 'text/html', "php" => 'text/html', "pl" => 'text/html', "java" => 'text/java', "py" => 'text/py', "shtml" => 'text/html', "jpg" => 'image/jpeg', "jpeg" => 'image/jpeg', "pdf" => 'application/pdf', "png" => 'image/png', "ppt" => 'application/vnd.ms-powerpoint', "pptx" => 'application/vnd.openxmlformats-officedocument.' . 'presentationml.presentation', "rss" => 'application/rss+xml', "rtf" => 'text/rtf', "svg" => 'image/svg+xml', "csv" => 'text/csv', "tab" => 'text/tab-separated-values', "tsv" => 'text/x-java-source', "txt" => 'text/plain', "xlsx" => 'application/vnd.openxmlformats-officedocument.' . 'spreadsheetml.sheet', "xml" => 'text/gitxml', "js" => 'text/plain', "c" => 'text/plain', "cc" => 'text/plain', "cs" => 'text/plain'); $extension = UrlParser::getDocumentType($file_name); if (isset($mime_type_map[$extension])) { $mime_type = $mime_type_map[$extension]; } else { $mime_type = "text/plain"; } return $mime_type; }
/** * Gets the cached version of a web page from the machine on which it was * fetched. * * Complete cached versions of web pages typically only live on a fetcher * machine. The queue server machine typically only maintains summaries. * This method makes a REST request of a fetcher machine for a cached page * and get the results back. * * @param string $machine the ip address of domain name of the machine the * cached page lives on * @param string $machine_uri the path from document root on $machine where * the yioop scripts live * @param int $partition the partition in the WebArchiveBundle the page is * in * @param int $offset the offset in bytes into the WebArchive partition in * the WebArchiveBundle at which the cached page lives. * @param string $crawl_time the timestamp of the crawl the cache page is * from * @param int $instance_num which fetcher instance for the particular * fetcher crawled the page (if more than one), false otherwise * @return array page data of the cached page */ function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false) { $time = time(); $session = md5($time . AUTH_KEY); if ($machine == '::1') { //IPv6 :( $machine = "[::1]"; //used if the fetching and queue serving were on the same machine } // we assume all machines use the same scheme & port of the name server $port = UrlParser::getPort(NAME_SERVER); $scheme = UrlParser::getScheme(NAME_SERVER); $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" . "time={$time}&session={$session}&partition={$partition}&offset={$offset}" . "&crawl_time={$crawl_time}"; if ($instance_num !== false) { $request .= "&instance_num={$instance_num}"; } $tmp = FetchUrl::getPage($request); $page = @unserialize(base64_decode($tmp)); $page['REQUEST'] = $request; return $page; }