/**
 * Builds the data for the settings view and applies submitted changes.
 *
 * Populates the selectable languages, results-per-page values and
 * crawl/mix indexes. When the request carries a valid CSRF token, the
 * `lang`, `perpage` and `open_in_tabs` form fields are stored in the
 * session, but only if the submitted value is one of the offered choices.
 * If anything changed, a confirmation message is queued for the view and,
 * when the user is not identified by IP address, the session is saved
 * through the user model.
 */
function processRequest()
{
    $data = array();
    $view = "settings";
    $changed_settings_flag = false;
    $crawl_model = $this->model("crawl");
    if (isset($_SESSION['USER_ID'])) {
        $user = $_SESSION['USER_ID'];
        $data['ADMIN'] = 1;
    } else {
        $user = $_SERVER['REMOTE_ADDR'];
    }
    $data[CSRF_TOKEN] = $this->generateCSRFToken($user);
    $token_okay = $this->checkCSRFToken(CSRF_TOKEN, $user);

    // Start from an empty table so the whitelist lookup below is valid
    // even when the locale model returns no locales.
    $data['LANGUAGES'] = array();
    $languages = $this->model("locale")->getLocaleList();
    foreach ($languages as $language) {
        $data['LANGUAGES'][$language['LOCALE_TAG']] =
            $language['LOCALE_NAME'];
    }
    // Exact key lookup (not a loose in_array) so only values that were
    // actually offered in the form are accepted.
    if ($token_okay && isset($_REQUEST['lang'])
        && (is_string($_REQUEST['lang']) || is_int($_REQUEST['lang']))
        && array_key_exists($_REQUEST['lang'], $data['LANGUAGES'])) {
        $_SESSION['l'] = $_REQUEST['lang'];
        setLocaleObject($_SESSION['l']);
        $changed_settings_flag = true;
    }
    $data['LOCALE_TAG'] = getLocaleTag();

    $n = NUM_RESULTS_PER_PAGE;
    $data['PER_PAGE'] = array(
        $n => $n,
        2 * $n => 2 * $n,
        5 * $n => 5 * $n,
        10 * $n => 10 * $n
    );
    if ($token_okay && isset($_REQUEST['perpage'])
        && (is_string($_REQUEST['perpage'])
            || is_int($_REQUEST['perpage']))
        && array_key_exists($_REQUEST['perpage'], $data['PER_PAGE'])) {
        $_SESSION['MAX_PAGES_TO_SHOW'] = $_REQUEST['perpage'];
        $changed_settings_flag = true;
    }
    if (isset($_SESSION['MAX_PAGES_TO_SHOW'])) {
        $data['PER_PAGE_SELECTED'] = $_SESSION['MAX_PAGES_TO_SHOW'];
    } else {
        $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
    }

    // The presence of 'perpage' marks a submitted settings form; an
    // unchecked checkbox sends nothing, so absence means "off".
    if ($token_okay && isset($_REQUEST['perpage'])) {
        $_SESSION['OPEN_IN_TABS'] = isset($_REQUEST['open_in_tabs']);
    }
    if (isset($_SESSION['OPEN_IN_TABS'])) {
        $data['OPEN_IN_TABS'] = $_SESSION['OPEN_IN_TABS'];
    } else {
        $data['OPEN_IN_TABS'] = false;
    }

    $machine_urls = $this->model("machine")->getQueueServerUrls();
    $crawls = $crawl_model->getCrawlList(false, true, $machine_urls, true);
    $data['CRAWLS'] = array();
    foreach ($crawls as $crawl) {
        $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'] .
            " ... " . $crawl['COUNT'] . " urls";
    }
    $mixes = $crawl_model->getMixList($user);
    foreach ($mixes as $mix) {
        $data['CRAWLS'][$mix['TIMESTAMP']] = $mix['NAME'] . " ... " .
            tl('settings_controller_crawl_mix');
    }

    if ($token_okay) {
        // Keep a change recorded above (language / per page) even when
        // this call reports that it changed nothing itself.
        $changed_settings_flag = $this->loggedInChangeSettings($data)
            || $changed_settings_flag;
    } elseif (isset($_REQUEST['its'])
        && (is_string($_REQUEST['its']) || is_int($_REQUEST['its']))
        && array_key_exists($_REQUEST['its'], $data['CRAWLS'])) {
        $data['its'] = $_REQUEST['its'];
    } else {
        $data['its'] = $crawl_model->getCurrentIndexDatabaseName();
    }

    if ($changed_settings_flag) {
        $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
            tl('settings_controller_settings_saved') . "</h1>')";
        if ($user != $_SERVER['REMOTE_ADDR']) {
            $this->model("user")->setUserSession($user, $_SESSION);
        }
    }
    $this->displayView($view, $data);
}
/**
 * Runs the QueryTool on the supplied command line arguments.
 *
 * Reads the query from $argv[1], an optional results-per-page value from
 * $argv[2] (default 10) and an optional limit from $argv[3] (default 0),
 * passes them to a SearchController and prints every returned page
 * followed by the query statistics.
 */
function start()
{
    global $argv, $INDEXING_PLUGINS;

    if (!isset($argv[1])) {
        $this->usageMessageAndExit();
    }
    $query = $argv[1];
    $results_per_page = 10;
    if (isset($argv[2])) {
        $results_per_page = $argv[2];
    }
    $limit = 0;
    if (isset($argv[3])) {
        $limit = $argv[3];
    }

    setLocaleObject(getLocaleTag());
    $start_time = microtime();
    $controller = new SearchController($INDEXING_PLUGINS);
    $data = $controller->queryRequest($query, $results_per_page, $limit);
    if (!isset($data['PAGES'])) {
        $data['PAGES'] = array();
    }

    $separator = "============\n";
    // label => page field, printed in this order after the description
    $score_fields = array(
        "Rank" => self::DOC_RANK,
        "Relevance" => self::RELEVANCE,
        "Proximity" => self::PROXIMITY,
        "Score" => self::SCORE
    );
    foreach ($data['PAGES'] as $page) {
        $ip_list = "";
        if (isset($page[self::IP_ADDRESSES])) {
            foreach ($page[self::IP_ADDRESSES] as $address) {
                $ip_list .= $address . " ";
            }
        }
        echo $separator;
        echo "TITLE: " . trim($page[self::TITLE]) . "\n";
        echo "URL: " . trim($page[self::URL]) . "\n";
        echo "IPs: " . $ip_list . "\n";
        echo "DESCRIPTION: " .
            wordwrap(trim($page[self::DESCRIPTION])) . "\n";
        foreach ($score_fields as $label => $field) {
            echo $label . ": " . $page[$field] . "\n";
        }
        echo $separator . "\n";
    }

    $data['ELAPSED_TIME'] = changeInMicrotime($start_time);
    echo "QUERY STATISTICS\n";
    echo $separator;
    echo "ELAPSED TIME: " . $data['ELAPSED_TIME'] . "\n";
    if (isset($data['LIMIT'])) {
        echo "LOW: " . $data['LIMIT'] . "\n";
    }
    if (isset($data['HIGH'])) {
        // The printed upper bound is recomputed from the limit and page
        // size, capped at the total number of rows.
        $high = min($data['TOTAL_ROWS'],
            $data['LIMIT'] + $data['RESULTS_PER_PAGE']);
        echo "HIGH: " . $high . "\n";
    }
    if (isset($data['TOTAL_ROWS'])) {
        echo "TOTAL ROWS: " . $data['TOTAL_ROWS'] . "\n";
    }
    if (isset($data['ERROR'])) {
        echo $data['ERROR'] . "\n";
    }
}
/**
 * Responsible for handling admin requests related to the configure activity
 *
 * The configure activity allows a user to set the work directory for
 * storing data local to this SeekQuarry/Yioop instance. It also allows one
 * to set the default language of the installation, dbms info, robot info,
 * test info, as well as which machine acts as the queue server.
 *
 * The sub activity is chosen by $_REQUEST['arg']:
 *  - "directory": use (or create and initialise) the work directory
 *  - "profile": save the submitted profile fields and uploaded resources
 *  - "reset": restore the appearance related profile fields to defaults
 *  - anything else: just load the profile of the current work directory
 *
 * @return array $data fields for available language, dbms, etc as well as
 *     results of processing sub activity if any
 */
function configure()
{
    $parent = $this->parent;
    $profile_model = $parent->model("profile");
    $group_model = $parent->model("group");
    $data = array();
    $profile = array();
    $data['SYSTEM_CHECK'] = $this->systemCheck();
    $languages = $parent->model("locale")->getLocaleList();
    foreach ($languages as $language) {
        $data['LANGUAGES'][$language['LOCALE_TAG']] =
            $language['LOCALE_NAME'];
    }
    if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
        $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
        $profile['DEFAULT_LOCALE'] = $data['lang'];
        setLocaleObject($data['lang']);
    }
    $data["ELEMENT"] = "configure";
    $data['SCRIPT'] = "";
    $data['PROFILE'] = false;

    // Base url used for the default favicon. Neither constant is read
    // unless it is defined, so a missing one cannot abort the request.
    if (defined("BASE_URL")) {
        $base_url = BASE_URL;
    } elseif (defined("NAME_SERVER")) {
        $base_url = NAME_SERVER;
    } else {
        $base_url = "";
    }

    $fix_name_server = defined('WORK_DIRECTORY')
        && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER;
    if (isset($_REQUEST['WORK_DIRECTORY']) || $fix_name_server) {
        if ($fix_name_server && !isset($_REQUEST['WORK_DIRECTORY'])) {
            // Force the "directory" sub activity on the configured work
            // directory, removing its profile first (best effort).
            $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
            $_REQUEST['arg'] = "directory";
            @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
        }
        $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
        // isset() on the offsets keeps empty and one character paths from
        // triggering "uninitialized string offset" errors.
        $is_absolute = isset($dir[0]) && $dir[0] == "/";
        if (strstr(PHP_OS, "WIN")) {
            //convert to forward slashes so consistent with rest of code
            $dir = str_replace("\\", "/", $dir);
            $is_absolute = (isset($dir[0]) && $dir[0] == "/")
                || (isset($dir[1]) && $dir[1] == ":");
        }
        $data['PROFILE'] = $is_absolute;
        if ($data['PROFILE'] == false) {
            $data["MESSAGE"] =
                tl('system_component_configure_use_absolute_path');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        // The work directory must not live inside the code base.
        if (strstr($dir . "/", BASE_DIR . "/")) {
            $data['PROFILE'] = false;
            $data["MESSAGE"] =
                tl('system_component_configure_configure_diff_base_dir');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        $data['WORK_DIRECTORY'] = $dir;
    } elseif (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0
        && strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0
        && (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
        $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
        $data['PROFILE'] = true;
    }

    $arg = "";
    if (isset($_REQUEST['arg'])) {
        $arg = $_REQUEST['arg'];
    }
    switch ($arg) {
        case "directory":
            if (!isset($data['WORK_DIRECTORY'])) {
                break;
            }
            if ($data['PROFILE']
                && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                // An existing profile: just point the config at it.
                $data = array_merge($data,
                    $profile_model->getProfile($data['WORK_DIRECTORY']));
                $profile_model->setWorkDirectoryConfigFile(
                    $data['WORK_DIRECTORY']);
                $data["MESSAGE"] =
                    tl('system_component_configure_work_dir_set');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');setTimeout(" .
                    "'window.location.href=window.location.href', 3000);";
            } elseif ($data['PROFILE']
                && strlen($data['WORK_DIRECTORY']) > 0) {
                if ($profile_model->makeWorkDirectory(
                    $data['WORK_DIRECTORY'])) {
                    // Fresh work directory: seed an initial profile.
                    $profile['DBMS'] = 'sqlite3';
                    $data['DBMS'] = 'sqlite3';
                    $profile['DB_NAME'] = 'default';
                    $data['DB_NAME'] = 'default';
                    $profile['USER_AGENT_SHORT'] =
                        tl('system_component_name_your_bot');
                    $data['USER_AGENT_SHORT'] =
                        $profile['USER_AGENT_SHORT'];
                    $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                    // Some servers (IIS) set HTTPS to "off" for plain
                    // http requests, so the key being set is not enough.
                    $is_https = !empty($_SERVER['HTTPS'])
                        && strtolower($_SERVER['HTTPS']) !== "off";
                    $http = $is_https ? "https://" : "http://";
                    $profile['NAME_SERVER'] =
                        $http . $_SERVER['SERVER_NAME'] . $uri;
                    $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                    $profile['AUTH_KEY'] =
                        crawlHash($data['WORK_DIRECTORY'] . time());
                    $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                    $profile['FIAT_SHAMIR_MODULUS'] =
                        generateFiatShamirModulus();
                    $robot_instance = str_replace(".", "_",
                        $_SERVER['SERVER_NAME']) . "-" . time();
                    $profile['ROBOT_INSTANCE'] = $robot_instance;
                    $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                    if ($profile_model->updateProfile(
                        $data['WORK_DIRECTORY'], array(), $profile)) {
                        if (defined('WORK_DIRECTORY')
                            && $data['WORK_DIRECTORY'] == WORK_DIRECTORY
                            || $profile_model->setWorkDirectoryConfigFile(
                                $data['WORK_DIRECTORY'])) {
                            $data["MESSAGE"] = tl(
                                'system_component_configure_work_profile_made');
                            $data['SCRIPT'] .=
                                "doMessage('<h1 class=\"red\" >" .
                                $data["MESSAGE"] . "</h1>');" .
                                "setTimeout('window.location.href= " .
                                "window.location.href', 3000);";
                            $data = array_merge($data,
                                $profile_model->getProfile(
                                    $data['WORK_DIRECTORY']));
                            $data['PROFILE'] = true;
                        } else {
                            $data['PROFILE'] = false;
                            $data["MESSAGE"] = tl(
                                'system_component_configure_no_set_config');
                            $data['SCRIPT'] .=
                                "doMessage('<h1 class=\"red\" >" .
                                $data["MESSAGE"] . "</h1>');" .
                                "setTimeout('window.location.href= " .
                                "window.location.href', 3000);";
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile(
                            $data['WORK_DIRECTORY']);
                        $data['PROFILE'] = false;
                        $data["MESSAGE"] = tl(
                            'system_component_configure_no_create_profile');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] .
                            "</h1>'); setTimeout('window.location.href=" .
                            "window.location.href', 3000);";
                    }
                } else {
                    $profile_model->setWorkDirectoryConfigFile(
                        $data['WORK_DIRECTORY']);
                    $data["MESSAGE"] =
                        tl('system_component_configure_work_dir_invalid');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>');" .
                        "setTimeout('window.location.href=" .
                        "window.location.href', 3000);";
                    $data['PROFILE'] = false;
                }
            } else {
                $profile_model->setWorkDirectoryConfigFile(
                    $data['WORK_DIRECTORY']);
                $data["MESSAGE"] =
                    tl('system_component_configure_work_dir_invalid');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');" .
                    "setTimeout('window.location.href=" .
                    "window.location.href', 3000);";
                $data['PROFILE'] = false;
            }
            break;

        case "profile":
            $parent->updateProfileFields($data, $profile,
                array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS',
                    'LANDING_PAGE'));
            $data['DEBUG_LEVEL'] = 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
            $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            $folder = APP_DIR . "/resources";
            if (!file_exists(APP_DIR) && !mkdir(APP_DIR)
                || !file_exists($folder) && !mkdir($folder)) {
                $data["MESSAGE"] =
                    tl('system_component_no_resource_folder');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>')";
                return $data;
            }
            $image_types = array('image/png', 'image/gif', 'image/jpeg',
                'image/x-icon');
            foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON',
                'SEARCHBAR_PATH') as $field) {
                if (!isset($_FILES[$field]['name'])
                    || $_FILES[$field]['name'] == "") {
                    continue;
                }
                // The search bar description must be xml, everything else
                // one of the accepted image types.
                if (!in_array($_FILES[$field]['type'], $image_types)
                    && $field != 'SEARCHBAR_PATH'
                    || $_FILES[$field]['type'] != 'text/xml'
                    && $field == 'SEARCHBAR_PATH') {
                    $data["MESSAGE"] =
                        tl('system_component_invalid_filetype');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                if ($_FILES[$field]['size'] > THUMB_SIZE) {
                    $data["MESSAGE"] = tl('system_component_file_too_big');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                $profile[$field] = array();
                $profile[$field]['name'] = $_FILES[$field]['name'];
                $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                $data[$field] = "./?c=resource&a=get&" .
                    "f=resources&n=" . $profile[$field]['name'];
            }
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile)) {
                $data['MESSAGE'] =
                    tl('system_component_configure_profile_change');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
                if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                    // The token is written into generated JavaScript, so
                    // it goes through the same cleaning as the other
                    // request fields used here.
                    // NOTE(review): assumes clean(..., "string") escapes
                    // quotes/markup and leaves valid tokens unchanged --
                    // confirm against the controller's clean().
                    $token = isset($_REQUEST[CSRF_TOKEN]) ?
                        $parent->clean($_REQUEST[CSRF_TOKEN], "string") :
                        "";
                    $data['SCRIPT'] .=
                        "setTimeout('window.location.href=\"" .
                        "?c=admin&a=configure&" . CSRF_TOKEN . "=" .
                        $token . "\"', 3*sec);";
                }
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
            }
            break;

        case "reset":
            $profile = array(
                'LANDING_PAGE' => false,
                'BACKGROUND_COLOR' => "#FFF",
                'BACKGROUND_IMAGE' => "",
                'FOREGROUND_COLOR' => "#FFF",
                'SIDEBAR_COLOR' => "#8A4",
                'TOPBAR_COLOR' => "#EEF",
                'LOGO' => "resources/yioop.png",
                'M_LOGO' => "resources/m-yioop.png",
                'FAVICON' => $base_url . "favicon.ico",
                'TIMEZONE' => 'America/Los_Angeles',
                'SESSION_NAME' => "yioopbiscuit",
                'CSRF_TOKEN' => "YIOOP_TOKEN",
                'AUXILIARY_CSS' => ""
            );
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            foreach ($old_profile as $key => $value) {
                $data[$key] = $value;
            }
            // The old background image name is blanked for the update
            // call and restored afterwards so its file can be removed.
            $tmp_image = $old_profile['BACKGROUND_IMAGE'];
            $old_profile['BACKGROUND_IMAGE'] = "";
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile, true)) {
                $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                $resource_fields = array('BACKGROUND_IMAGE', 'LOGO',
                    'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH');
                foreach ($profile as $key => $value) {
                    $data[$key] = $value;
                    if (in_array($key, $resource_fields)
                        && $old_profile[$key] != "") {
                        $resource_name = APP_DIR . "/resources/" .
                            $old_profile[$key];
                        if (file_exists($resource_name)) {
                            unlink($resource_name);
                        }
                    }
                }
                $data['MESSAGE'] =
                    tl('system_component_configure_reset_completed');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
            }
            break;

        default:
            if (isset($data['WORK_DIRECTORY'])
                && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                $data = array_merge($data,
                    $profile_model->getProfile($data['WORK_DIRECTORY']));
            } else {
                $data['WORK_DIRECTORY'] = "";
                $data['PROFILE'] = false;
            }
    }

    $data['advanced'] = "false";
    if ($data['PROFILE']) {
        $locale_tag = getLocaleTag();
        $not_null_fields = array(
            'LOGO' => "resources/yioop.png",
            'M_LOGO' => "resources/m-yioop.png",
            'FAVICON' => $base_url . "favicon.ico",
            'TIMEZONE' => 'America/Los_Angeles',
            'SESSION_NAME' => "yioopbiscuit",
            'CSRF_TOKEN' => "YIOOP_TOKEN"
        );
        foreach ($not_null_fields as $field => $default) {
            // empty() also covers fields the profile does not contain
            if (empty($data[$field])) {
                $data[$field] = $default;
            }
        }
        if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
            $robot_description = substr(
                $parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"),
                0, MAX_GROUP_PAGE_LEN);
            $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot",
                $robot_description, $locale_tag, "", "", "", "");
        }
        $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID,
            "bot", $locale_tag, "edit");
        $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ?
            $robot_info["PAGE"] : tl('system_component_describe_robot');
        if (isset($_REQUEST['advanced'])
            && $_REQUEST['advanced'] == 'true') {
            $data['advanced'] = "true";
        }
        // Closing marker stays in column 0 so the heredoc also parses on
        // PHP versions older than 7.3.
        $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true') ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
    }
    $data['SCRIPT'] .= "\nelt('locale').onchange = " .
        "function () { elt('configureProfileForm').submit();};\n";
    return $data;
}
/**
 * Implements post processing of recipes.
 *
 * Documents matching "recipe:all" in the given index are fetched in
 * batches, their "||"-separated descriptions are split into ingredients
 * and reduced to ingredient names, the recipes are clustered with
 * kruskalClustering() using a pairwise ingredient distance, and the
 * clustered recipes are written to a new index shard that is added to the
 * crawl's index archive bundle.
 *
 * @param string $index_name index name of the current crawl.
 */
function postProcessing($index_name)
{
    global $INDEXING_PLUGINS;
    if (!class_exists("SplHeap")) {
        crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
        crawlLog("...Aborting plugin");
        return;
    }
    $locale_tag = guessLocale();
    setLocaleObject($locale_tag);
    $search_controller = new SearchController($INDEXING_PLUGINS);
    $query = "recipe:all i:{$index_name}";
    crawlLog("...Running Recipe Plugin!");
    crawlLog("...Finding docs tagged as recipes.");
    $more_docs = true;
    $raw_recipes = array();
    $limit = 0;
    $num = 100;
    while ($more_docs) {
        $results = @$search_controller->queryRequest($query, $num, $limit,
            1, $index_name);
        // Recomputed for every batch so a batch without pages never
        // reuses the size of the previous one.
        $num_results = 0;
        if (isset($results["PAGES"])) {
            $num_results = count($results["PAGES"]);
        }
        if ($num_results > 0) {
            $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
        }
        crawlLog("Scanning recipes {$limit} through " .
            ($limit + $num_results) . ".");
        $limit += $num_results;
        if ($num_results == 0 || !isset($results["SAVE_POINT"])) {
            // An empty batch cannot advance $limit, so asking again
            // would repeat the same query forever.
            $more_docs = false;
        } else {
            // Finished once every save point is -1
            $end = true;
            foreach ($results["SAVE_POINT"] as $save_point) {
                if ($save_point != -1) {
                    $end = false;
                }
            }
            if ($end) {
                $more_docs = false;
            }
        }
    }
    crawlLog("...Clustering.");
    // only cluster if would make more than one cluster
    if (count($raw_recipes) * CLUSTER_RATIO > 1) {
        // each entry: 0 => title, 1 => ingredients, 2 => url hash,
        // 3 => the raw search result
        $recipes = array();
        foreach ($raw_recipes as $raw_recipe) {
            $ingredients = explode("||", $raw_recipe[self::DESCRIPTION]);
            if (is_array($ingredients) && count($ingredients) > 1) {
                $recipes[] = array(
                    $raw_recipe[self::TITLE],
                    $ingredients,
                    crawlHash($raw_recipe[self::URL]),
                    $raw_recipe
                );
            }
        }
        // Reduce every ingredient line to its ingredient name; drop empty
        // lines, lines ending in ":" and lines no name is found for.
        foreach ($recipes as $key => $recipe) {
            foreach ($recipe[1] as $index => $ingredient) {
                $ingredient_name = "";
                if (strlen($ingredient) != 0
                    && substr($ingredient, -1) != ":") {
                    $ingredient_name =
                        $this->getIngredientName((string) $ingredient);
                }
                if (strlen($ingredient_name) != 0) {
                    $recipe[1][$index] = $ingredient_name;
                } else {
                    unset($recipe[1][$index]);
                }
            }
            $recipes[$key] = $recipe;
        }
        $count = count($recipes);
        $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
            'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
            'cream', 'soda', 'honey', 'powder', 'sauce', 'water',
            'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract',
            'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin',
            'ginger', 'basil', 'oregano', 'cinammon', 'cumin',
            'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens',
            'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix',
            'coffee', 'cookies', 'lime', 'chillies', 'cilantro',
            'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine',
            'cornmeal', 'nonstickspray');
        // Initialised up front so they exist even when fewer than two
        // recipes survived the filtering above.
        $weights = array();
        $distinct_ingredients = array();
        $doc_keys = array();
        $recipes_summary = array();
        for ($i = 0; $i < $count; $i++) {
            $recipe1_main_ingredient = "";
            $recipe1 = $recipes[$i][1];
            $recipe_name = $recipes[$i][0];
            $recipe1_title = strtolower($recipes[$i][0]);
            $distinct_ingredients[$recipe_name] = $recipes[$i][1];
            $doc_keys[$recipe_name] = $recipes[$i][2];
            $recipes_summary[$recipe_name] = $recipes[$i][3];
            for ($j = $i + 1; $j < $count; $j++) {
                $recipe2_main_ingredient = "";
                $recipe2 = $recipes[$j][1];
                $recipe2_title = strtolower($recipes[$j][0]);
                // 0/1 presence vectors over the union of both recipes'
                // ingredients
                $vector_array =
                    array_unique(array_merge($recipe1, $recipe2));
                sort($vector_array);
                $recipe1_vector = array_fill_keys($vector_array, 0);
                $recipe2_vector = array_fill_keys($vector_array, 0);
                // A non-basic ingredient that occurs in the title is
                // taken as the recipe's main ingredient.
                foreach ($recipe1 as $ingredient) {
                    if ($ingredient != ""
                        && !in_array($ingredient, $basic_ingredients)
                        && strstr($recipe1_title, $ingredient)) {
                        $recipe1_main_ingredient = $ingredient;
                    }
                    $recipe1_vector[$ingredient] = 1;
                }
                foreach ($recipe2 as $ingredient) {
                    if ($ingredient != ""
                        && !in_array($ingredient, $basic_ingredients)
                        && strstr($recipe2_title, $ingredient)) {
                        $recipe2_main_ingredient = $ingredient;
                    }
                    $recipe2_vector[$ingredient] = 1;
                }
                $edge_weight = 0;
                // one more than the number of ingredients that only one
                // of the two recipes has
                $matches = 1;
                foreach ($vector_array as $vector) {
                    $diff = $recipe1_vector[$vector] -
                        $recipe2_vector[$vector];
                    if (abs($diff) == 1) {
                        $matches += 1;
                    }
                    $edge_weight += $diff * $diff;
                }
                // Different main ingredients push the pair far apart
                $main_ingredient_match = 1;
                if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                    $main_ingredient_match = 1000;
                }
                $edge_weight = sqrt($edge_weight) * $matches *
                    $main_ingredient_match;
                $weights[] = array($recipes[$i][0], $recipes[$j][0],
                    $edge_weight);
            }
        }
        crawlLog("...Making new shard with clustered recipes as docs.");
        $clusters = kruskalClustering($weights, $count,
            $distinct_ingredients);
        $index_shard = new IndexShard("cluster_shard");
        $word_lists = array();
        $recipe_sites = array();
        $summary_fields = array(self::URL, self::TITLE, self::DESCRIPTION,
            self::TIMESTAMP, self::ENCODING, self::HASH, self::TYPE,
            self::HTTP_CODE);
        foreach ($clusters as $cluster) {
            // Besides its numerically indexed recipes each cluster holds
            // an "ingredient" entry, hence the "- 1".
            $num_recipes = count($cluster) - 1;
            for ($i = 0; $i < $num_recipes; $i++) {
                $recipe = $cluster[$i];
                $summary = array();
                foreach ($summary_fields as $field) {
                    $summary[$field] = $recipes_summary[$recipe][$field];
                }
                $hash_rhost = "r" . substr(crawlHash(
                    UrlParser::getHost($summary[self::URL]) . "/", true),
                    1);
                $doc_keys[$recipe] = crawlHash($summary[self::URL], true) .
                    $summary[self::HASH] . $hash_rhost;
                $recipe_sites[] = $summary;
                $meta_ids = array(
                    "ingredient:" . trim($cluster["ingredient"])
                );
                crawlLog("ingredient:" . $cluster["ingredient"]);
                if (!$index_shard->addDocumentWords($doc_keys[$recipe],
                    self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true,
                    false)) {
                    crawlLog("Problem inserting recipe: " .
                        $summary[self::TITLE]);
                }
            }
        }
        // Round trip through save()/load() before adding to the bundle
        $shard_string = $index_shard->save(true);
        $index_shard = IndexShard::load("cluster_shard", $shard_string);
        unset($shard_string);
        crawlLog("...Adding recipe shard to index archive bundle");
        $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name .
            $index_name;
        $index_archive = new IndexArchiveBundle($dir, false);
        if ($index_shard->word_docs_packed) {
            $index_shard->unpackWordDocs();
        }
        $generation = $index_archive->initGenerationToAdd($index_shard);
        crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
        $index_archive->addPages($generation, self::SUMMARY_OFFSET,
            $recipe_sites, 0);
        // Map each document key to the summary offset read back from
        // $recipe_sites after addPages().
        $summary_offsets = array();
        foreach ($recipe_sites as $site) {
            $hash = crawlHash($site[self::URL], true) . $site[self::HASH] .
                "r" . substr(crawlHash(
                UrlParser::getHost($site[self::URL]) . "/", true), 1);
            $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
        }
        $index_shard->changeDocumentOffsets($summary_offsets);
        $index_archive->addIndexData($index_shard);
        $index_archive->saveAndAddCurrentShardDictionary();
        $index_archive->dictionary->mergeAllTiers();
        $this->db->setWorldPermissionsRecursive($dir);
        crawlLog("...Recipe plugin finished.");
    }
}
} /** Calculate base directory of script @ignore*/ define("BASE_DIR", substr(dirname(realpath($_SERVER['PHP_SELF'])), 0, -strlen("/configs"))); /** Load in global configuration settings */ require_once BASE_DIR . '/configs/config.php'; /** Loads common constants for web crawling*/ require_once BASE_DIR . "/lib/crawl_constants.php"; /** Loads common constants for web crawling*/ require_once BASE_DIR . "/lib/locale_functions.php"; /** Loads common utility functions*/ require_once BASE_DIR . "/lib/utility.php"; mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); $locale_tag = guessLocale(); $locale = NULL; setLocaleObject($locale_tag); /** * This tool is essentially a set of views for the * logic that is done in admin_controller.php */ require_once BASE_DIR . "/controllers/admin_controller.php"; /** * Provides a command-line interface way to configure a Yioop Instance. * Unlike the web interface this interface is English-only. */ class ConfigureTool { /** * Used to hold an AdminController object used to manipulate the * Yioop configuration * @var object
/**
 * Creates a web archive iterator with the given parameters.
 *
 * Stores both timestamps, prepares the query "site:any m:<mix timestamp>"
 * and a SearchController, creates the result archive directory if it is
 * missing, and then restores the saved checkpoint when that directory
 * contains an iterate_status.txt file, or resets the iterator otherwise.
 *
 * @param string $mix_timestamp timestamp of the crawl mix to
 *     iterate over the pages of
 * @param string $result_timestamp timestamp of the web archive bundle
 *     results are being stored in
 */
function __construct($mix_timestamp, $result_timestamp)
{
    global $INDEXING_PLUGINS;

    setLocaleObject(getLocaleTag());

    $this->mix_timestamp = $mix_timestamp;
    $this->result_timestamp = $result_timestamp;
    $this->query = "site:any m:" . $mix_timestamp;
    $this->searchController = new SearchController($INDEXING_PLUGINS);

    $archive_dir = $this->getArchiveName($result_timestamp);
    if (!file_exists($archive_dir)) {
        mkdir($archive_dir);
    }

    $status_file = $archive_dir . "/iterate_status.txt";
    if (!file_exists($status_file)) {
        // no checkpoint has been written yet
        $this->reset();
        return;
    }
    $this->restoreCheckpoint();
}
Loads common constants for web crawling -- we use these constants to get data out of the search response we get back. */ require_once BASE_DIR . "/lib/crawl_constants.php"; /**Load search controller class needed to get search results*/ require_once BASE_DIR . "/controllers/search_controller.php"; /* * Set-up multi-byte string handling to use UTF-8 */ mb_internal_encoding("UTF-8"); mb_regex_encoding("UTF-8"); /**Cached pages part of search API needs global locale functions*/ require_once BASE_DIR . "/lib/locale_functions.php"; $locale = NULL; setLocaleObject("en-US"); /** * If the index being used made use of any indexing plugins, we can * declare them here. */ $indexing_plugins = array(); $controller = new SearchController($indexing_plugins); // ###### /* Now we can do queries! First do a simple search on art and print the results */ echo "\n\n\nAn example of a query request with the search API:\n"; $query = "art i:1317414322"; /* i:1317414322 is the timestamp of the index to use. API requires that a default index be set even though the query might specify to use a different one. The query string we pass to the
/**
 * Removes the state kept while iterating through a crawl mix: the query
 * save point held by the search controller and, if present, the archive
 * iterator directory under the work directory's schedules folder.
 *
 * @param int $timestamp The timestamp of the crawl mix
 */
function deleteCrawlMixIteratorState($timestamp)
{
    global $INDEXING_PLUGINS;

    setLocaleObject(getLocaleTag());

    $controller = new SearchController($INDEXING_PLUGINS);
    $controller->clearQuerySavepoint($timestamp);

    $iterator_dir = WORK_DIRECTORY . "/schedules/" .
        self::name_archive_iterator . $timestamp;
    if (!file_exists($iterator_dir)) {
        return;
    }
    $this->db->unlinkRecursive($iterator_dir);
}