Ejemplo n.º 1
0
 /**
  * Sets up the available per-page and language options.
  * If handling data sent from a form, it stores cleaned versions of
  * the number of results per page and language options into a session.
  *
  * Request values are only written to the session when the CSRF token
  * check succeeds. Once everything has been gathered the "settings" view
  * is displayed.
  */
 function processRequest()
 {
     $data = array();
     $view = "settings";
     $changed_settings_flag = false;
     $crawl_model = $this->model("crawl");
     if (isset($_SESSION['USER_ID'])) {
         $user = $_SESSION['USER_ID'];
         $data['ADMIN'] = 1;
     } else {
         // users without a session user id are identified by their address
         $user = $_SERVER['REMOTE_ADDR'];
     }
     $data[CSRF_TOKEN] = $this->generateCSRFToken($user);
     $token_okay = $this->checkCSRFToken(CSRF_TOKEN, $user);

     // start from an empty list so the lookups below are safe even when
     // the locale model returns no languages
     $data['LANGUAGES'] = array();
     $languages = $this->model("locale")->getLocaleList();
     foreach ($languages as $language) {
         $data['LANGUAGES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME'];
     }
     if ($token_okay && isset($_REQUEST['lang']) && in_array($_REQUEST['lang'], array_keys($data['LANGUAGES']))) {
         $_SESSION['l'] = $_REQUEST['lang'];
         setLocaleObject($_SESSION['l']);
         $changed_settings_flag = true;
     }
     $data['LOCALE_TAG'] = getLocaleTag();

     // offered page sizes are 1x, 2x, 5x and 10x the configured default
     $n = NUM_RESULTS_PER_PAGE;
     $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n);
     // loose in_array on purpose: request values are strings, keys are ints
     if ($token_okay && isset($_REQUEST['perpage']) && in_array($_REQUEST['perpage'], array_keys($data['PER_PAGE']))) {
         $_SESSION['MAX_PAGES_TO_SHOW'] = $_REQUEST['perpage'];
         $changed_settings_flag = true;
     }
     $data['PER_PAGE_SELECTED'] = isset($_SESSION['MAX_PAGES_TO_SHOW']) ? $_SESSION['MAX_PAGES_TO_SHOW'] : NUM_RESULTS_PER_PAGE;

     // an unchecked checkbox is not submitted, so the presence of perpage
     // is what signals that the settings form was posted
     if ($token_okay && isset($_REQUEST['perpage'])) {
         $_SESSION['OPEN_IN_TABS'] = isset($_REQUEST['open_in_tabs']);
     }
     $data['OPEN_IN_TABS'] = isset($_SESSION['OPEN_IN_TABS']) ? $_SESSION['OPEN_IN_TABS'] : false;

     // selectable indexes: crawls first, then the user's crawl mixes
     $machine_urls = $this->model("machine")->getQueueServerUrls();
     $crawls = $crawl_model->getCrawlList(false, true, $machine_urls, true);
     $data['CRAWLS'] = array();
     foreach ($crawls as $crawl) {
         $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION'] . " ... " . $crawl['COUNT'] . " urls";
     }
     $mixes = $crawl_model->getMixList($user);
     foreach ($mixes as $mix) {
         $data['CRAWLS'][$mix['TIMESTAMP']] = $mix['NAME'] . " ... " . tl('settings_controller_crawl_mix');
     }
     $crawl_stamps = array_keys($data['CRAWLS']);
     if ($token_okay) {
         // Always make the call, then OR its result in, so that a language
         // or per-page change recorded above is not lost when this returns
         // false.
         $index_changed = $this->loggedInChangeSettings($data);
         $changed_settings_flag = $changed_settings_flag || $index_changed;
     } else {
         if (isset($_REQUEST['its']) && in_array($_REQUEST['its'], $crawl_stamps)) {
             $data['its'] = $_REQUEST['its'];
         } else {
             $data['its'] = $crawl_model->getCurrentIndexDatabaseName();
         }
     }
     if ($changed_settings_flag) {
         $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('settings_controller_settings_saved') . "</h1>')";
         // only users identified by a user id get their session stored
         if ($user != $_SERVER['REMOTE_ADDR']) {
             $this->model("user")->setUserSession($user, $_SESSION);
         }
     }
     $this->displayView($view, $data);
 }
Ejemplo n.º 2
0
 /**
  * Entry point of the QueryTool: runs the query given on the command line
  * and prints each result followed by some statistics about the query.
  *
  * Arguments are read from the global $argv: $argv[1] is the query
  * (required, otherwise the usage message is shown), $argv[2] the number of
  * results per page (10 if absent) and $argv[3] the first result to return
  * (0 if absent).
  */
 function start()
 {
     global $argv, $INDEXING_PLUGINS;
     if (!isset($argv[1])) {
         $this->usageMessageAndExit();
     }
     $query = $argv[1];
     $results_per_page = 10;
     if (isset($argv[2])) {
         $results_per_page = $argv[2];
     }
     $limit = 0;
     if (isset($argv[3])) {
         $limit = $argv[3];
     }
     setLocaleObject(getLocaleTag());
     $start_time = microtime();
     $controller = new SearchController($INDEXING_PLUGINS);
     $data = $controller->queryRequest($query, $results_per_page, $limit);
     $pages = isset($data['PAGES']) ? $data['PAGES'] : array();
     $separator = "============\n";
     // label => summary field, printed one per line in this order
     $score_fields = array(
         "Rank" => self::DOC_RANK,
         "Relevance" => self::RELEVANCE,
         "Proximity" => self::PROXIMITY,
         "Score" => self::SCORE,
     );
     foreach ($pages as $page) {
         echo $separator;
         echo "TITLE: " . trim($page[self::TITLE]) . "\n";
         echo "URL: " . trim($page[self::URL]) . "\n";
         echo "IPs: ";
         $addresses = isset($page[self::IP_ADDRESSES]) ? $page[self::IP_ADDRESSES] : array();
         foreach ($addresses as $address) {
             echo $address . " ";
         }
         echo "\n";
         echo "DESCRIPTION: " . wordwrap(trim($page[self::DESCRIPTION])) . "\n";
         foreach ($score_fields as $label => $field) {
             echo $label . ": " . $page[$field] . "\n";
         }
         echo $separator . "\n";
     }
     // elapsed time is taken after the results have been printed
     $elapsed_time = changeInMicrotime($start_time);
     echo "QUERY STATISTICS\n";
     echo $separator;
     echo "ELAPSED TIME: " . $elapsed_time . "\n";
     if (isset($data['LIMIT'])) {
         echo "LOW: " . $data['LIMIT'] . "\n";
     }
     if (isset($data['HIGH'])) {
         $high = min($data['TOTAL_ROWS'], $data['LIMIT'] + $data['RESULTS_PER_PAGE']);
         echo "HIGH: " . $high . "\n";
     }
     if (isset($data['TOTAL_ROWS'])) {
         echo "TOTAL ROWS: " . $data['TOTAL_ROWS'] . "\n";
     }
     if (isset($data['ERROR'])) {
         echo $data['ERROR'] . "\n";
     }
 }
Ejemplo n.º 3
0
    /**
     * Responsible for handling admin request related to the configure activity
     *
     * The configure activity allows a user to set the work directory for
     * storing data local to this SeekQuarry/Yioop instance. It also allows one
     * to set the default language of the installation, dbms info, robot info,
     * test info, as well as which machine acts as the queue server.
     *
     * The sub activity is chosen by $_REQUEST['arg']:
     *  - "directory": use the profile already in the work directory, or
     *    create the work directory together with a default profile;
     *  - "profile": save the submitted profile fields and uploaded
     *    resource files;
     *  - "reset": restore the appearance related fields to their defaults;
     *  - anything else: load the current profile if one exists.
     *
     * @return array $data fields for available language, dbms, etc as well as
     *     results of processing sub activity if any
     */
    function configure()
    {
        $parent = $this->parent;
        $profile_model = $parent->model("profile");
        $group_model = $parent->model("group");
        $data = array();
        $profile = array();
        $data['SYSTEM_CHECK'] = $this->systemCheck();
        $languages = $parent->model("locale")->getLocaleList();
        foreach ($languages as $language) {
            $data['LANGUAGES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME'];
        }
        if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
            $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
            $profile['DEFAULT_LOCALE'] = $data['lang'];
            setLocaleObject($data['lang']);
        }
        $data["ELEMENT"] = "configure";
        $data['SCRIPT'] = "";
        $data['PROFILE'] = false;
        // true when the already configured work directory should be re-used
        // to rebuild the profile
        $fix_name_server = defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER;
        if (isset($_REQUEST['WORK_DIRECTORY']) || $fix_name_server) {
            if (!isset($_REQUEST['WORK_DIRECTORY'])) {
                // only reachable when $fix_name_server holds: behave as if
                // the configured directory had been submitted and drop the
                // old profile so it gets regenerated
                $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $_REQUEST['arg'] = "directory";
                @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
            }
            $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
            $data['PROFILE'] = true;
            // substr() rather than $dir[0] / $dir[1]: an empty or one
            // character path must simply be rejected, not read past the end
            // of the string
            if (strstr(PHP_OS, "WIN")) {
                //convert to forward slashes so consistent with rest of code
                $dir = str_replace("\\", "/", $dir);
                if (substr($dir, 0, 1) != "/" && substr($dir, 1, 1) != ":") {
                    $data['PROFILE'] = false;
                }
            } else {
                if (substr($dir, 0, 1) != "/") {
                    $data['PROFILE'] = false;
                }
            }
            if ($data['PROFILE'] == false) {
                $data["MESSAGE"] = tl('system_component_configure_use_absolute_path');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            // the work directory path must not contain the code's base
            // directory path
            if (strstr($dir . "/", BASE_DIR . "/")) {
                $data['PROFILE'] = false;
                $data["MESSAGE"] = tl('system_component_configure_configure_diff_base_dir');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            $data['WORK_DIRECTORY'] = $dir;
        } else {
            // NOTE(review): WORK_DIRECTORY . "../" has no separator before
            // "..", so it probably does not name the parent directory --
            // confirm the intent before changing it.
            if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 && strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 && (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
                $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $data['PROFILE'] = true;
            }
        }
        $arg = "";
        if (isset($_REQUEST['arg'])) {
            $arg = $_REQUEST['arg'];
        }
        switch ($arg) {
            case "directory":
                if (!isset($data['WORK_DIRECTORY'])) {
                    break;
                }
                if ($data['PROFILE'] && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    // a profile already exists there: just point at it
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                    $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                    $data["MESSAGE"] = tl('system_component_configure_work_dir_set');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');setTimeout(" . "'window.location.href=window.location.href', 3000);";
                } else {
                    if ($data['PROFILE'] && strlen($data['WORK_DIRECTORY']) > 0) {
                        if ($profile_model->makeWorkDirectory($data['WORK_DIRECTORY'])) {
                            // fresh work directory: seed a default profile
                            $profile['DBMS'] = 'sqlite3';
                            $data['DBMS'] = 'sqlite3';
                            $profile['DB_NAME'] = 'default';
                            $data['DB_NAME'] = 'default';
                            $profile['USER_AGENT_SHORT'] = tl('system_component_name_your_bot');
                            $data['USER_AGENT_SHORT'] = $profile['USER_AGENT_SHORT'];
                            $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                            $http = isset($_SERVER['HTTPS']) ? "https://" : "http://";
                            $profile['NAME_SERVER'] = $http . $_SERVER['SERVER_NAME'] . $uri;
                            $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                            $profile['AUTH_KEY'] = crawlHash($data['WORK_DIRECTORY'] . time());
                            $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                            $profile['FIAT_SHAMIR_MODULUS'] = generateFiatShamirModulus();
                            $robot_instance = str_replace(".", "_", $_SERVER['SERVER_NAME']) . "-" . time();
                            $profile['ROBOT_INSTANCE'] = $robot_instance;
                            $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                            if ($profile_model->updateProfile($data['WORK_DIRECTORY'], array(), $profile)) {
                                if ((defined('WORK_DIRECTORY') && $data['WORK_DIRECTORY'] == WORK_DIRECTORY) || $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY'])) {
                                    $data["MESSAGE"] = tl('system_component_configure_work_profile_made');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                                    $data['PROFILE'] = true;
                                } else {
                                    $data['PROFILE'] = false;
                                    $data["MESSAGE"] = tl('system_component_configure_no_set_config');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                }
                            } else {
                                $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                                $data['PROFILE'] = false;
                                $data["MESSAGE"] = tl('system_component_configure_no_create_profile');
                                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>'); setTimeout('window.location.href=" . "window.location.href', 3000);";
                            }
                        } else {
                            $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                            $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                            $data['PROFILE'] = false;
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                        $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                        $data['PROFILE'] = false;
                    }
                }
                break;
            case "profile":
                $parent->updateProfileFields($data, $profile, array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS', 'LANDING_PAGE'));
                // the debug level is a bit mask built from the check boxes
                $data['DEBUG_LEVEL'] = 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
                $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                $folder = APP_DIR . "/resources";
                if ((!file_exists(APP_DIR) && !mkdir(APP_DIR)) || (!file_exists($folder) && !mkdir($folder))) {
                    $data["MESSAGE"] = tl('system_component_no_resource_folder');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH') as $field) {
                    if (isset($_FILES[$field]['name']) && $_FILES[$field]['name'] != "") {
                        // the search bar description must be xml, every
                        // other upload must be an image
                        $is_search_bar = ($field == 'SEARCHBAR_PATH');
                        $is_image = in_array($_FILES[$field]['type'], array('image/png', 'image/gif', 'image/jpeg', 'image/x-icon'));
                        if ((!$is_image && !$is_search_bar) || ($_FILES[$field]['type'] != 'text/xml' && $is_search_bar)) {
                            $data["MESSAGE"] = tl('system_component_invalid_filetype');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        if ($_FILES[$field]['size'] > THUMB_SIZE) {
                            $data["MESSAGE"] = tl('system_component_file_too_big');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        $profile[$field] = array();
                        $profile[$field]['name'] = $_FILES[$field]['name'];
                        $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                        $data[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $profile[$field]['name'];
                    }
                }
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile)) {
                    $data['MESSAGE'] = tl('system_component_configure_profile_change');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                    if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                        // The token comes straight from the request and is
                        // written into generated JavaScript, so url-encode
                        // it rather than echoing it raw.
                        $token = "";
                        if (isset($_REQUEST[CSRF_TOKEN]) && is_string($_REQUEST[CSRF_TOKEN])) {
                            $token = rawurlencode($_REQUEST[CSRF_TOKEN]);
                        }
                        $data['SCRIPT'] .= "setTimeout('window.location.href=\"" . "?c=admin&amp;a=configure&amp;" . CSRF_TOKEN . "=" . $token . "\"', 3*sec);";
                    }
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                }
                break;
            case "reset":
                $base_url = NAME_SERVER;
                if (defined("BASE_URL")) {
                    $base_url = BASE_URL;
                }
                $profile = array('LANDING_PAGE' => false, 'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                foreach ($old_profile as $key => $value) {
                    $data[$key] = $value;
                }
                // the old background image is blanked for the update and
                // restored afterwards so its file can be removed below
                $tmp_image = $old_profile['BACKGROUND_IMAGE'];
                $old_profile['BACKGROUND_IMAGE'] = "";
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile, true)) {
                    $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                    foreach ($profile as $key => $value) {
                        $data[$key] = $value;
                        if (in_array($key, array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH')) && $old_profile[$key] != "") {
                            $resource_name = APP_DIR . "/resources/" . $old_profile[$key];
                            if (file_exists($resource_name)) {
                                unlink($resource_name);
                            }
                        }
                    }
                    $data['MESSAGE'] = tl('system_component_configure_reset_completed');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                }
                break;
            default:
                if (isset($data['WORK_DIRECTORY']) && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                } else {
                    $data['WORK_DIRECTORY'] = "";
                    $data['PROFILE'] = false;
                }
        }
        $data['advanced'] = "false";
        if ($data['PROFILE']) {
            $locale_tag = getLocaleTag();
            // same fallback as the "reset" case uses, additionally guarded
            // so an undefined constant cannot abort the page
            $favicon_base = "";
            if (defined("BASE_URL")) {
                $favicon_base = BASE_URL;
            } else if (defined("NAME_SERVER")) {
                $favicon_base = NAME_SERVER;
            }
            $not_null_fields = array('LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $favicon_base . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN");
            foreach ($not_null_fields as $field => $default) {
                // empty() also covers fields missing from the profile
                if (empty($data[$field])) {
                    $data[$field] = $default;
                }
            }
            if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
                $robot_description = substr($parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"), 0, MAX_GROUP_PAGE_LEN);
                $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot", $robot_description, $locale_tag, "", "", "", "");
            }
            $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID, "bot", $locale_tag, "edit");
            $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ? $robot_info["PAGE"] : tl('system_component_describe_robot');
            if (isset($_REQUEST['advanced']) && $_REQUEST['advanced'] == 'true') {
                $data['advanced'] = "true";
            }
            $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true')
            ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
        }
        $data['SCRIPT'] .= "\nelt('locale').onchange = " . "function () { elt('configureProfileForm').submit();};\n";
        return $data;
    }
Ejemplo n.º 4
0
 /**
  * Implements post processing of recipes. Recipes are extracted,
  * ingredients are scrubbed and recipes are clustered. The clustered
  * recipes are added back to the index.
  *
  * Steps: (1) page through all documents matching "recipe:all" in the
  * given index; (2) reduce each recipe's "||" separated description to a
  * list of ingredient names; (3) compute a weight for every pair of
  * recipes and cluster them with kruskalClustering(); (4) write the
  * clustered recipes into a new index shard, tagged with an
  * "ingredient:" meta word, and add that shard to the index archive
  * bundle of the crawl.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         // Reset on every pass: a batch without PAGES must count as zero
         // results instead of reusing the previous batch's count (or an
         // undefined variable on the first pass).
         $num_results = 0;
         if (isset($results["PAGES"])) {
             $num_results = count($results["PAGES"]);
             if ($num_results > 0) {
                 $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
             }
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         // keep going only while some save point is not yet -1
         $more_docs = false;
         if (isset($results["SAVE_POINT"])) {
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $more_docs = true;
                     break;
                 }
             }
         }
     }
     crawlLog("...Clustering.");
     // only cluster if would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         // $recipes[$i] = array(title, ingredient list, url hash, raw page)
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         // scrub the ingredient lists: drop empty entries and entries
         // ending in ":", keep only the name getIngredientName() returns
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         // ingredients never treated as a recipe's main ingredient; the
         // misspelled 'cinammon' is kept next to the correct spelling so
         // anything that matched before still matches
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cinnamon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         // A recipe's main ingredient is the last non-basic ingredient that
         // occurs in its lower-cased title. It depends on the recipe alone,
         // so compute it once per recipe rather than once per pair.
         $main_ingredients = array();
         for ($i = 0; $i < $count; $i++) {
             $main_ingredients[$i] = "";
             $title = strtolower($recipes[$i][0]);
             foreach ($recipes[$i][1] as $ingredient) {
                 if ($ingredient != "" && !in_array($ingredient, $basic_ingredients) && strstr($title, $ingredient)) {
                     $main_ingredients[$i] = $ingredient;
                 }
             }
         }
         // initialised up front so they are defined even when there are
         // fewer than two usable recipes
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         $k = 0;
         for ($i = 0; $i < $count; $i++) {
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2 = $recipes[$j][1];
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 // 0/1 vectors over the union of both ingredient lists
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 // starts at 1 and counts the ingredients on which the two
                 // recipes differ
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 // recipes with different main ingredients are pushed far
                 // apart
                 $main_ingredient_match = 1;
                 if ($main_ingredients[$i] != $main_ingredients[$j]) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             // the "ingredient" entry is counted too, hence the - 1 below
             $cluster_size = count($cluster);
             for ($i = 0; $i < $cluster_size - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 // doc key = url hash . page hash . "r" . host hash
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         // round-trip the shard through its saved string form
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
         // SUMMARY_OFFSET is read from each site below, so addPages() is
         // presumably what fills it in -- verify against IndexArchiveBundle
         $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
Ejemplo n.º 5
0
}
/**
 * Calculate base directory of script: the directory containing this
 * script with the trailing "/configs" (8 characters) removed.
 * @ignore
 */
define("BASE_DIR", substr(dirname(realpath($_SERVER['PHP_SELF'])), 0, -strlen("/configs")));
/** Load in global configuration settings */
require_once BASE_DIR . '/configs/config.php';
/** Loads common constants for web crawling*/
require_once BASE_DIR . "/lib/crawl_constants.php";
/** Loads the locale functions (presumably guessLocale and setLocaleObject, used below)*/
require_once BASE_DIR . "/lib/locale_functions.php";
/** Loads common utility functions*/
require_once BASE_DIR . "/lib/utility.php";
/* Use UTF-8 for the multi-byte string and regex functions */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/* Guess the locale to use and make it the active one */
$locale_tag = guessLocale();
// NOTE(review): $locale looks like a global that setLocaleObject fills in -- confirm
$locale = NULL;
setLocaleObject($locale_tag);
/**
 * This tool is essentially a set of views for the
 * logic that is done in admin_controller.php
 */
require_once BASE_DIR . "/controllers/admin_controller.php";
/**
 * Provides a command-line interface way to configure a Yioop Instance.
 * Unlike the web interface this interface is English-only.
 */
class ConfigureTool
{
    /**
     * Used to hold an AdminController object used to manipulate the
     * Yioop configuration
     * @var object
Ejemplo n.º 6
0
 /**
  * Sets up an iterator over the pages of a crawl mix.
  *
  * The archive directory for the results is created if it does not exist
  * yet. If that directory already holds an iterate_status.txt file the
  * iterator resumes from the stored checkpoint, otherwise it is reset.
  *
  * @param string $mix_timestamp timestamp of the crawl mix to
  *     iterate over the pages of
  * @param string $result_timestamp timestamp of the web archive bundle
  *     results are being stored in
  */
 function __construct($mix_timestamp, $result_timestamp)
 {
     global $INDEXING_PLUGINS;
     setLocaleObject(getLocaleTag());
     $this->mix_timestamp = $mix_timestamp;
     $this->result_timestamp = $result_timestamp;
     // query used to fetch the pages of the mix
     $this->query = "site:any m:{$mix_timestamp}";
     $this->searchController = new SearchController($INDEXING_PLUGINS);
     $archive_dir = $this->getArchiveName($result_timestamp);
     if (!file_exists($archive_dir)) {
         mkdir($archive_dir);
     }
     $status_file = $archive_dir . "/iterate_status.txt";
     if (!file_exists($status_file)) {
         // no checkpoint to resume from
         $this->reset();
         return;
     }
     $this->restoreCheckpoint();
 }
Ejemplo n.º 7
0
    Loads common constants for web crawling --
    we use these constants to get data out of the search response
    we get back.
*/
require_once BASE_DIR . "/lib/crawl_constants.php";
/** Load search controller class needed to get search results */
require_once BASE_DIR . "/controllers/search_controller.php";
/*
 * Set up multi-byte string handling to use UTF-8
 */
mb_internal_encoding("UTF-8");
mb_regex_encoding("UTF-8");
/** The cached pages part of the search API needs the global locale functions */
require_once BASE_DIR . "/lib/locale_functions.php";
// NOTE(review): $locale looks like a global that setLocaleObject fills in -- confirm
$locale = NULL;
// this example always runs with the en-US locale
setLocaleObject("en-US");
/**
 * If the index being used made use of any indexing plugins, we can
 * declare them here.
 */
$indexing_plugins = array();
$controller = new SearchController($indexing_plugins);
// ######
/*
 Now we can do queries! First do a simple search on art and print the results
*/
echo "\n\n\nAn example of a query request with the search API:\n";
$query = "art i:1317414322";
/* i:1317414322 is the timestamp of the index to use.
     API requires that a default index be set even though the query might
     specify to use a different one. The query string we pass to the
Ejemplo n.º 8
0
 /**
  * Removes the state left behind by iterating through a crawl mix: the
  * query save point held by the search controller and, if it exists, the
  * archive iterator directory under WORK_DIRECTORY/schedules.
  *
  * @param int $timestamp The timestamp of the crawl mix
  */
 function deleteCrawlMixIteratorState($timestamp)
 {
     global $INDEXING_PLUGINS;
     setLocaleObject(getLocaleTag());
     $search = new SearchController($INDEXING_PLUGINS);
     $search->clearQuerySavepoint($timestamp);
     $iterator_dir = WORK_DIRECTORY . "/schedules/" . self::name_archive_iterator . $timestamp;
     if (!file_exists($iterator_dir)) {
         // nothing on disk to clean up
         return;
     }
     $this->db->unlinkRecursive($iterator_dir);
 }