Пример #1
0
    /**
     * Responsible for handling admin requests related to the configure
     * activity
     *
     * The configure activity allows a user to set the work directory for
     * storing data local to this SeekQuarry/Yioop instance. It also allows one
     * to set the default language of the installation, dbms info, robot info,
     * test info, as well as which machine acts as the queue server.
     *
     * The sub-activity is selected by $_REQUEST['arg']:
     *  - "directory": set (and if necessary create) the work directory and
     *    an initial profile in it
     *  - "profile": save the submitted profile fields and uploaded resources
     *  - "reset": restore the appearance related fields to default values
     *  - anything else: just load the existing profile, if there is one
     *
     * @return array $data fields for available language, dbms, etc as well as
     *     results of processing sub activity if any
     */
    function configure()
    {
        $parent = $this->parent;
        $profile_model = $parent->model("profile");
        $group_model = $parent->model("group");
        $data = array();
        $profile = array();
        $data['SYSTEM_CHECK'] = $this->systemCheck();
        $languages = $parent->model("locale")->getLocaleList();
        foreach ($languages as $language) {
            $data['LANGUAGES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME'];
        }
        if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
            $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
            $profile['DEFAULT_LOCALE'] = $data['lang'];
            setLocaleObject($data['lang']);
        }
        $data["ELEMENT"] = "configure";
        $data['SCRIPT'] = "";
        $data['PROFILE'] = false;
        if (isset($_REQUEST['WORK_DIRECTORY']) || defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER) {
            if (defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER && !isset($_REQUEST['WORK_DIRECTORY'])) {
                /* No directory was submitted but FIX_NAME_SERVER is on:
                   act as if the current work directory had been submitted
                   and remove its profile.php so that the "directory" case
                   below writes a fresh one.
                 */
                $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $_REQUEST['arg'] = "directory";
                @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
            }
            $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
            $data['PROFILE'] = true;
            /* Only absolute paths are accepted. substr() rather than
               $dir[0] / $dir[1] is used so that an empty or one character
               value is rejected without an "uninitialized string offset"
               warning.
             */
            if (strstr(PHP_OS, "WIN")) {
                //convert to forward slashes so consistent with rest of code
                $dir = str_replace("\\", "/", $dir);
                if (substr($dir, 0, 1) != "/" && substr($dir, 1, 1) != ":") {
                    $data['PROFILE'] = false;
                }
            } else {
                if (substr($dir, 0, 1) != "/") {
                    $data['PROFILE'] = false;
                }
            }
            if ($data['PROFILE'] == false) {
                $data["MESSAGE"] = tl('system_component_configure_use_absolute_path');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            // the work directory may not be the code directory or inside it
            if (strstr($dir . "/", BASE_DIR . "/")) {
                $data['PROFILE'] = false;
                $data["MESSAGE"] = tl('system_component_configure_configure_diff_base_dir');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            $data['WORK_DIRECTORY'] = $dir;
        } else {
            // nothing submitted: use the already configured work directory
            if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 && strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 && (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
                $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $data['PROFILE'] = true;
            }
        }
        $arg = "";
        if (isset($_REQUEST['arg'])) {
            $arg = $_REQUEST['arg'];
        }
        switch ($arg) {
            case "directory":
                if (!isset($data['WORK_DIRECTORY'])) {
                    break;
                }
                if ($data['PROFILE'] && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    // directory already has a profile: just point at it
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                    $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                    $data["MESSAGE"] = tl('system_component_configure_work_dir_set');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');setTimeout(" . "'window.location.href=window.location.href', 3000);";
                } else {
                    if ($data['PROFILE'] && strlen($data['WORK_DIRECTORY']) > 0) {
                        if ($profile_model->makeWorkDirectory($data['WORK_DIRECTORY'])) {
                            // seed the initial profile values
                            $profile['DBMS'] = 'sqlite3';
                            $data['DBMS'] = 'sqlite3';
                            $profile['DB_NAME'] = 'default';
                            $data['DB_NAME'] = 'default';
                            $profile['USER_AGENT_SHORT'] = tl('system_component_name_your_bot');
                            $data['USER_AGENT_SHORT'] = $profile['USER_AGENT_SHORT'];
                            $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                            $http = isset($_SERVER['HTTPS']) ? "https://" : "http://";
                            $profile['NAME_SERVER'] = $http . $_SERVER['SERVER_NAME'] . $uri;
                            $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                            $profile['AUTH_KEY'] = crawlHash($data['WORK_DIRECTORY'] . time());
                            $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                            $profile['FIAT_SHAMIR_MODULUS'] = generateFiatShamirModulus();
                            $robot_instance = str_replace(".", "_", $_SERVER['SERVER_NAME']) . "-" . time();
                            $profile['ROBOT_INSTANCE'] = $robot_instance;
                            $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                            if ($profile_model->updateProfile($data['WORK_DIRECTORY'], array(), $profile)) {
                                /* the config file only needs rewriting if
                                   the directory differs from the current one
                                 */
                                if (defined('WORK_DIRECTORY') && $data['WORK_DIRECTORY'] == WORK_DIRECTORY || $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY'])) {
                                    $data["MESSAGE"] = tl('system_component_configure_work_profile_made');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                                    $data['PROFILE'] = true;
                                } else {
                                    $data['PROFILE'] = false;
                                    $data["MESSAGE"] = tl('system_component_configure_no_set_config');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                }
                            } else {
                                $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                                $data['PROFILE'] = false;
                                $data["MESSAGE"] = tl('system_component_configure_no_create_profile');
                                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>'); setTimeout('window.location.href=" . "window.location.href', 3000);";
                            }
                        } else {
                            $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                            $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                            $data['PROFILE'] = false;
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                        $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                        $data['PROFILE'] = false;
                    }
                }
                break;
            case "profile":
                $parent->updateProfileFields($data, $profile, array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS', 'LANDING_PAGE'));
                // DEBUG_LEVEL is a bit mask built from the checked boxes
                $data['DEBUG_LEVEL'] = 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
                $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                // uploaded resources are stored under APP_DIR/resources
                $folder = APP_DIR . "/resources";
                if (!file_exists(APP_DIR) && !mkdir(APP_DIR) || !file_exists($folder) && !mkdir($folder)) {
                    $data["MESSAGE"] = tl('system_component_no_resource_folder');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH') as $field) {
                    if (isset($_FILES[$field]['name']) && $_FILES[$field]['name'] != "") {
                        // the search bar must be xml, everything else an image
                        if (!in_array($_FILES[$field]['type'], array('image/png', 'image/gif', 'image/jpeg', 'image/x-icon')) && $field != 'SEARCHBAR_PATH' || $_FILES[$field]['type'] != 'text/xml' && $field == 'SEARCHBAR_PATH') {
                            $data["MESSAGE"] = tl('system_component_invalid_filetype');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        if ($_FILES[$field]['size'] > THUMB_SIZE) {
                            $data["MESSAGE"] = tl('system_component_file_too_big');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        $profile[$field] = array();
                        $profile[$field]['name'] = $_FILES[$field]['name'];
                        $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                        $data[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $profile[$field]['name'];
                    }
                }
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile)) {
                    $data['MESSAGE'] = tl('system_component_configure_profile_change');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                    // a changed debug level triggers a reload of the page
                    if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                        $data['SCRIPT'] .= "setTimeout('window.location.href=\"" . "?c=admin&amp;a=configure&amp;" . CSRF_TOKEN . "=" . $_REQUEST[CSRF_TOKEN] . "\"', 3*sec);";
                    }
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                }
                break;
            case "reset":
                $base_url = NAME_SERVER;
                if (defined("BASE_URL")) {
                    $base_url = BASE_URL;
                }
                $profile = array('LANDING_PAGE' => false, 'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                foreach ($old_profile as $key => $value) {
                    $data[$key] = $value;
                }
                /* the old background image is blanked for the update call
                   and restored afterwards so its file can be cleaned up
                 */
                $tmp_image = $old_profile['BACKGROUND_IMAGE'];
                $old_profile['BACKGROUND_IMAGE'] = "";
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile, true)) {
                    $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                    foreach ($profile as $key => $value) {
                        $data[$key] = $value;
                        // remove resource files the old profile referred to
                        if (in_array($key, array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH')) && $old_profile[$key] != "") {
                            $resource_name = APP_DIR . "/resources/" . $old_profile[$key];
                            if (file_exists($resource_name)) {
                                unlink($resource_name);
                            }
                        }
                    }
                    $data['MESSAGE'] = tl('system_component_configure_reset_completed');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                }
                break;
            default:
                if (isset($data['WORK_DIRECTORY']) && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                } else {
                    $data['WORK_DIRECTORY'] = "";
                    $data['PROFILE'] = false;
                }
        }
        $data['advanced'] = "false";
        if ($data['PROFILE']) {
            $locale_tag = getLocaleTag();
            /* Same fallback as the "reset" case: BASE_URL might not be
               defined, in which case NAME_SERVER is used for the favicon.
             */
            $base_url = defined("BASE_URL") ? BASE_URL : NAME_SERVER;
            $not_null_fields = array('LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN");
            foreach ($not_null_fields as $field => $default) {
                // empty() also covers a field missing from $data entirely
                if (empty($data[$field])) {
                    $data[$field] = $default;
                }
            }
            if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
                $robot_description = substr($parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"), 0, MAX_GROUP_PAGE_LEN);
                $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot", $robot_description, $locale_tag, "", "", "", "");
            }
            $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID, "bot", $locale_tag, "edit");
            $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ? $robot_info["PAGE"] : tl('system_component_describe_robot');
            if (isset($_REQUEST['advanced']) && $_REQUEST['advanced'] == 'true') {
                $data['advanced'] = "true";
            }
            // Javascript to show/hide the advanced parts of the form
            $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true')
            ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
        }
        // changing the language drop down resubmits the form
        $data['SCRIPT'] .= "\nelt('locale').onchange = " . "function () { elt('configureProfileForm').submit();};\n";
        return $data;
    }
Пример #2
0
 /**
  * Builds the list of meta words for a downloaded document. Meta words
  * (for example, server:apache) get associated with the document in the
  * index even when the text of the document does not contain them.
  *
  * @param array& $site associative array containing info about a downloaded
  *     (or read from archive) document.
  * @param array $video_sources used to check if a page should be marked as
  *      having meta media:video
  * @return array of meta words to be associated with this document
  */
 static function calculateMetas(&$site, $video_sources = array())
 {
     // user added meta words, if any, come first
     $metas = isset($site[CrawlConstants::META_WORDS]) ? $site[CrawlConstants::META_WORDS] : array();
     $url = $site[CrawlConstants::URL];
     /*
         Built-in meta words start here. site: words record the
         sites the document belongs to, so one can search by site
      */
     $host_sites = array_merge(UrlParser::getHostPaths($url), UrlParser::getHostSubdomains($url));
     $metas[] = 'site:all';
     foreach ($host_sites as $host_site) {
         if (strlen($host_site) == 0) {
             continue;
         }
         $metas[] = 'site:' . $host_site;
     }
     // path: words, one for each prefix of the url path
     $path = UrlParser::getPath($url);
     if (strlen($path) > 0) {
         $metas[] = 'path:all';
         $metas[] = 'path:/';
         $prefix = "";
         foreach (explode("/", $path) as $segment) {
             if (strlen($segment) == 0) {
                 continue;
             }
             $prefix .= "/" . $segment;
             $metas[] = 'path:' . $prefix;
         }
     }
     array_push($metas, 'info:' . $url, 'info:' . crawlHash($url), 'code:all', 'code:' . $site[CrawlConstants::HTTP_CODE]);
     //host:all is used to count number of distinct hosts
     if (UrlParser::getHost($url) . "/" == $url) {
         $metas[] = 'host:all';
     }
     // size and timing words are rounded down to a multiple of an interval
     if (isset($site[CrawlConstants::SIZE])) {
         $step = DOWNLOAD_SIZE_INTERVAL;
         $metas[] = "size:all";
         $metas[] = "size:" . floor($site[CrawlConstants::SIZE] / $step) * $step;
     }
     if (isset($site[CrawlConstants::TOTAL_TIME])) {
         $step = DOWNLOAD_TIME_INTERVAL;
         $metas[] = "time:all";
         $metas[] = "time:" . floor($site[CrawlConstants::TOTAL_TIME] / $step) * $step;
     }
     if (isset($site[CrawlConstants::DNS_TIME])) {
         $step = DOWNLOAD_TIME_INTERVAL;
         $metas[] = "dns:all";
         $metas[] = "dns:" . floor($site[CrawlConstants::DNS_TIME] / $step) * $step;
     }
     if (isset($site[CrawlConstants::LINKS])) {
         $links = $site[CrawlConstants::LINKS];
         $metas[] = "numlinks:all";
         $metas[] = "numlinks:" . count($links);
         $metas[] = "link:all";
         // the urls linked to are the keys of the LINKS array
         foreach (array_keys($links) as $link_url) {
             array_push($metas, 'link:' . $link_url, 'link:' . crawlHash($link_url));
         }
     }
     if (isset($site[CrawlConstants::LOCATION]) && is_array($site[CrawlConstants::LOCATION])) {
         foreach ($site[CrawlConstants::LOCATION] as $location) {
             array_push($metas, 'info:' . $location, 'info:' . crawlHash($location), 'location:all', 'location:' . $location);
         }
     }
     if (isset($site[CrawlConstants::IP_ADDRESSES])) {
         $metas[] = 'ip:all';
         foreach ($site[CrawlConstants::IP_ADDRESSES] as $ip_address) {
             $metas[] = 'ip:' . $ip_address;
         }
     }
     // a specific media: word is only added when video sources were given
     $metas[] = 'media:all';
     if ($video_sources != array()) {
         if (UrlParser::isVideoUrl($url, $video_sources)) {
             $metas[] = "media:video";
         } else {
             if (stripos($site[CrawlConstants::TYPE], "image") !== false) {
                 $metas[] = 'media:image';
             } else {
                 $metas[] = 'media:text';
             }
         }
     }
     // store the filetype info
     $file_type = UrlParser::getDocumentType($url);
     if (strlen($file_type) > 0) {
         array_push($metas, 'filetype:all', 'filetype:' . $file_type);
     }
     if (isset($site[CrawlConstants::SERVER])) {
         array_push($metas, 'server:all', 'server:' . strtolower($site[CrawlConstants::SERVER]));
     }
     if (isset($site[CrawlConstants::SERVER_VERSION])) {
         array_push($metas, 'version:all', 'version:' . $site[CrawlConstants::SERVER_VERSION]);
     }
     if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
         array_push($metas, 'os:all', 'os:' . strtolower($site[CrawlConstants::OPERATING_SYSTEM]));
     }
     // modified: words at year, month and day granularity
     if (isset($site[CrawlConstants::MODIFIED])) {
         $modified_time = $site[CrawlConstants::MODIFIED];
         $metas[] = 'modified:all';
         foreach (array('Y', 'Y-m', 'Y-m-d') as $format) {
             $metas[] = 'modified:' . date($format, $modified_time);
         }
     }
     // date: words from year down to second granularity
     if (isset($site[CrawlConstants::TIMESTAMP])) {
         $crawl_time = $site[CrawlConstants::TIMESTAMP];
         $metas[] = 'date:all';
         foreach (array('Y', 'Y-m', 'Y-m-d', 'Y-m-d-H', 'Y-m-d-H-i', 'Y-m-d-H-i-s') as $format) {
             $metas[] = 'date:' . date($format, $crawl_time);
         }
     }
     if (isset($site[CrawlConstants::LANG])) {
         $lang = $site[CrawlConstants::LANG];
         $lang_pieces = explode("-", $lang);
         $metas[] = 'lang:all';
         $metas[] = 'lang:' . $lang_pieces[0];
         // full tag is added too when it has a part after the language
         if (count($lang_pieces) > 1) {
             $metas[] = 'lang:' . $lang;
         }
     }
     if (isset($site[CrawlConstants::AGENT_LIST])) {
         foreach ($site[CrawlConstants::AGENT_LIST] as $robot_agent) {
             $metas[] = 'robot:' . strtolower($robot_agent);
         }
     }
     //Add all meta word for subdoctype
     if (isset($site[CrawlConstants::SUBDOCTYPE])) {
         $metas[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
     }
     return $metas;
 }
Пример #3
0
    /**
     * Outputs a profile.php file in the given directory containing profile
     * data based on new and old data sources
     *
     * This function creates a profile.php file if it doesn't exist. A given
     * field is output in the profile according to the precedence that a new
     * value is preferred to an old value, which in turn is preferred to the
     * value that comes from a currently defined constant. It might be the
     * case that a new value for a given field doesn't exist, etc.
     *
     * For the resource fields (logo, favicon, etc.) a new value is an array
     * with 'name' and 'tmp_name' entries describing an uploaded file. The
     * upload is moved into APP_DIR/resources; if it cannot be moved, the
     * field falls back to its old value rather than pointing at a resource
     * that was never stored.
     *
     * If $new_profile_data has an AUXILIARY_CSS entry, it is also written to
     * APP_DIR/css/auxiliary.css.
     *
     * @param string $directory the work directory to output the profile.php
     *     file
     * @param array $new_profile_data fields and values containing at least
     *     some profile information (only $this->profile_fields
     *     fields of $new_profile_data will be considered).
     * @param array $old_profile_data fields and values that come from
     *     presumably a previously existing profile
     * @param bool $reset whether the new profile data is coming from a reset
     *     to factory settings or not
     * @return bool true if the profile file could be written, false otherwise
     */
    function updateProfile($directory, $new_profile_data, $old_profile_data, $reset = false)
    {
        $n = array();
        $n[] = <<<EOT
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009-2012  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * Computer generated file giving the key defines of directory locations
 * as well as database settings used to run the SeekQuarry/Yioop search engine
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage config
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009-2012
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
EOT;
        $base_url = NAME_SERVER;
        if (defined("BASE_URL")) {
            $base_url = BASE_URL;
        }
        //make sure certain fields are not null
        $not_null_fields = array('BACKGROUND_COLOR' => "#FFF", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AD_LOCATION' => 'none');
        $file_fields = $this->file_fields;
        // fields whose new value is an uploaded file rather than a string
        $upload_fields = array('LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH', 'BACKGROUND_IMAGE');
        $profile = array();
        //now integrate the different profiles
        foreach ($this->profile_fields as $field) {
            $have_new_value = false;
            if (isset($new_profile_data[$field])) {
                if (!$reset && in_array($field, $upload_fields)) {
                    $upload = $new_profile_data[$field];
                    if (isset($upload['name']) && isset($upload['tmp_name'])) {
                        /* only record the new resource if the upload really
                           ended up in the resources folder
                         */
                        if (move_uploaded_file($upload['tmp_name'], APP_DIR . "/resources/" . $upload['name'])) {
                            $profile[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $upload['name'];
                            $have_new_value = true;
                        }
                    }
                } else {
                    $profile[$field] = $new_profile_data[$field];
                    $have_new_value = true;
                }
            }
            if (!$have_new_value) {
                // precedence: old profile value, defined constant, empty
                if (isset($old_profile_data[$field])) {
                    $profile[$field] = $old_profile_data[$field];
                } else {
                    if (defined($field)) {
                        $profile[$field] = constant($field);
                    } else {
                        $profile[$field] = "";
                    }
                }
            }
            if (!$profile[$field] && isset($not_null_fields[$field])) {
                $profile[$field] = $not_null_fields[$field];
            }
            if ($field == "NEWS_MODE" && $profile[$field] == "") {
                $profile[$field] = "news_off";
            }
            // WEB_URI is always recomputed rather than taken from a profile
            if ($field == "WEB_URI") {
                if (isset($_SERVER['REQUEST_URI'])) {
                    $profile[$field] = UrlParser::getPath($_SERVER['REQUEST_URI']);
                } else {
                    $profile[$field] = UrlParser::getPath(NAME_SERVER);
                }
            }
            // fields in $this->file_fields get no define in profile.php
            if (in_array($field, $file_fields)) {
                continue;
            }
            /* NOTE(review): values are placed unescaped inside a double
               quoted PHP string in the generated file, so a value with a
               ", \ or $ in it changes the generated code. Escaping here
               would need a matching change in whatever reads the profile
               back -- confirm against getProfile before changing.
             */
            if ($field != "DEBUG_LEVEL") {
                $profile[$field] = "\"{$profile[$field]}\"";
            }
            $n[] = "define('{$field}', {$profile[$field]});";
        }
        $out = implode("\n", $n);
        if (file_put_contents($directory . PROFILE_FILE_NAME, $out) !== false) {
            /* chmod/mkdir failures below are tolerated, so the previous
               error handler is put back while they run and the yioop
               handler is set again afterwards
             */
            restore_error_handler();
            @chmod($directory . PROFILE_FILE_NAME, 0777);
            if (isset($new_profile_data['AUXILIARY_CSS'])) {
                if (!file_exists(APP_DIR . "/css")) {
                    @mkdir(APP_DIR . "/css");
                    @chmod(APP_DIR . "/css", 0777);
                }
                $css_file = APP_DIR . "/css/auxiliary.css";
                file_put_contents($css_file, $new_profile_data['AUXILIARY_CSS']);
                @chmod($css_file, 0777);
            }
            set_error_handler("yioop_error_handler");
            return true;
        }
        return false;
    }
Пример #4
0
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     a url to get defaults to URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true do a faster request of pages by not
  *     doing things like extract HTTP headers sent, etcs
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     server to use rather than to directly download web pages from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false, $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL, $key = CrawlConstants::URL, $value = CrawlConstants::PAGE, $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "", $proxy_servers = array())
 {
     /*
        Overview: (1) build one curl handle per site that has a $key field
        and add it to a curl multi handle, (2) drive the multi handle until
        all transfers finish, PAGE_TIMEOUT elapses or memory runs low,
        (3) copy the downloaded content and, unless $minimal, the transfer
        statistics back into $sites, which is then returned.
      */
     $agent_handler = curl_multi_init();
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     // per-site temp files that receive curl's verbose (stderr) output
     $ip_holder = array();
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (!isset($sites[$i][$key])) {
             continue;
         }
         list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders($sites[$i][$key], $minimal, $proxy_servers);
         if ($headers == "gopher") {
             $is_gopher = true;
             $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
             $headers = array();
         }
         $sites[$i][0] = curl_init();
         if (!$minimal) {
             $holder = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
             // only redirect curl's verbose output if the file could be opened
             if ($holder !== false) {
                 $ip_holder[$i] = $holder;
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
         }
         curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
         curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
         curl_setopt($sites[$i][0], CURLOPT_URL, $url);
         /*
            Use a per-site copy of $follow so that forcing redirects for one
            robots.txt request does not leak into the requests for the
            sites that come after it in the batch
          */
         $follow_site = $follow;
         if (strcmp(substr($url, -10), "robots.txt") == 0) {
             $sites[$i]['ROBOT'] = true;
             /*wikipedia redirects their robot page. grr
                 want to force this for robots pages
               */
             $follow_site = true;
         }
         curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow_site);
         curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
         curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
         curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
         curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
         curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
         if (stripos($url, '.onion') !== false && $tor_proxy != "") {
             curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
             //CURLPROXY_SOCKS5_HOSTNAME = 7
             curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
             if ($timer) {
                 crawlLog("Using Tor proxy for {$url}..");
             }
         } elseif ($proxy_servers != array() && !$is_gopher) {
             // proxy entries are written as ip[:port[:type]]
             $select_proxy = rand(0, count($proxy_servers) - 1);
             $proxy_server = $proxy_servers[$select_proxy];
             $proxy_parts = explode(":", $proxy_server);
             $proxy_ip = $proxy_parts[0];
             $proxy_port = isset($proxy_parts[1]) ? $proxy_parts[1] : "80";
             $proxy_scheme = isset($proxy_parts[2]) ? strtolower($proxy_parts[2]) : 'http';
             if ($proxy_scheme == 'http') {
                 $proxy_type = CURLPROXY_HTTP;
             } elseif ($proxy_scheme == 'socks5') {
                 $proxy_type = CURLPROXY_SOCKS5;
             } else {
                 // unrecognized type: hand the value to curl unchanged
                 $proxy_type = $proxy_parts[2];
             }
             curl_setopt($sites[$i][0], CURLOPT_PROXY, "{$proxy_ip}:{$proxy_port}");
             curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
             if ($timer) {
                 crawlLog("Selecting proxy {$select_proxy} for {$url}");
             }
         }
         if (!$minimal) {
             curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
         }
         //make lighttpd happier
         if (!$is_gopher) {
             curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
         }
         curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
         // ^ need to set for sites like att that use gzip
         if ($page_range_request > 0) {
             curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
         }
         if ($post_data != NULL) {
             curl_setopt($sites[$i][0], CURLOPT_POST, true);
             curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
         }
         curl_multi_add_handle($agent_handler, $sites[$i][0]);
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     if ($memory_limit <= 0) {
         /* a memory_limit of -1 means unlimited; without this the loop
            below would give up after a single pass.
            NOTE(review): assumes metricToInt("-1") is not positive
          */
         $memory_limit = PHP_INT_MAX;
     }
     do {
         curl_multi_exec($agent_handler, $running);
         curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     // only a timeout if transfers were still pending when the time ran out
     if ($timer && $running > 0 && time() - $start >= PAGE_TIMEOUT) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         // reset per-site state so nothing carries over from the last site
         $header = NULL;
         $ip_addresses = false;
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         if (!isset($sites[$i][0]) || !$sites[$i][0]) {
             continue;
         }
         // Get Data and Message Code
         $content = @curl_multi_getcontent($sites[$i][0]);
         $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
         /*
            If the Transfer-encoding was chunked then the Range header
            we sent was ignored. So we manually truncate the data
            here
         */
         if ($page_range_request > 0) {
             $content = substr($content, 0, $page_range_request);
         }
         if (isset($content) && !$minimal && !$is_gopher) {
             $site = self::parseHeaderPage($content, $value);
             $sites[$i] = array_merge($sites[$i], $site);
             if (is_string($header)) {
                 /* keep the verbose output up to and including the first
                    blank line; if there is none keep all that was read
                    rather than cutting it down to four bytes
                  */
                 $header_end = strpos($header, "\r\n\r\n");
                 if ($header_end !== false) {
                     $header = substr($header, 0, $header_end + 4);
                 }
             } else {
                 $header = "";
             }
             $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
         } elseif (isset($content) && !$minimal && $is_gopher) {
             $sites[$i][CrawlConstants::HEADER] = is_string($header) ? $header : "";
             $sites[$i][$value] = $content;
         } else {
             $sites[$i][$value] = $content;
         }
         if (!$minimal) {
             $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
             $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
             $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
             $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
             if ($is_gopher) {
                 // gopher has no status codes, so report success
                 $sites[$i][self::HTTP_CODE] = 200;
             } elseif (!$sites[$i][self::HTTP_CODE]) {
                 // no HTTP status at all: record curl's error text instead
                 $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
             }
             if ($ip_addresses) {
                 $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
             } else {
                 $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
             }
             //Get Time, Mime type and Character encoding
             $sites[$i][self::TIMESTAMP] = time();
             if ($is_gopher) {
                 $path = UrlParser::getPath($sites[$i][self::URL]);
                 $filename = UrlParser::getDocumentFilename($sites[$i][self::URL]);
                 /* the character after the leading slash is used as the
                    gopher item type. Compare it as a string: a loose
                    comparison of 'h' or 'g' with the integer 0 is true
                    before PHP 8 and would label them text/plain
                  */
                 $gopher_type = isset($path[1]) ? (string) $path[1] : "1";
                 if ($gopher_type === "1") {
                     $sites[$i][self::TYPE] = "text/gopher";
                 } elseif (in_array($gopher_type, array("0", "3", "6"), true)) {
                     $sites[$i][self::TYPE] = "text/plain";
                     if ($gopher_type === "6") {
                         $sites[$i][$value] = convert_uudecode($content);
                     }
                 } elseif ($gopher_type === 'h') {
                     $sites[$i][self::TYPE] = "text/html";
                 } elseif ($gopher_type === 'g') {
                     $sites[$i][self::TYPE] = "image/gif";
                 }
                 $path_info = pathinfo($filename);
                 if (!isset($sites[$i][self::TYPE]) && isset($path_info['extension'])) {
                     $sites[$i][self::TYPE] = UrlParser::guessMimeTypeFromFileName($filename);
                 } elseif (!isset($sites[$i][self::TYPE])) {
                     $sites[$i][self::TYPE] = "unknown";
                 }
             } else {
                 // curl may report no content type; treat that as ""
                 $type_parts = explode(";", (string) curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                 $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
             }
         }
         //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
         curl_close($sites[$i][0]);
         if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
             /* a robots.txt request that was redirected and did not come
                back as text/plain is replaced by a synthetic robots.txt
                that disallows everything
              */
             if (isset($sites[$i][self::TYPE]) && $sites[$i][self::TYPE] != "text/plain" && isset($sites[$i][CrawlConstants::LOCATION]) && count($sites[$i][CrawlConstants::LOCATION]) > 0) {
                 // remember the real status before it is replaced below
                 $original_code = isset($sites[$i][self::HTTP_CODE]) ? $sites[$i][self::HTTP_CODE] : "";
                 $sites[$i][self::TYPE] = "text/plain";
                 $tmp_parts = explode("\n", wordwrap($sites[$i][$value], 80));
                 $tmp = "# Suspect server misconfiguration\n";
                 $tmp .= "# Assume shouldn't crawl this site.\n";
                 $tmp .= "# Pretending got following robots.txt.\n";
                 $tmp .= "User-agent: *\n";
                 $tmp .= "Disallow: /\n";
                 $tmp .= "# Original error code: " . $original_code . "\n";
                 $tmp .= "# Original content:\n";
                 foreach ($tmp_parts as $part) {
                     $tmp .= "#" . $part . "\n";
                 }
                 $sites[$i][$value] = $tmp;
                 $sites[$i][self::HTTP_CODE] = "200";
                 unset($sites[$i][CrawlConstants::LOCATION]);
             }
         }
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }