/**
 * Responsible for handling admin request related to the configure activity
 *
 * The configure activity allows a user to set the work directory for
 * storing data local to this SeekQuarry/Yioop instance. It also allows one
 * to set the default language of the installation, dbms info, robot info,
 * test info, as well as which machine acts as the queue server.
 *
 * The sub activity is chosen by $_REQUEST['arg']:
 *  - "directory": validate the work directory, creating it and a default
 *    profile.php if needed
 *  - "profile": save submitted profile fields and uploaded resource files
 *  - "reset": restore the appearance related fields to factory defaults
 *  - anything else: load the existing profile (if there is one)
 *
 * @return array $data fields for available language, dbms, etc as well as
 *     results of processing sub activity if any
 */
function configure()
{
    $parent = $this->parent;
    $profile_model = $parent->model("profile");
    $group_model = $parent->model("group");
    $data = array();
    $profile = array();
    $data['SYSTEM_CHECK'] = $this->systemCheck();
    $languages = $parent->model("locale")->getLocaleList();
    foreach ($languages as $language) {
        $data['LANGUAGES'][$language['LOCALE_TAG']] =
            $language['LOCALE_NAME'];
    }
    if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
        $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
        $profile['DEFAULT_LOCALE'] = $data['lang'];
        setLocaleObject($data['lang']);
    }
    $data["ELEMENT"] = "configure";
    $data['SCRIPT'] = "";
    $data['PROFILE'] = false;
    /* true when the work directory is fixed by the config file and the
       name server entry has to be regenerated */
    $fix_name_server = defined('WORK_DIRECTORY') &&
        defined('FIX_NAME_SERVER') && FIX_NAME_SERVER;
    if (isset($_REQUEST['WORK_DIRECTORY']) || $fix_name_server) {
        if ($fix_name_server && !isset($_REQUEST['WORK_DIRECTORY'])) {
            $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
            $_REQUEST['arg'] = "directory";
            @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
        }
        $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
        /* isset() on the offsets keeps an empty or one character path from
           triggering an uninitialized string offset; such a path is simply
           treated as not absolute */
        $starts_with_slash = isset($dir[0]) && $dir[0] == "/";
        if (strstr(PHP_OS, "WIN")) {
            //convert to forward slashes so consistent with rest of code
            $dir = str_replace("\\", "/", $dir);
            $starts_with_slash = isset($dir[0]) && $dir[0] == "/";
            $has_drive_letter = isset($dir[1]) && $dir[1] == ":";
            $data['PROFILE'] = $starts_with_slash || $has_drive_letter;
        } else {
            $data['PROFILE'] = $starts_with_slash;
        }
        if ($data['PROFILE'] == false) {
            $data["MESSAGE"] =
                tl('system_component_configure_use_absolute_path');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        // the work directory may not live inside the code's base directory
        if (strstr($dir . "/", BASE_DIR . "/")) {
            $data['PROFILE'] = false;
            $data["MESSAGE"] =
                tl('system_component_configure_configure_diff_base_dir');
            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                $data["MESSAGE"] . "</h1>');" .
                "setTimeout('window.location.href= " .
                "window.location.href', 3000);";
            $data['WORK_DIRECTORY'] = $dir;
            return $data;
        }
        $data['WORK_DIRECTORY'] = $dir;
    } else {
        if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 &&
            strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 &&
            (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
            $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
            $data['PROFILE'] = true;
        }
    }
    /* url prefix used for default resources: BASE_URL when defined,
       otherwise NAME_SERVER (the same fallback the reset case always used) */
    if (defined("BASE_URL")) {
        $base_url = BASE_URL;
    } else {
        $base_url = defined("NAME_SERVER") ? NAME_SERVER : "";
    }
    // profile fields whose values are files kept in APP_DIR/resources
    $resource_fields = array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO',
        'FAVICON', 'SEARCHBAR_PATH');
    $arg = "";
    if (isset($_REQUEST['arg'])) {
        $arg = $_REQUEST['arg'];
    }
    switch ($arg) {
        case "directory":
            if (!isset($data['WORK_DIRECTORY'])) {
                break;
            }
            if ($data['PROFILE'] &&
                file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                // directory already has a profile, so just start using it
                $data = array_merge($data,
                    $profile_model->getProfile($data['WORK_DIRECTORY']));
                $profile_model->setWorkDirectoryConfigFile(
                    $data['WORK_DIRECTORY']);
                $data["MESSAGE"] =
                    tl('system_component_configure_work_dir_set');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');setTimeout(" .
                    "'window.location.href=window.location.href', 3000);";
            } elseif ($data['PROFILE'] &&
                strlen($data['WORK_DIRECTORY']) > 0 &&
                $profile_model->makeWorkDirectory($data['WORK_DIRECTORY'])) {
                // fresh work directory: seed it with a default profile
                $profile['DBMS'] = 'sqlite3';
                $data['DBMS'] = 'sqlite3';
                $profile['DB_NAME'] = 'default';
                $data['DB_NAME'] = 'default';
                $profile['USER_AGENT_SHORT'] =
                    tl('system_component_name_your_bot');
                $data['USER_AGENT_SHORT'] = $profile['USER_AGENT_SHORT'];
                $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                $http = isset($_SERVER['HTTPS']) ? "https://" : "http://";
                $profile['NAME_SERVER'] =
                    $http . $_SERVER['SERVER_NAME'] . $uri;
                $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                $profile['AUTH_KEY'] =
                    crawlHash($data['WORK_DIRECTORY'] . time());
                $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                $profile['FIAT_SHAMIR_MODULUS'] =
                    generateFiatShamirModulus();
                $robot_instance =
                    str_replace(".", "_", $_SERVER['SERVER_NAME']) .
                    "-" . time();
                $profile['ROBOT_INSTANCE'] = $robot_instance;
                $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                    array(), $profile)) {
                    /* the config file only needs rewriting when the
                       directory differs from the one already in use */
                    if ((defined('WORK_DIRECTORY') &&
                        $data['WORK_DIRECTORY'] == WORK_DIRECTORY) ||
                        $profile_model->setWorkDirectoryConfigFile(
                            $data['WORK_DIRECTORY'])) {
                        $data["MESSAGE"] = tl(
                            'system_component_configure_work_profile_made');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] . "</h1>');" .
                            "setTimeout('window.location.href= " .
                            "window.location.href', 3000);";
                        $data = array_merge($data,
                            $profile_model->getProfile(
                                $data['WORK_DIRECTORY']));
                        $data['PROFILE'] = true;
                    } else {
                        $data['PROFILE'] = false;
                        $data["MESSAGE"] =
                            tl('system_component_configure_no_set_config');
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            $data["MESSAGE"] . "</h1>');" .
                            "setTimeout('window.location.href= " .
                            "window.location.href', 3000);";
                    }
                } else {
                    $profile_model->setWorkDirectoryConfigFile(
                        $data['WORK_DIRECTORY']);
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] =
                        tl('system_component_configure_no_create_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] .
                        "</h1>'); setTimeout('window.location.href=" .
                        "window.location.href', 3000);";
                }
            } else {
                // not an absolute path, empty, or could not be created
                $profile_model->setWorkDirectoryConfigFile(
                    $data['WORK_DIRECTORY']);
                $data["MESSAGE"] =
                    tl('system_component_configure_work_dir_invalid');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');" .
                    "setTimeout('window.location.href=" .
                    "window.location.href', 3000);";
                $data['PROFILE'] = false;
            }
            break;
        case "profile":
            // without a work directory there is no profile to update
            if (!isset($data['WORK_DIRECTORY'])) {
                break;
            }
            $parent->updateProfileFields($data, $profile,
                array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS',
                'LANDING_PAGE'));
            $data['DEBUG_LEVEL'] = 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
            $data['DEBUG_LEVEL'] |=
                isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
            $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            $folder = APP_DIR . "/resources";
            if ((!file_exists(APP_DIR) && !mkdir(APP_DIR)) ||
                (!file_exists($folder) && !mkdir($folder))) {
                $data["MESSAGE"] = tl('system_component_no_resource_folder');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>')";
                return $data;
            }
            $image_types = array('image/png', 'image/gif', 'image/jpeg',
                'image/x-icon');
            foreach ($resource_fields as $field) {
                if (!isset($_FILES[$field]['name']) ||
                    $_FILES[$field]['name'] == "") {
                    continue;
                }
                // the search bar is an xml description, the rest are images
                $is_searchbar = ($field == 'SEARCHBAR_PATH');
                $upload_type = $_FILES[$field]['type'];
                if ((!$is_searchbar &&
                    !in_array($upload_type, $image_types)) ||
                    ($is_searchbar && $upload_type != 'text/xml')) {
                    $data["MESSAGE"] =
                        tl('system_component_invalid_filetype');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                if ($_FILES[$field]['size'] > THUMB_SIZE) {
                    $data["MESSAGE"] = tl('system_component_file_too_big');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                $profile[$field] = array();
                $profile[$field]['name'] = $_FILES[$field]['name'];
                $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                $data[$field] = "./?c=resource&a=get&" .
                    "f=resources&n=" . $profile[$field]['name'];
            }
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile)) {
                $data['MESSAGE'] =
                    tl('system_component_configure_profile_change');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
                if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                    /* the token comes straight from the request, so it is
                       url encoded before being put in generated javascript */
                    $token = "";
                    if (isset($_REQUEST[CSRF_TOKEN]) &&
                        is_string($_REQUEST[CSRF_TOKEN])) {
                        $token = urlencode($_REQUEST[CSRF_TOKEN]);
                    }
                    $data['SCRIPT'] .=
                        "setTimeout('window.location.href=\"" .
                        "?c=admin&a=configure&" . CSRF_TOKEN . "=" .
                        $token . "\"', 3*sec);";
                }
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
            }
            break;
        case "reset":
            // without a work directory there is no profile to reset
            if (!isset($data['WORK_DIRECTORY'])) {
                break;
            }
            $profile = array('LANDING_PAGE' => false,
                'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "",
                'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4",
                'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png",
                'M_LOGO' => "resources/m-yioop.png",
                'FAVICON' => $base_url . "favicon.ico",
                'TIMEZONE' => 'America/Los_Angeles',
                'SESSION_NAME' => "yioopbiscuit",
                'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
            $old_profile =
                $profile_model->getProfile($data['WORK_DIRECTORY']);
            foreach ($old_profile as $key => $value) {
                $data[$key] = $value;
            }
            /* blank the old background image while updating, but remember
               it so its file can be removed afterwards */
            $tmp_image = $old_profile['BACKGROUND_IMAGE'];
            $old_profile['BACKGROUND_IMAGE'] = "";
            if ($profile_model->updateProfile($data['WORK_DIRECTORY'],
                $profile, $old_profile, true)) {
                $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                foreach ($profile as $key => $value) {
                    $data[$key] = $value;
                    if (in_array($key, $resource_fields) &&
                        $old_profile[$key] != "") {
                        $resource_name = APP_DIR . "/resources/" .
                            $old_profile[$key];
                        if (file_exists($resource_name)) {
                            unlink($resource_name);
                        }
                    }
                }
                $data['MESSAGE'] =
                    tl('system_component_configure_reset_completed');
                $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" .
                    $data['MESSAGE'] . "</h1>');";
            } else {
                $data['PROFILE'] = false;
                $data["MESSAGE"] =
                    tl('system_component_configure_no_change_profile');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    $data["MESSAGE"] . "</h1>');";
            }
            break;
        default:
            if (isset($data['WORK_DIRECTORY']) &&
                file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                $data = array_merge($data,
                    $profile_model->getProfile($data['WORK_DIRECTORY']));
            } else {
                $data['WORK_DIRECTORY'] = "";
                $data['PROFILE'] = false;
            }
    }
    $data['advanced'] = "false";
    if ($data['PROFILE']) {
        $locale_tag = getLocaleTag();
        $not_null_fields = array('LOGO' => "resources/yioop.png",
            'M_LOGO' => "resources/m-yioop.png",
            'FAVICON' => $base_url . "favicon.ico",
            'TIMEZONE' => 'America/Los_Angeles',
            'SESSION_NAME' => "yioopbiscuit",
            'CSRF_TOKEN' => "YIOOP_TOKEN");
        foreach ($not_null_fields as $field => $default) {
            // empty() also covers a field that was never set in $data
            if (empty($data[$field])) {
                $data[$field] = $default;
            }
        }
        if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
            $robot_description = substr(
                $parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"),
                0, MAX_GROUP_PAGE_LEN);
            $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot",
                $robot_description, $locale_tag, "", "", "", "");
        }
        $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID,
            "bot", $locale_tag, "edit");
        $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ?
            $robot_info["PAGE"] : tl('system_component_describe_robot');
        if (isset($_REQUEST['advanced']) &&
            $_REQUEST['advanced'] == 'true') {
            $data['advanced'] = "true";
        }
        // closing marker stays in column 0 so older PHP versions accept it
        $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true') ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
    }
    $data['SCRIPT'] .= "\nelt('locale').onchange = " .
        "function () { elt('configureProfileForm').submit();};\n";
    return $data;
}
/**
 * Builds the list of meta words for a downloaded document.
 *
 * Meta words (for example server:apache or site:all) get associated with
 * the document in the index even when the document's own text does not
 * contain them.
 *
 * @param array& $site associated array containing info about a downloaded
 *     (or read from archive) document.
 * @param array $video_sources used to check if a page should be marked as
 *     having meta media:video
 * @return array of meta words to be associated with this document
 */
static function calculateMetas(&$site, $video_sources = array())
{
    $page_url = $site[CrawlConstants::URL];
    // user added meta words, when present, start off the list
    $meta_ids = isset($site[CrawlConstants::META_WORDS]) ?
        $site[CrawlConstants::META_WORDS] : array();
    /* built-in meta words begin with the sites the document belongs to,
       so that one can search by site */
    $host_parts = array_merge(UrlParser::getHostPaths($page_url),
        UrlParser::getHostSubdomains($page_url));
    $meta_ids[] = 'site:all';
    foreach ($host_parts as $host_part) {
        if (strlen($host_part) == 0) {
            continue;
        }
        $meta_ids[] = 'site:' . $host_part;
    }
    // one path: word for each successively longer prefix of the url path
    $url_path = UrlParser::getPath($page_url);
    if (strlen($url_path) > 0) {
        array_push($meta_ids, 'path:all', 'path:/');
        $prefix = "";
        foreach (explode("/", $url_path) as $segment) {
            if (strlen($segment) == 0) {
                continue;
            }
            $prefix = $prefix . "/" . $segment;
            $meta_ids[] = 'path:' . $prefix;
        }
    }
    array_push($meta_ids, 'info:' . $page_url,
        'info:' . crawlHash($page_url), 'code:all',
        'code:' . $site[CrawlConstants::HTTP_CODE]);
    if ($page_url == UrlParser::getHost($page_url) . "/") {
        //used to count number of distinct hosts
        $meta_ids[] = 'host:all';
    }
    // size and timing values are rounded down to a multiple of an interval
    if (isset($site[CrawlConstants::SIZE])) {
        $bucket = DOWNLOAD_SIZE_INTERVAL *
            floor($site[CrawlConstants::SIZE] / DOWNLOAD_SIZE_INTERVAL);
        array_push($meta_ids, "size:all", "size:" . $bucket);
    }
    if (isset($site[CrawlConstants::TOTAL_TIME])) {
        $bucket = DOWNLOAD_TIME_INTERVAL *
            floor($site[CrawlConstants::TOTAL_TIME] / DOWNLOAD_TIME_INTERVAL);
        array_push($meta_ids, "time:all", "time:" . $bucket);
    }
    if (isset($site[CrawlConstants::DNS_TIME])) {
        $bucket = DOWNLOAD_TIME_INTERVAL *
            floor($site[CrawlConstants::DNS_TIME] / DOWNLOAD_TIME_INTERVAL);
        array_push($meta_ids, "dns:all", "dns:" . $bucket);
    }
    if (isset($site[CrawlConstants::LINKS])) {
        $links = $site[CrawlConstants::LINKS];
        array_push($meta_ids, "numlinks:all",
            "numlinks:" . count($links), "link:all");
        foreach (array_keys($links) as $link_url) {
            array_push($meta_ids, 'link:' . $link_url,
                'link:' . crawlHash($link_url));
        }
    }
    if (isset($site[CrawlConstants::LOCATION]) &&
        is_array($site[CrawlConstants::LOCATION])) {
        foreach ($site[CrawlConstants::LOCATION] as $redirect) {
            array_push($meta_ids, 'info:' . $redirect,
                'info:' . crawlHash($redirect), 'location:all',
                'location:' . $redirect);
        }
    }
    if (isset($site[CrawlConstants::IP_ADDRESSES])) {
        $meta_ids[] = 'ip:all';
        foreach ($site[CrawlConstants::IP_ADDRESSES] as $ip) {
            $meta_ids[] = 'ip:' . $ip;
        }
    }
    // a specific media type is only recorded when video sources were given
    $meta_ids[] = 'media:all';
    if ($video_sources != array()) {
        if (UrlParser::isVideoUrl($page_url, $video_sources)) {
            $media_word = "media:video";
        } elseif (stripos($site[CrawlConstants::TYPE], "image") !== false) {
            $media_word = 'media:image';
        } else {
            $media_word = 'media:text';
        }
        $meta_ids[] = $media_word;
    }
    // store the filetype info
    $extension = UrlParser::getDocumentType($page_url);
    if (strlen($extension) > 0) {
        array_push($meta_ids, 'filetype:all', 'filetype:' . $extension);
    }
    if (isset($site[CrawlConstants::SERVER])) {
        array_push($meta_ids, 'server:all',
            'server:' . strtolower($site[CrawlConstants::SERVER]));
    }
    if (isset($site[CrawlConstants::SERVER_VERSION])) {
        array_push($meta_ids, 'version:all',
            'version:' . $site[CrawlConstants::SERVER_VERSION]);
    }
    if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
        array_push($meta_ids, 'os:all',
            'os:' . strtolower($site[CrawlConstants::OPERATING_SYSTEM]));
    }
    // dates are indexed at several granularities, coarsest first
    if (isset($site[CrawlConstants::MODIFIED])) {
        $meta_ids[] = 'modified:all';
        foreach (array('Y', 'Y-m', 'Y-m-d') as $format) {
            $meta_ids[] = 'modified:' .
                date($format, $site[CrawlConstants::MODIFIED]);
        }
    }
    if (isset($site[CrawlConstants::TIMESTAMP])) {
        $meta_ids[] = 'date:all';
        $formats = array('Y', 'Y-m', 'Y-m-d', 'Y-m-d-H', 'Y-m-d-H-i',
            'Y-m-d-H-i-s');
        foreach ($formats as $format) {
            $meta_ids[] = 'date:' .
                date($format, $site[CrawlConstants::TIMESTAMP]);
        }
    }
    if (isset($site[CrawlConstants::LANG])) {
        $full_lang = $site[CrawlConstants::LANG];
        $lang_pieces = explode("-", $full_lang);
        array_push($meta_ids, 'lang:all', 'lang:' . $lang_pieces[0]);
        // a regional tag such as en-US is also indexed in full
        if (count($lang_pieces) > 1) {
            $meta_ids[] = 'lang:' . $full_lang;
        }
    }
    if (isset($site[CrawlConstants::AGENT_LIST])) {
        foreach ($site[CrawlConstants::AGENT_LIST] as $robot_agent) {
            $meta_ids[] = 'robot:' . strtolower($robot_agent);
        }
    }
    //Add all meta word for subdoctype
    if (isset($site[CrawlConstants::SUBDOCTYPE])) {
        $meta_ids[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
    }
    return $meta_ids;
}
/**
 * Outputs a profile.php file in the given directory containing profile
 * data based on new and old data sources
 *
 * This function creates a profile.php file if it doesn't exist. A given
 * field is output in the profile according to the precedence that a new
 * value is preferred to an old value, which is preferred to the value that
 * comes from a currently defined constant. It might be the case that a new
 * value for a given field doesn't exist, etc.
 *
 * Uploaded resource files named in $new_profile_data are moved into
 * APP_DIR/resources, and an AUXILIARY_CSS entry, if present, is written to
 * APP_DIR/css/auxiliary.css.
 *
 * @param string $directory the work directory to output the profile.php
 *     file
 * @param array $new_profile_data fields and values containing at least
 *     some profile information (only $this->profile_fields
 *     fields of $new_profile_data will be considered).
 * @param array $old_profile_data fields and values that come from
 *     presumably a previously existing profile
 * @param bool $reset whether the new profile data is coming from a reset
 *     to factory settings or not
 * @return bool true if profile.php could be written, false otherwise
 */
function updateProfile($directory, $new_profile_data, $old_profile_data,
    $reset = false)
{
    $n = array();
    $n[] = <<<EOT
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009-2012 Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * Computer generated file giving the key defines of directory locations
 * as well as database settings used to run the SeekQuarry/Yioop search engine
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage config
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009-2012
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
EOT;
    $base_url = NAME_SERVER;
    if (defined("BASE_URL")) {
        $base_url = BASE_URL;
    }
    //make sure certain fields are not null
    $not_null_fields = array('BACKGROUND_COLOR' => "#FFF",
        'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4",
        'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png",
        'M_LOGO' => "resources/m-yioop.png",
        'FAVICON' => $base_url . "favicon.ico",
        'TIMEZONE' => 'America/Los_Angeles',
        'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN",
        'AD_LOCATION' => 'none');
    // fields whose new value may be an uploaded file rather than a string
    $upload_fields = array('LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH',
        'BACKGROUND_IMAGE');
    $file_fields = $this->file_fields;
    $profile = array();
    //now integrate the different profiles
    foreach ($this->profile_fields as $field) {
        /* value used when there is no usable new value: the old profile's
           value, else the currently defined constant, else "" */
        if (isset($old_profile_data[$field])) {
            $fallback = $old_profile_data[$field];
        } elseif (defined($field)) {
            $fallback = constant($field);
        } else {
            $fallback = "";
        }
        if (!isset($new_profile_data[$field])) {
            $profile[$field] = $fallback;
        } elseif ($reset || !in_array($field, $upload_fields)) {
            $profile[$field] = $new_profile_data[$field];
        } elseif (isset($new_profile_data[$field]['name']) &&
            isset($new_profile_data[$field]['tmp_name'])) {
            /* the name was supplied by the client; basename() makes sure
               it cannot point outside of the resources folder */
            $upload_name = basename($new_profile_data[$field]['name']);
            move_uploaded_file($new_profile_data[$field]['tmp_name'],
                APP_DIR . "/resources/" . $upload_name);
            $profile[$field] = "./?c=resource&a=get&" .
                "f=resources&n=" . $upload_name;
        } else {
            $profile[$field] = $fallback;
        }
        if (!$profile[$field] && isset($not_null_fields[$field])) {
            $profile[$field] = $not_null_fields[$field];
        }
        if ($field == "NEWS_MODE" && $profile[$field] == "") {
            $profile[$field] = "news_off";
        }
        if ($field == "WEB_URI") {
            // always recomputed from the current request when there is one
            if (isset($_SERVER['REQUEST_URI'])) {
                $profile[$field] =
                    UrlParser::getPath($_SERVER['REQUEST_URI']);
            } else {
                $profile[$field] = UrlParser::getPath(NAME_SERVER);
            }
        }
        // file contents are stored separately, not as defines
        if (in_array($field, $file_fields)) {
            continue;
        }
        if ($field == "DEBUG_LEVEL") {
            /* DEBUG_LEVEL is written unquoted, so an empty or non-numeric
               value would make profile.php unparsable; fall back to an
               integer (0 for "") in that case */
            $define_value = is_numeric($profile[$field]) ?
                $profile[$field] : (int)$profile[$field];
        } else {
            $define_value = "\"{$profile[$field]}\"";
        }
        $n[] = "define('{$field}', {$define_value});";
    }
    $out = implode("\n", $n);
    if (file_put_contents($directory . PROFILE_FILE_NAME, $out) === false) {
        return false;
    }
    /* the custom error handler is switched off around the best-effort
       chmod/mkdir calls below and reinstated afterwards */
    restore_error_handler();
    @chmod($directory . PROFILE_FILE_NAME, 0777);
    if (isset($new_profile_data['AUXILIARY_CSS'])) {
        if (!file_exists(APP_DIR . "/css")) {
            @mkdir(APP_DIR . "/css");
            @chmod(APP_DIR . "/css", 0777);
        }
        $css_file = APP_DIR . "/css/auxiliary.css";
        file_put_contents($css_file, $new_profile_data['AUXILIARY_CSS']);
        @chmod($css_file, 0777);
    }
    set_error_handler("yioop_error_handler");
    return true;
}
/**
 * Make multi_curl requests for an array of sites with urls or onion urls
 *
 * @param array $sites an array containing urls of pages to request
 * @param bool $timer flag, true means print timing statistics to log
 * @param int $page_range_request maximum number of bytes to download/page
 *     0 means download all
 * @param string $temp_dir folder to store temporary ip header info
 * @param string $key the component of $sites[$i] that has the value of
 *     a url to get defaults to URL
 * @param string $value component of $sites[$i] in which to store the
 *     page that was gotten
 * @param bool $minimal if true do a faster request of pages by not
 *     doing things like extract HTTP headers sent, etcs
 * @param array $post_data data to be POST'd to each site
 * @param bool $follow whether to follow redirects or not (redirects are
 *     always followed for robots.txt urls)
 * @param string $tor_proxy url of a proxy that knows how to download
 *     .onion urls
 * @param array $proxy_servers if not array(), then an array of proxy
 *     server to use rather than to directly download web pages from
 *     the current machine
 *
 * @return array an updated array with the contents of those pages
 */
static function getPages($sites, $timer = false,
    $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
    $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
    $minimal = false, $post_data = NULL, $follow = false, $tor_proxy = "",
    $proxy_servers = array())
{
    $agent_handler = curl_multi_init();
    $start_time = microtime();
    if (!$minimal && $temp_dir == NULL) {
        $temp_dir = CRAWL_DIR . "/temp";
        if (!file_exists($temp_dir)) {
            mkdir($temp_dir);
        }
    }
    // per site files that capture curl's verbose output (used to get ips)
    $ip_holder = array();
    //Set-up requests
    $num_sites = count($sites);
    for ($i = 0; $i < $num_sites; $i++) {
        $is_gopher = false;
        $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
        if (!isset($sites[$i][$key])) {
            continue;
        }
        list($sites[$i][$key], $url, $headers) =
            self::prepareUrlHeaders($sites[$i][$key], $minimal,
            $proxy_servers);
        if ($headers == "gopher") {
            $is_gopher = true;
            $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
            $headers = array();
        }
        $sites[$i][0] = curl_init();
        if (!$minimal) {
            $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
            // only redirect verbose output if the file could be opened
            if ($ip_holder[$i]) {
                curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
            }
        }
        curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
        curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE,
            CURL_IPRESOLVE_WHATEVER);
        curl_setopt($sites[$i][0], CURLOPT_URL, $url);
        /* redirect following is decided per site so that forcing it for a
           robots.txt url does not switch it on for every later site */
        $follow_site = $follow;
        if (strcmp(substr($url, -10), "robots.txt") == 0) {
            $sites[$i]['ROBOT'] = true;
            /* wikipedia redirects their robot page. grr
               want to force this for robots pages */
            $follow_site = true;
        }
        curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow_site);
        curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
        curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
        curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
        curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
        if (stripos($url, '.onion') !== false && $tor_proxy != "") {
            curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
            //CURLPROXY_SOCKS5_HOSTNAME = 7
            curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
            if ($timer) {
                crawlLog("Using Tor proxy for {$url}..");
            }
        } elseif ($proxy_servers != array() && !$is_gopher) {
            // proxy entries have the form ip[:port[:type]]
            $select_proxy = rand(0, count($proxy_servers) - 1);
            $proxy_server = $proxy_servers[$select_proxy];
            $proxy_parts = explode(":", $proxy_server);
            $proxy_ip = $proxy_parts[0];
            if (!isset($proxy_parts[2]) ||
                strtolower($proxy_parts[2]) == 'http') {
                $proxy_type = CURLPROXY_HTTP;
            } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                $proxy_type = CURLPROXY_SOCKS5;
            } else {
                $proxy_type = $proxy_parts[2];
            }
            $proxy_port = isset($proxy_parts[1]) ? $proxy_parts[1] : "80";
            curl_setopt($sites[$i][0], CURLOPT_PROXY,
                "{$proxy_ip}:{$proxy_port}");
            curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
            if ($timer) {
                crawlLog("Selecting proxy {$select_proxy} for {$url}");
            }
        }
        if (!$minimal) {
            curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
        }
        //make lighttpd happier
        if (!$is_gopher) {
            curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
        }
        // need to set for sites like att that use gzip
        curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
        if ($page_range_request > 0) {
            curl_setopt($sites[$i][0], CURLOPT_RANGE,
                "0-" . $page_range_request);
        }
        if ($post_data != NULL) {
            curl_setopt($sites[$i][0], CURLOPT_POST, true);
            curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
        }
        curl_multi_add_handle($agent_handler, $sites[$i][0]);
    }
    if ($timer) {
        crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    $start = time();
    //Wait for responses
    $running = NULL;
    $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
    do {
        curl_multi_exec($agent_handler, $running);
        curl_multi_select($agent_handler, 0.005);
        /* a memory_limit of -1 means no limit in PHP, so a non-positive
           value must not be what ends the wait */
        $memory_ok = $memory_limit <= 0 ||
            memory_get_usage() < $memory_limit;
    } while ($memory_ok && time() - $start < PAGE_TIMEOUT && $running > 0);
    // same boundary as the loop condition, so every time out gets logged
    if (time() - $start >= PAGE_TIMEOUT && $timer) {
        crawlLog("  TIMED OUT!!!");
    }
    if ($timer) {
        crawlLog("  Page Request time " . changeInMicrotime($start_time));
    }
    $start_time = microtime();
    //Process returned pages
    for ($i = 0; $i < $num_sites; $i++) {
        if ($timer) {
            crawlTimeoutLog("fetch_url initial processing of page %s of %s",
                $i, $num_sites);
        }
        // reset per site so nothing carries over from the previous one
        $header = "";
        $ip_addresses = array();
        if (!$minimal && isset($ip_holder[$i]) &&
            is_resource($ip_holder[$i])) {
            rewind($ip_holder[$i]);
            $header = fread($ip_holder[$i], 8192);
            $ip_addresses = self::getCurlIp($header);
            fclose($ip_holder[$i]);
        }
        if (!isset($sites[$i][0]) || !$sites[$i][0]) {
            continue;
        }
        // Get Data and Message Code
        $content = @curl_multi_getcontent($sites[$i][0]);
        $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
        /*
            If the Transfer-encoding was chunked then the Range header
            we sent was ignored. So we manually truncate the data
            here
         */
        if ($page_range_request > 0) {
            $content = substr($content, 0, $page_range_request);
        }
        if (isset($content) && !$minimal && !$is_gopher) {
            $site = self::parseHeaderPage($content, $value);
            $sites[$i] = array_merge($sites[$i], $site);
            // keep curl's verbose output only up to the first blank line
            $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
            $sites[$i][CrawlConstants::HEADER] =
                $header . $sites[$i][CrawlConstants::HEADER];
        } elseif (isset($content) && !$minimal && $is_gopher) {
            $sites[$i][CrawlConstants::HEADER] = $header;
            $sites[$i][$value] = $content;
        } else {
            $sites[$i][$value] = $content;
        }
        if (!$minimal) {
            $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0],
                CURLINFO_SIZE_DOWNLOAD);
            $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0],
                CURLINFO_NAMELOOKUP_TIME);
            $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0],
                CURLINFO_TOTAL_TIME);
            $sites[$i][self::HTTP_CODE] =
                curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
            /* no code at all means the request failed, so record curl's
               error text; gopher has no status codes so it is treated as
               a 200. A real HTTP code (404, 500, ...) is left untouched. */
            if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
            } elseif ($is_gopher) {
                $sites[$i][self::HTTP_CODE] = 200;
            }
            if ($ip_addresses) {
                $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
            } else {
                $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
            }
            //Get Time, Mime type and Character encoding
            $sites[$i][self::TIMESTAMP] = time();
            if ($is_gopher) {
                $path = UrlParser::getPath($sites[$i][self::URL]);
                $filename =
                    UrlParser::getDocumentFilename($sites[$i][self::URL]);
                /* the item type is the character after the leading slash;
                   it is compared as a string because on PHP < 8 a letter
                   such as 'h' loosely equals the integer 0 */
                $gopher_type = isset($path[1]) ? (string)$path[1] : "1";
                if ($gopher_type === "1") {
                    $sites[$i][self::TYPE] = "text/gopher";
                } elseif (in_array($gopher_type, array("0", "3", "6"),
                    true)) {
                    $sites[$i][self::TYPE] = "text/plain";
                    if ($gopher_type === "6") {
                        $sites[$i][$value] = convert_uudecode($content);
                    }
                } elseif ($gopher_type === 'h') {
                    $sites[$i][self::TYPE] = "text/html";
                } elseif ($gopher_type === 'g') {
                    $sites[$i][self::TYPE] = "image/gif";
                }
                $path_info = pathinfo($filename);
                if (!isset($sites[$i][self::TYPE]) &&
                    isset($path_info['extension'])) {
                    $sites[$i][self::TYPE] =
                        UrlParser::guessMimeTypeFromFileName($filename);
                } elseif (!isset($sites[$i][self::TYPE])) {
                    $sites[$i][self::TYPE] = "unknown";
                }
            } else {
                $type_parts = explode(";", curl_getinfo($sites[$i][0],
                    CURLINFO_CONTENT_TYPE));
                $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
            }
        }
        /* NOTE(review): as in the original, the handle is closed without
           first calling curl_multi_remove_handle — confirm whether the
           removal should be restored */
        curl_close($sites[$i][0]);
        /* a robots.txt request that was redirected and came back as
           something other than plain text is treated as a misconfigured
           server: a synthetic robots.txt that disallows everything is
           substituted and the original response kept as comments */
        if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT'] &&
            isset($sites[$i][self::TYPE]) &&
            $sites[$i][self::TYPE] != "text/plain" &&
            isset($sites[$i][CrawlConstants::LOCATION]) &&
            is_array($sites[$i][CrawlConstants::LOCATION]) &&
            count($sites[$i][CrawlConstants::LOCATION]) > 0) {
            // remember the real code before it is replaced below
            $original_code = isset($sites[$i][self::HTTP_CODE]) ?
                $sites[$i][self::HTTP_CODE] : "";
            $sites[$i][self::TYPE] = "text/plain";
            $tmp_parts = explode("\n", wordwrap($sites[$i][$value], 80));
            $tmp = "# Suspect server misconfiguration\n";
            $tmp .= "# Assume shouldn't crawl this site.\n";
            $tmp .= "# Pretending got following robots.txt.\n";
            $tmp .= "User-agent: *\n";
            $tmp .= "Disallow: /\n";
            $tmp .= "# Original error code: " . $original_code . "\n";
            $tmp .= "# Original content:\n";
            foreach ($tmp_parts as $part) {
                $tmp .= "#" . $part . "\n";
            }
            $sites[$i][$value] = $tmp;
            $sites[$i][self::HTTP_CODE] = "200";
            unset($sites[$i][CrawlConstants::LOCATION]);
        }
    } //end for
    if ($timer) {
        crawlLog("  Get Page Content time " .
            changeInMicrotime($start_time));
    }
    curl_multi_close($agent_handler);
    return $sites;
}