/**
 * Creates a text archive iterator with the given parameters.
 *
 * When $iterate_dir is not false, the archive description is read from
 * arc_description.ini in that folder (unless $ini is supplied) and every
 * file in the folder with the configured extension becomes a partition.
 * If a checkpoint file exists in $result_dir the iterator state is
 * restored from it, otherwise the iterator is reset.
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over. If this
 *     iterator is used in a fetcher and the data is on a name server
 *     set this to false
 * @param string $result_timestamp timestamp of the arc archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 * @param array $ini describes start_ and end_delimiter, file_extension,
 *     encoding, and compression method used for pages in this archive
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir, $ini = array())
{
    $this->iterate_timestamp = $iterate_timestamp;
    $this->iterate_dir = $iterate_dir;
    $this->result_timestamp = $result_timestamp;
    $this->result_dir = $result_dir;
    if (!file_exists($result_dir)) {
        mkdir($result_dir);
    }
    $this->partitions = array();
    $extension = "";
    if ($this->iterate_dir != false) { // false =network/fetcher iterator
        if ($ini == array()) {
            $ini = parse_ini_with_fallback(
                "{$this->iterate_dir}/arc_description.ini");
        }
        // A description without file_extension yields the same glob
        // pattern as before ("*."), just without an undefined index notice.
        if (isset($ini['file_extension'])) {
            $extension = $ini['file_extension'];
        }
    }
    $this->setIniInfo($ini);
    if ($this->start_delimiter == "" && $this->end_delimiter == "" &&
        $this->iterate_dir != false) {
        crawlLog("At least one of start or end delimiter must be set!!");
        exit;
    }
    if ($this->iterate_dir != false) {
        $filenames = glob("{$this->iterate_dir}/*.{$extension}", GLOB_BRACE);
        // glob() returns false on error; treat that as "no partitions"
        // rather than handing a boolean to foreach.
        if (is_array($filenames)) {
            foreach ($filenames as $filename) {
                $this->partitions[] = $filename;
            }
        }
    }
    $this->num_partitions = count($this->partitions);
    $this->status_filename = "{$this->result_dir}/iterate_status.txt";
    $this->buffer_filename = $this->result_dir . "/buffer.txt";

    if (file_exists($this->status_filename)) {
        $this->restoreCheckpoint();
    } else {
        $this->reset();
    }
}
/**
 * Creates a database archive iterator with the given parameters. This
 * kind of iterator is used to cycle through the results of a SQL query
 * to a database, so that the results might be indexed by Yioop.
 *
 * Connection settings start from the DBMS, DB_HOST, DB_NAME, DB_USER and
 * DB_PASSWORD constants; any of them may be overridden by a lower-cased
 * key of the same name in arc_description.ini. The ini file must supply
 * an sql entry, otherwise a message is logged and the script exits.
 *
 * @param string $iterate_timestamp timestamp of the arc archive bundle to
 *     iterate over the pages of
 * @param string $iterate_dir folder of files to iterate over
 * @param string $result_timestamp timestamp of the arc archive bundle
 *     results are being stored in
 * @param string $result_dir where to write last position checkpoints to
 */
function __construct($iterate_timestamp, $iterate_dir,
    $result_timestamp, $result_dir)
{
    $this->iterate_timestamp = $iterate_timestamp;
    $this->iterate_dir = $iterate_dir;
    $this->result_timestamp = $result_timestamp;
    $this->result_dir = $result_dir;

    $ini = parse_ini_with_fallback(
        "{$this->iterate_dir}/arc_description.ini");

    // Defaults come from the configuration constants; the archive
    // description may override each one using the lower-cased key.
    $this->dbinfo = array(
        "DBMS" => DBMS,
        "DB_HOST" => DB_HOST,
        "DB_NAME" => DB_NAME,
        "DB_USER" => DB_USER,
        "DB_PASSWORD" => DB_PASSWORD
    );
    foreach (array_keys($this->dbinfo) as $setting) {
        $override_key = strtolower($setting);
        if (isset($ini[$override_key])) {
            $this->dbinfo[$setting] = $ini[$override_key];
        }
    }

    $manager_class = ucfirst($this->dbinfo["DBMS"]) . "Manager";
    $this->db = new $manager_class();
    $this->db->connect(
        $this->dbinfo['DB_HOST'],
        $this->dbinfo['DB_USER'],
        $this->dbinfo['DB_PASSWORD'],
        $this->dbinfo['DB_NAME']
    );

    if (!isset($ini['sql'])) {
        crawlLog("Database Archive Iterator needs a SQL statement to run");
        exit;
    }
    $this->sql = $ini['sql'];

    // Optional formatting settings and the value used when the archive
    // description does not provide one.
    $optional_settings = array(
        'field_value_separator' => "\n----\n",
        'column_separator' => "\n====\n",
        'encoding' => "UTF-8"
    );
    foreach ($optional_settings as $setting => $default_value) {
        $this->$setting = isset($ini[$setting]) ? $ini[$setting] :
            $default_value;
    }

    if (!file_exists($result_dir)) {
        mkdir($result_dir);
    }

    $checkpoint_file = "{$this->result_dir}/iterate_status.txt";
    if (file_exists($checkpoint_file)) {
        $this->restoreCheckpoint();
    } else {
        $this->reset();
    }
}
/**
 * Gets a list of all index archives of crawls that have been conducted
 *
 * When $machine_urls names something other than a single localhost, the
 * list is requested from those machines and aggregated (optionally via a
 * cache file that is considered fresh for 300 seconds). Otherwise the
 * list is built from the index bundle folders under CRAWL_DIR/cache and,
 * if requested, the archive folders under CRAWL_DIR/archives.
 *
 * @param bool $return_arc_bundles whether index bundles used for indexing
 *     arc or other archive bundles should be included in the list
 * @param bool $return_recrawls whether index archive bundles generated as
 *     a result of recrawling should be included in the result
 * @param array $machine_urls an array of urls of yioop queue servers
 * @param bool $cache whether to try to get/set the data to a cache file
 *
 * @return array available IndexArchiveBundle directories and
 *     their meta information this meta information includes the time of
 *     the crawl, its description, the number of pages downloaded, and the
 *     number of partitions used in storing the inverted index
 */
function getCrawlList($return_arc_bundles = false, $return_recrawls = false,
    $machine_urls = NULL, $cache = false)
{
    if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
        // Encode the two flags as 0-3 so remote machines and the cache
        // file name agree on which variant of the list is meant.
        $arg = ($return_arc_bundles && $return_recrawls) ? 3 :
            (($return_recrawls) ? 2 : (($return_arc_bundles) ? 1 : 0));
        $cache_file = CRAWL_DIR . "/cache/" .
            self::network_crawllist_base_name . "{$arg}.txt";
        if ($cache && file_exists($cache_file) &&
            filemtime($cache_file) + 300 > time()) {
            return unserialize(file_get_contents($cache_file));
        }
        $list_strings = $this->execMachines("getCrawlList",
            $machine_urls, $arg);
        $list = $this->aggregateCrawlList($list_strings);
        if ($cache) {
            file_put_contents($cache_file, serialize($list));
        }
        return $list;
    }

    $list = array();
    $dirs = glob(CRAWL_DIR . '/cache/' . self::index_data_base_name . '*',
        GLOB_ONLYDIR);
    if (!is_array($dirs)) {
        // glob() returns false on error
        $dirs = array();
    }
    foreach ($dirs as $dir) {
        $crawl = array();
        $pre_timestamp = strstr($dir, self::index_data_base_name);
        $crawl['CRAWL_TIME'] =
            substr($pre_timestamp, strlen(self::index_data_base_name));
        $info = IndexArchiveBundle::getArchiveInfo($dir);
        $index_info = @unserialize($info['DESCRIPTION']);
        $crawl['DESCRIPTION'] = "";
        $is_recrawl = isset($index_info[self::CRAWL_TYPE]) &&
            $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL;
        if ($is_recrawl) {
            if (!$return_recrawls) {
                continue;
            }
            $crawl['DESCRIPTION'] = "RECRAWL::";
        }

        // A crawl is resumable if any sub-folder of its schedule folder
        // contains an entry whose name begins with "At".
        $sched_path = CRAWL_DIR . '/schedules/' .
            self::schedule_data_base_name . $crawl['CRAWL_TIME'];
        $crawl['RESUMABLE'] = false;
        $sched_dir = is_dir($sched_path) ? opendir($sched_path) : false;
        if ($sched_dir !== false) {
            while (!$crawl['RESUMABLE'] &&
                ($name = readdir($sched_dir)) !== false) {
                $sub_path = "{$sched_path}/{$name}";
                if (!is_dir($sub_path) || $name == '.' || $name == '..') {
                    continue;
                }
                $sub_dir = opendir($sub_path);
                if ($sub_dir === false) {
                    continue;
                }
                while (($sub_name = readdir($sub_dir)) !== false) {
                    // strncmp also copes with one character names, which
                    // direct offset access did not
                    if (strncmp($sub_name, 'At', 2) == 0) {
                        $crawl['RESUMABLE'] = true;
                        break;
                    }
                }
                // Always release the handle, including on a match; the
                // outer loop condition then stops the scan.
                closedir($sub_dir);
            }
            closedir($sched_dir);
        }

        if (isset($index_info['DESCRIPTION'])) {
            $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
        }
        $crawl['VISITED_URLS_COUNT'] = isset($info['VISITED_URLS_COUNT']) ?
            $info['VISITED_URLS_COUNT'] : 0;
        $crawl['COUNT'] = isset($info['COUNT']) ? $info['COUNT'] : 0;
        $crawl['NUM_DOCS_PER_PARTITION'] =
            isset($info['NUM_DOCS_PER_PARTITION']) ?
            $info['NUM_DOCS_PER_PARTITION'] : 0;
        $crawl['WRITE_PARTITION'] = isset($info['WRITE_PARTITION']) ?
            $info['WRITE_PARTITION'] : 0;
        $list[] = $crawl;
    }

    if ($return_arc_bundles) {
        $dirs = glob(CRAWL_DIR . '/archives/*', GLOB_ONLYDIR);
        if (!is_array($dirs)) {
            $dirs = array();
        }
        foreach ($dirs as $dir) {
            $ini_file = "{$dir}/arc_description.ini";
            if (!file_exists($ini_file)) {
                continue;
            }
            $ini = parse_ini_with_fallback($ini_file);
            $crawl = array();
            // Non-yioop archives have no crawl timestamp; a checksum of
            // the folder path stands in for it.
            $crawl['CRAWL_TIME'] = crc32($dir);
            $crawl['DESCRIPTION'] = "ARCFILE::";
            $crawl['ARC_DIR'] = $dir;
            $crawl['ARC_TYPE'] = isset($ini['arc_type']) ?
                $ini['arc_type'] : NULL;
            if (isset($ini['description'])) {
                $crawl['DESCRIPTION'] .= $ini['description'];
            }
            $crawl['VISITED_URLS_COUNT'] = 0;
            $crawl['COUNT'] = 0;
            $crawl['NUM_DOCS_PER_PARTITION'] = 0;
            $crawl['WRITE_PARTITION'] = 0;
            $list[] = $crawl;
        }
    }
    return $list;
}
/**
 * Given a folder name, determines the kind of bundle (if any) it holds.
 * It does this based on the expected location of the description.txt file,
 * or arc_description.ini (in the case of a non-yioop archive)
 *
 * @param string $archive_path the path to archive folder
 * @return mixed "WebArchiveBundle" or "IndexArchiveBundle" for yioop
 *     bundles, the arc_type value from arc_description.ini for other
 *     archives, or false if the kind could not be determined
 */
function getArchiveKind($archive_path)
{
    // Marker files checked in order; the first one present decides.
    $bundle_markers = array(
        "{$archive_path}/description.txt" => "WebArchiveBundle",
        "{$archive_path}/summaries/description.txt" => "IndexArchiveBundle"
    );
    foreach ($bundle_markers as $marker_file => $bundle_kind) {
        if (file_exists($marker_file)) {
            return $bundle_kind;
        }
    }

    $ini_path = "{$archive_path}/arc_description.ini";
    if (!file_exists($ini_path)) {
        return false;
    }
    $description = parse_ini_with_fallback($ini_path);
    return isset($description['arc_type']) ? $description['arc_type'] :
        false;
}
/**
 * Updates the configure.ini file and static pages for a particular locale.
 *
 * The configure.ini has general information (at this point not really
 * being used) about all locales together with specific msg_id (identifiers
 * to be translated) and msg_string (translation) data. updateLocale takes
 * line data coming from the general.ini file, strings extracted from
 * documents that might need to be translated, the old configure.ini file
 * (this might have existing translations), as well as new translation
 * data that might come from a localizer via a web form and
 * combines these to produce a new configure.ini file
 *
 * @param array $general_ini data from the general.ini file
 * @param array $strings line array data extracted from files in
 *     directories that have strings in need of translation
 * @param string $dir the directory of all the locales
 * @param string $locale the particular locale in $dir to update
 * @param array $new_configure translations of identifier strings from
 *     another source such as a localizer using a web form
 * @param array $force_folders which locale subfolders should be forced
 *     updated to the fallback dir's version
 */
function updateLocale($general_ini, $strings, $dir, $locale,
    $new_configure = NULL, $force_folders = array())
{
    $old_configure = array();
    $cur_path = $dir . '/' . $locale;
    if (file_exists($cur_path . '/configure.ini')) {
        $old_configure = parse_ini_with_fallback(
            $cur_path . '/configure.ini');
    }
    $fallback_path = FALLBACK_LOCALE_DIR . '/' . $locale;
    $fallback_configure = array();
    if (file_exists($fallback_path . '/configure.ini')) {
        $fallback_configure = parse_ini_with_fallback(
            $fallback_path . '/configure.ini');
    }
    if (file_exists($fallback_path . '/resources')) {
        // Forcing an update moves the current folder out of the way; only
        // attempt that when there is a current folder to move.
        if (in_array("resources", $force_folders) &&
            file_exists($cur_path . '/resources')) {
            rename($cur_path . '/resources',
                $cur_path . '/resources' . time() . 'old');
        }
        $this->updateLocaleSubFolder($cur_path . '/resources',
            $fallback_path . '/resources',
            array("js", "php", "ftr", "txt.gz"));
    }
    $n = array();
    $n[] = <<<EOT
; ***** BEGIN LICENSE BLOCK *****
; SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
; Copyright (C) 2009 - 2014 Chris Pollett chris@pollett.org
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
; ***** END LICENSE BLOCK *****
;
; configure.ini
;
; {$locale} configuration file
;
EOT;
    foreach ($general_ini as $general_name => $general_value) {
        if (is_array($general_value)) {
            $n[] = "[{$general_name}]";
            $old_configure[$general_name] =
                isset($old_configure[$general_name]) ?
                $old_configure[$general_name] : array();
            $fallback_configure[$general_name] =
                isset($fallback_configure[$general_name]) ?
                $fallback_configure[$general_name] : array();
            // $new_configure defaults to NULL and need not contain every
            // section, so look the section up defensively.
            $new_section = isset($new_configure[$general_name]) ?
                $new_configure[$general_name] : NULL;
            foreach ($general_value as $name => $value) {
                $n[] = $this->updateTranslation($new_section,
                    $old_configure[$general_name],
                    $fallback_configure[$general_name], $name, $value);
            }
        } else {
            $n[] = $this->updateTranslation($new_configure,
                $old_configure, $fallback_configure, $general_name);
        }
    }

    $n[] = ";\n; Strings to translate on various pages\n;";
    $n[] = "[strings]";
    // The strings sections do not change per line; resolve them once.
    $old_strings = isset($old_configure['strings']) ?
        $old_configure['strings'] : array();
    $fallback_strings = isset($fallback_configure['strings']) ?
        $fallback_configure['strings'] : array();
    $new_strings = isset($new_configure['strings']) ?
        $new_configure['strings'] : NULL;
    foreach ($strings as $string) {
        if (isset($string[0]) && $string[0] == ";") {
            // comment lines are copied through unchanged
            $n[] = $string;
        } else {
            $n[] = $this->updateTranslation($new_strings, $old_strings,
                $fallback_strings, $string);
        }
    }

    $out = implode("\n", $n);
    $out .= "\n";
    file_put_contents($cur_path . '/configure.ini', $out);
}