/**
  * Creates a text archive iterator with the given parameters.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate  over the pages of
  * @param string $iterate_dir folder of files to iterate over. If this
  *     iterator is used in a fetcher and the data is on a name server
  *     set this to false
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  * @param array $ini describes start_ and end_delimiter, file_extension,
  *     encoding, and compression method used for pages in this archive
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir, $ini = array())
 {
     // Record which bundle is being read and where results/checkpoints go.
     $this->iterate_timestamp = $iterate_timestamp;
     $this->iterate_dir = $iterate_dir;
     $this->result_timestamp = $result_timestamp;
     $this->result_dir = $result_dir;
     // The checkpoint files are written into the result folder, so make
     // sure it exists before anything else touches it.
     if (!file_exists($result_dir)) {
         mkdir($result_dir);
     }
     $this->partitions = array();
     // A false $iterate_dir marks a network/fetcher iterator: there is no
     // local folder to read a description or partition files from.
     if ($this->iterate_dir != false) {
         if ($ini == array()) {
             // no settings supplied by the caller, read them from the folder
             $ini = parse_ini_with_fallback($this->iterate_dir . "/arc_description.ini");
         }
         $file_suffix = $ini['file_extension'];
     }
     $this->setIniInfo($ini);
     if ($this->iterate_dir != false) {
         // Without any delimiter the pages in a file cannot be told apart.
         if ($this->start_delimiter == "" && $this->end_delimiter == "") {
             crawlLog("At least one of start or end delimiter must be set!!");
             exit;
         }
         // Every file in the folder with the configured extension is one
         // partition of the archive.
         $pattern = $this->iterate_dir . "/*." . $file_suffix;
         foreach (glob($pattern, GLOB_BRACE) as $partition_file) {
             $this->partitions[] = $partition_file;
         }
     }
     $this->num_partitions = count($this->partitions);
     $this->status_filename = $this->result_dir . "/iterate_status.txt";
     $this->buffer_filename = "{$this->result_dir}/buffer.txt";
     // Resume from a saved checkpoint when one exists, otherwise start over.
     if (!file_exists($this->status_filename)) {
         $this->reset();
     } else {
         $this->restoreCheckpoint();
     }
 }
Esempio n. 2
0
 /**
  * Creates a database archive iterator with the given parameters. This
  * kind of iterator is used to cycle through the results of a SQL query
  * to a database, so that the results might be indexed by Yioop.
  *
  * @param string $iterate_timestamp timestamp of the arc archive bundle to
  *     iterate  over the pages of
  * @param string $iterate_dir folder of files to iterate over
  * @param string $result_timestamp timestamp of the arc archive bundle
  *     results are being stored in
  * @param string $result_dir where to write last position checkpoints to
  */
 function __construct($iterate_timestamp, $iterate_dir, $result_timestamp, $result_dir)
 {
     $this->iterate_timestamp = $iterate_timestamp;
     $this->iterate_dir = $iterate_dir;
     $this->result_timestamp = $result_timestamp;
     $this->result_dir = $result_dir;
     $ini = parse_ini_with_fallback("{$this->iterate_dir}/arc_description.ini");
     // Start from the site-wide database constants; any of them can be
     // overridden by a lower-cased key of the same name in the ini file.
     $this->dbinfo = array("DBMS" => DBMS, "DB_HOST" => DB_HOST, "DB_NAME" => DB_NAME, "DB_USER" => DB_USER, "DB_PASSWORD" => DB_PASSWORD);
     foreach (array_keys($this->dbinfo) as $setting) {
         $override_key = strtolower($setting);
         if (isset($ini[$override_key])) {
             $this->dbinfo[$setting] = $ini[$override_key];
         }
     }
     // The manager class name is derived from the DBMS setting,
     // e.g. a DBMS of "foo" selects the class "FooManager".
     $manager_class = ucfirst($this->dbinfo["DBMS"]) . "Manager";
     $this->db = new $manager_class();
     $this->db->connect($this->dbinfo['DB_HOST'], $this->dbinfo['DB_USER'], $this->dbinfo['DB_PASSWORD'], $this->dbinfo['DB_NAME']);
     // The query is the one setting that has no default.
     if (!isset($ini['sql'])) {
         crawlLog("Database Archive Iterator needs a SQL statement to run");
         exit;
     }
     $this->sql = $ini['sql'];
     // Optional formatting settings, each with a built-in default.
     $this->field_value_separator = isset($ini['field_value_separator']) ? $ini['field_value_separator'] : "\n----\n";
     $this->column_separator = isset($ini['column_separator']) ? $ini['column_separator'] : "\n====\n";
     $this->encoding = isset($ini['encoding']) ? $ini['encoding'] : "UTF-8";
     if (!file_exists($result_dir)) {
         mkdir($result_dir);
     }
     // Resume from a saved checkpoint when one exists, otherwise start over.
     $status_file = $this->result_dir . "/iterate_status.txt";
     if (!file_exists($status_file)) {
         $this->reset();
     } else {
         $this->restoreCheckpoint();
     }
 }
Esempio n. 3
0
 /**
  * Gets a list of all index archives of crawls that have been conducted
  *
  * @param bool $return_arc_bundles whether index bundles used for indexing
  *     arc or other archive bundles should be included in the list
  * @param bool $return_recrawls whether index archive bundles generated as
  *     a result of recrawling should be included in the result
  * @param array $machine_urls an array of urls of yioop queue servers
  * @param bool $cache whether to try to get/set the data to a cache file
  *
  * @return array available IndexArchiveBundle directories and
  *     their meta information this meta information includes the time of
  *     the crawl, its description, the number of pages downloaded, and the
  *     number of partitions used in storing the inverted index
  */
 function getCrawlList($return_arc_bundles = false, $return_recrawls = false, $machine_urls = NULL, $cache = false)
 {
     // Case 1: queue servers other than a single localhost were given.
     // Ask those machines for their lists and aggregate the answers,
     // optionally going through a cache file that is honoured for 300s.
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         // $arg encodes the two boolean flags as 0..3 for the remote call
         // and also selects which cache file is used.
         $arg = $return_arc_bundles && $return_recrawls ? 3 : ($return_recrawls ? 2 : ($return_arc_bundles ? 1 : 0));
         $cache_file = CRAWL_DIR . "/cache/" . self::network_crawllist_base_name . "{$arg}.txt";
         if ($cache && file_exists($cache_file) && filemtime($cache_file) + 300 > time()) {
             return unserialize(file_get_contents($cache_file));
         }
         $list_strings = $this->execMachines("getCrawlList", $machine_urls, $arg);
         $list = $this->aggregateCrawlList($list_strings);
         if ($cache) {
             file_put_contents($cache_file, serialize($list));
         }
         return $list;
     }
     // Case 2: build the list from the index bundle folders on this machine.
     $list = array();
     $dirs = glob(CRAWL_DIR . '/cache/' . self::index_data_base_name . '*', GLOB_ONLYDIR);
     if (!is_array($dirs)) {
         // glob() returns false on error; treat that as "no bundles"
         $dirs = array();
     }
     foreach ($dirs as $dir) {
         $crawl = array();
         // the crawl time is whatever follows the base name in the folder name
         $pre_timestamp = strstr($dir, self::index_data_base_name);
         $crawl['CRAWL_TIME'] = substr($pre_timestamp, strlen(self::index_data_base_name));
         $info = IndexArchiveBundle::getArchiveInfo($dir);
         $index_info = @unserialize($info['DESCRIPTION']);
         $crawl['DESCRIPTION'] = "";
         $is_recrawl = isset($index_info[self::CRAWL_TYPE]) && $index_info[self::CRAWL_TYPE] == self::ARCHIVE_CRAWL;
         if ($is_recrawl) {
             if (!$return_recrawls) {
                 continue;
             }
             $crawl['DESCRIPTION'] = "RECRAWL::";
         }
         // A crawl is flagged resumable when some sub-folder of its schedule
         // folder holds an entry whose name starts with "At".
         $sched_path = CRAWL_DIR . '/schedules/' . self::schedule_data_base_name . $crawl['CRAWL_TIME'];
         $crawl['RESUMABLE'] = false;
         $sched_dir = is_dir($sched_path) ? opendir($sched_path) : false;
         if ($sched_dir !== false) {
             while (!$crawl['RESUMABLE'] && ($name = readdir($sched_dir)) !== false) {
                 $sub_path = "{$sched_path}/{$name}";
                 if ($name === '.' || $name === '..' || !is_dir($sub_path)) {
                     continue;
                 }
                 $sub_dir = opendir($sub_path);
                 if ($sub_dir === false) {
                     // unreadable sub-folder: skip it rather than fail
                     continue;
                 }
                 // Every entry is examined. (An earlier "$i < 5" cap on this
                 // loop never took effect because $i was never incremented,
                 // so dropping the counter keeps the results the same.)
                 while (($sub_name = readdir($sub_dir)) !== false) {
                     // strncmp is safe for one-character names, unlike
                     // indexing $sub_name[1] directly
                     if (strncmp($sub_name, 'At', 2) === 0) {
                         $crawl['RESUMABLE'] = true;
                         break;
                     }
                 }
                 // Both handles are now always closed; leaving the loops with
                 // "break 2" used to skip the closedir() calls.
                 closedir($sub_dir);
             }
             closedir($sched_dir);
         }
         if (isset($index_info['DESCRIPTION'])) {
             $crawl['DESCRIPTION'] .= $index_info['DESCRIPTION'];
         }
         // copy the bundle statistics, defaulting any missing one to 0
         foreach (array('VISITED_URLS_COUNT', 'COUNT', 'NUM_DOCS_PER_PARTITION', 'WRITE_PARTITION') as $field) {
             $crawl[$field] = isset($info[$field]) ? $info[$field] : 0;
         }
         $list[] = $crawl;
     }
     // Optionally append every folder under archives/ that carries an
     // arc_description.ini file.
     if ($return_arc_bundles) {
         $dirs = glob(CRAWL_DIR . '/archives/*', GLOB_ONLYDIR);
         if (!is_array($dirs)) {
             $dirs = array();
         }
         foreach ($dirs as $dir) {
             $ini_file = "{$dir}/arc_description.ini";
             if (!file_exists($ini_file)) {
                 continue;
             }
             $ini = parse_ini_with_fallback($ini_file);
             $crawl = array();
             // these folders have no timestamp, so a checksum of the path
             // is used as the identifier
             $crawl['CRAWL_TIME'] = crc32($dir);
             $crawl['DESCRIPTION'] = "ARCFILE::";
             $crawl['ARC_DIR'] = $dir;
             // tolerate ini files that omit arc_type or description
             $crawl['ARC_TYPE'] = isset($ini['arc_type']) ? $ini['arc_type'] : NULL;
             if (isset($ini['description'])) {
                 $crawl['DESCRIPTION'] .= $ini['description'];
             }
             $crawl['VISITED_URLS_COUNT'] = 0;
             $crawl['COUNT'] = 0;
             $crawl['NUM_DOCS_PER_PARTITION'] = 0;
             $crawl['WRITE_PARTITION'] = 0;
             $list[] = $crawl;
         }
     }
     return $list;
 }
Esempio n. 4
0
 /**
  * Given a folder name, determines the kind of bundle (if any) it holds.
  * It does this based on the expected location of the description.txt file,
  * or arc_description.ini (in the case of a non-yioop archive)
  *
  * @param string $archive_path the path to archive folder
  * @return mixed the archive bundle type: WebArchiveBundle,
  *     IndexArchiveBundle, or the arc_type listed in arc_description.ini;
  *     false if the kind cannot be determined
  */
 function getArchiveKind($archive_path)
 {
     // Yioop's own bundles are recognised by where description.txt sits.
     // The order matters: the top-level file is checked first.
     $marker_files = array(
         "WebArchiveBundle" => "{$archive_path}/description.txt",
         "IndexArchiveBundle" => "{$archive_path}/summaries/description.txt",
     );
     foreach ($marker_files as $bundle_kind => $marker_file) {
         if (file_exists($marker_file)) {
             return $bundle_kind;
         }
     }
     // Otherwise the folder may be a non-yioop archive which names its own
     // type in arc_description.ini.
     $ini_path = $archive_path . "/arc_description.ini";
     if (!file_exists($ini_path)) {
         return false;
     }
     $description = parse_ini_with_fallback($ini_path);
     return isset($description['arc_type']) ? $description['arc_type'] : false;
 }
Esempio n. 5
0
    /**
     * Updates the configure.ini file and static pages for a particular locale.
     *
     * The configure.ini has general information (at this point not really
     * being used) about all locales together with specific msg_id (identifiers
     * to be translated) and msg_string (translation) data. updateLocale takes
     * line data coming from the general.ini file, strings extracted from
     * documents that might need translation, the old configure.ini file
     * (this might have existing translations),  as well as new translation
     * data that might come from a localizer via a web form and
     * combines these to produce a new configure.ini file
     *
     * @param array $general_ini data from the general.ini file
     * @param array $strings line array data extracted from files in
     *     directories that have strings in need of translation
     * @param string $dir the directory of all the locales
     * @param string $locale the particular locale in $dir to update
     * @param array $new_configure translations of identifier strings from
     *     another source such as a localizer using a web form
     * @param array $force_folders which locale subfolders should be forced
     *     updated to the fallback dir's version
     */
    function updateLocale($general_ini, $strings, $dir, $locale, $new_configure = NULL, $force_folders = array())
    {
        // Translations currently stored for this locale, if any.
        $locale_path = $dir . '/' . $locale;
        $locale_ini = $locale_path . '/configure.ini';
        $old_configure = file_exists($locale_ini) ? parse_ini_with_fallback($locale_ini) : array();
        // Translations shipped in the fallback locale folder, if any.
        $fallback_path = FALLBACK_LOCALE_DIR . '/' . $locale;
        $fallback_ini = $fallback_path . '/configure.ini';
        $fallback_configure = file_exists($fallback_ini) ? parse_ini_with_fallback($fallback_ini) : array();
        // Bring the locale's resources folder up to date from the fallback
        // copy. When "resources" is forced, the current folder is first
        // moved aside under a timestamped name.
        $fallback_resources = $fallback_path . '/resources';
        if (file_exists($fallback_resources)) {
            $locale_resources = $locale_path . '/resources';
            if (in_array("resources", $force_folders)) {
                rename($locale_resources, $locale_resources . time() . 'old');
            }
            $this->updateLocaleSubFolder($locale_resources, $fallback_resources, array("js", "php", "ftr", "txt.gz"));
        }
        // $lines collects the output file piece by piece; the pieces are
        // joined with newlines at the end.
        $lines = array();
        $lines[] = <<<EOT
; ***** BEGIN LICENSE BLOCK *****
;  SeekQuarry/Yioop Open Source Pure PHP Search Engine, Crawler, and Indexer
;  Copyright (C) 2009 - 2014  Chris Pollett chris@pollett.org
;
;  This program is free software: you can redistribute it and/or modify
;  it under the terms of the GNU General Public License as published by
;  the Free Software Foundation, either version 3 of the License, or
;  (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program.  If not, see <http://www.gnu.org/licenses/>.
;  ***** END LICENSE BLOCK *****
;
; configure.ini
;
; {$locale} configuration file
;
EOT;
        // General settings: array values become [sections], scalar values
        // become top-level entries.
        foreach ($general_ini as $entry_name => $entry_value) {
            if (!is_array($entry_value)) {
                $lines[] = $this->updateTranslation($new_configure, $old_configure, $fallback_configure, $entry_name);
                continue;
            }
            $lines[] = "[{$entry_name}]";
            // make sure both lookup tables have an array for this section
            if (!isset($old_configure[$entry_name])) {
                $old_configure[$entry_name] = array();
            }
            if (!isset($fallback_configure[$entry_name])) {
                $fallback_configure[$entry_name] = array();
            }
            foreach ($entry_value as $msg_id => $msg_default) {
                $lines[] = $this->updateTranslation($new_configure[$entry_name], $old_configure[$entry_name], $fallback_configure[$entry_name], $msg_id, $msg_default);
            }
        }
        $lines[] = ";\n; Strings to translate on various pages\n;";
        $lines[] = "[strings]";
        foreach ($strings as $line) {
            // lines starting with ";" are comments and are copied verbatim
            if (isset($line[0]) && $line[0] == ";") {
                $lines[] = $line;
                continue;
            }
            if (!isset($old_configure['strings'])) {
                $old_configure['strings'] = array();
            }
            if (!isset($fallback_configure['strings'])) {
                $fallback_configure['strings'] = array();
            }
            $lines[] = $this->updateTranslation($new_configure['strings'], $old_configure['strings'], $fallback_configure['strings'], $line);
        }
        // Write the regenerated file, terminated by a final newline.
        file_put_contents($locale_path . '/configure.ini', implode("\n", $lines) . "\n");
    }