Example #1
 /**
  * Used to create an encoded string containing the meta info for
  * a fetcher schedule.
  *
  * @param int $schedule_time timestamp of the schedule
  * @return string base64 encoded meta info
  */
 function calculateScheduleMetaInfo($schedule_time)
 {
     //notice does not contain self::QUEUE_SERVERS
     $sites = array();
     $sites[self::CRAWL_TIME] = $this->crawl_time;
     $sites[self::SCHEDULE_TIME] = $schedule_time;
     $sites[self::CRAWL_ORDER] = $this->crawl_order;
     $sites[self::CRAWL_TYPE] = $this->crawl_type;
     $sites[self::CRAWL_INDEX] = $this->crawl_index;
     $sites[self::CACHE_PAGES] = $this->cache_pages;
     $sites[self::PAGE_RULES] = $this->page_rules;
     $sites[self::RESTRICT_SITES_BY_URL] = $this->restrict_sites_by_url;
     $sites[self::INDEXED_FILE_TYPES] = $this->indexed_file_types;
     $sites[self::ALLOWED_SITES] = $this->allowed_sites;
     $sites[self::DISALLOWED_SITES] = $this->disallowed_sites;
     $sites[self::INDEXING_PLUGINS] = $this->indexing_plugins;
     $sites[self::INDEXING_PLUGINS_DATA] = $this->indexing_plugins_data;
     $sites[self::VIDEO_SOURCES] = $this->video_sources;
     $sites[self::PAGE_RANGE_REQUEST] = $this->page_range_request;
     $sites[self::MAX_DESCRIPTION_LEN] = $this->max_description_len;
     $sites[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
     $sites[self::SITES] = array();
     return base64_encode(serialize($sites)) . "\n";
 }
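
A fetcher receiving this string can recover the schedule settings by reversing the two encoding steps. Below is a minimal sketch of that receiving side, assuming the same class constants are in scope; $meta_info_string is an illustrative variable name, not from the original code.

 // Hypothetical receiving side: undo the base64 + serialize encoding
 // produced by calculateScheduleMetaInfo()
 $sites = unserialize(base64_decode(trim($meta_info_string)));
 $schedule_time = $sites[self::SCHEDULE_TIME]; // timestamp of the schedule
 $crawl_order = $sites[self::CRAWL_ORDER];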
Example #2
 /**
  * Checks for the crawl time according to either crawl_status.txt or
  * network_status.txt, and presents it to the requesting fetcher, along
  * with a list of available queue servers.
  */
 function crawlTime()
 {
     $info = array();
     $info[self::STATUS] = self::CONTINUE_STATE;
     $view = "fetch";
     $cron_model = $this->model("cron");
     if (isset($_REQUEST['crawl_time'])) {
         $prev_crawl_time = substr($this->clean($_REQUEST['crawl_time'], 'int'), 0, TIMESTAMP_LEN);
     } else {
         $prev_crawl_time = 0;
     }
     $cron_time = $cron_model->getCronTime("fetcher_restart");
     $delta = time() - $cron_time;
     if ($delta > self::CRON_INTERVAL) {
         $cron_model->updateCronTime("fetcher_restart");
         $this->doCronTasks();
      } elseif ($delta == 0) {
          $cron_model->updateCronTime("fetcher_restart");
      }
     $local_filename = CRAWL_DIR . "/schedules/crawl_status.txt";
     $network_filename = CRAWL_DIR . "/schedules/network_status.txt";
     if (file_exists($local_filename)) {
         $crawl_status = unserialize(file_get_contents($local_filename));
         $crawl_time = isset($crawl_status["CRAWL_TIME"]) ? $crawl_status["CRAWL_TIME"] : 0;
      } elseif (file_exists($network_filename)) {
          $crawl_time = unserialize(file_get_contents($network_filename));
      } else {
          $crawl_time = 0;
      }
     $info[self::CRAWL_TIME] = $crawl_time;
     $status_filename = CRAWL_DIR . "/schedules/name_server_messages.txt";
     if ($crawl_time != 0 && file_exists($status_filename)) {
         $status = unserialize(file_get_contents($status_filename));
         if ($status[self::STATUS] == 'STOP_CRAWL') {
              $info[self::STATUS] = 'STOP_CRAWL'; // assignment, not comparison
             $info[self::CRAWL_TIME] = 0;
         }
         if ($status[self::STATUS] != 'STOP_CRAWL' && $crawl_time != $prev_crawl_time) {
              $to_copy_fields = array(self::ALLOWED_SITES, self::ARC_DIR,
                  self::ARC_TYPE, self::CRAWL_INDEX, self::CRAWL_TYPE,
                  self::DISALLOWED_SITES, self::INDEXED_FILE_TYPES,
                  self::PROXY_SERVERS, self::RESTRICT_SITES_BY_URL,
                  self::SUMMARIZER_OPTION, self::TOR_PROXY);
             foreach ($to_copy_fields as $field) {
                 if (isset($status[$field])) {
                     $info[$field] = $status[$field];
                 }
             }
             /*
               When initiating a new crawl AND there are active
               classifiers (an array of class labels), then augment the
               info with compressed, serialized versions of each active
               classifier so that each fetcher can reconstruct the same
               classifiers.
             */
             $classifier_array = array();
             if (isset($status[self::ACTIVE_CLASSIFIERS])) {
                 $classifier_array = array_merge($status[self::ACTIVE_CLASSIFIERS]);
                 $info[self::ACTIVE_CLASSIFIERS] = $status[self::ACTIVE_CLASSIFIERS];
             }
             if (isset($status[self::ACTIVE_RANKERS])) {
                 $classifier_array = array_merge($classifier_array, $status[self::ACTIVE_RANKERS]);
                 $info[self::ACTIVE_RANKERS] = $status[self::ACTIVE_RANKERS];
             }
             if ($classifier_array != array()) {
                 $classifiers_data = Classifier::loadClassifiersData($classifier_array);
                 $info[self::ACTIVE_CLASSIFIERS_DATA] = $classifiers_data;
             }
         }
     }
     $info[self::QUEUE_SERVERS] = $this->model("machine")->getQueueServerUrls();
     $info[self::SAVED_CRAWL_TIMES] = $this->getCrawlTimes();
     $info[self::POST_MAX_SIZE] = metricToInt(ini_get("post_max_size"));
     if (count($info[self::QUEUE_SERVERS]) == 0) {
         $info[self::QUEUE_SERVERS] = array(NAME_SERVER);
     }
     $data = array();
     $data['MESSAGE'] = serialize($info);
     $this->displayView($view, $data);
 }
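
On the fetcher side, the serialized MESSAGE rendered through the "fetch" view would be decoded along the following lines. This is only a sketch under the assumption that $response_body holds the body of the name server's response; it is not code from the original project.

 // Hypothetical client of crawlTime(): decode the serialized MESSAGE
 $info = unserialize($response_body);
 if ($info[self::STATUS] == 'STOP_CRAWL') {
     // the controller reset CRAWL_TIME to 0, so stop fetching
 }
 // never empty: crawlTime() falls back to array(NAME_SERVER)
 $queue_servers = $info[self::QUEUE_SERVERS];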
Example #3
 /**
  * Determines, based on its size, whether the current index shard should
  * continue to receive documents or whether a new generation should be
  * started. If so, a new generation is started, the old generation is
  * saved, and the dictionary of the old shard is copied to the bundle's
  * dictionary, with a log-merge performed if needed.
  *
  * @param int $add_num_docs number of docs in the shard about to be added
  * @param object $callback object with join function to be
  *     called if process is taking too long
  * @param bool $blocking whether an ongoing merge tiers operation is
  *      occurring; if so, do nothing and return -1
  * @return int the active generation after the check and possible change has
  *     been performed
  */
 function initGenerationToAdd($add_num_docs, $callback = NULL, $blocking = false)
 {
     $current_num_docs = $this->getActiveShard()->num_docs;
     crawlLog("Current index shard has " . $current_num_docs . " documents.");
     $memory_limit = metricToInt(ini_get("memory_limit"));
     crawlLog("Memory Indexer limit is " . $memory_limit . ". Usage is " . memory_get_usage());
      if ($current_num_docs + $add_num_docs > $this->num_docs_per_generation
          || 0.65 * $memory_limit < memory_get_usage()) {
         if ($blocking == true) {
             return -1;
         }
         crawlLog("Switching Index Shard...");
         $switch_time = microtime();
         // Save current shard dictionary to main dictionary
         $this->forceSave();
         $this->addAdvanceGeneration($callback);
         crawlLog("Switch Index Shard time:" . changeInMicrotime($switch_time));
     }
     return $this->generation_info['ACTIVE'];
 }
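
A caller about to add a batch of documents might use the return value as below. This is a hedged sketch; $index and $doc_batch are illustrative stand-ins, not names from the original source.

 // Hypothetical caller: pick the generation a batch of docs should go to.
 // With $blocking true this returns -1 instead of switching shards.
 $generation = $index->initGenerationToAdd(count($doc_batch), NULL, true);
 if ($generation != -1) {
     // safe to add $doc_batch to the active shard
 }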
Example #4
 /**
  * Function to check if memory for this fetcher instance is getting low
  * relative to what the system will allow.
  *
  * @return bool whether available memory is getting low
  */
 function exceedMemoryThreshold()
 {
     return memory_get_usage() > metricToInt(ini_get("memory_limit")) * 0.7;
 }
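
All of these examples lean on metricToInt() to turn php.ini shorthand values such as "128M" into byte counts. The helper below is not the project's implementation, just a sketch of the conversion the examples appear to assume.

 // Hypothetical stand-in for metricToInt(): converts ini_get() strings
 // like "512K", "128M", or "1G" into an integer number of bytes
 function metricToIntSketch($metric_string)
 {
     $value = intval($metric_string);
     switch (strtoupper(substr(trim($metric_string), -1))) {
         case 'G': $value *= 1024; // fall through
         case 'M': $value *= 1024; // fall through
         case 'K': $value *= 1024;
     }
     return $value;
 }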
Example #5
    /**
     * Used to draw the form that lets someone edit a wiki page
     *
     * @param array $data fields containing data about the page being
     * edited. In particular, PAGE contains the raw page data
     */
    function renderEditPageForm($data)
    {
        $base_url = "?c=" . $data['CONTROLLER'] . "&amp;a=wiki&amp;" . CSRF_TOKEN . '=' . $data[CSRF_TOKEN] . "&amp;group_id=" . $data['GROUP']['GROUP_ID'];
        $simple_base_url = str_replace("&amp;", "&", $base_url);
        $append = "";
        if (isset($data['OTHER_BACK_URL'])) {
            $append = $data['OTHER_BACK_URL'];
        }
        ?>
        <div class="float-opposite" style="position:relative; top:35px;">
        [<a href="<?php 
        e($base_url . $append);
        ?>
&amp;<?php 
        e('&amp;arg=history&amp;page_id=' . $data['PAGE_ID']);
        ?>
"
        ><?php 
        e(tl('wiki_element_history'));
        ?>
</a>]
        [<a href="?c=<?php 
        e($data['CONTROLLER']);
        ?>
&amp;a=groupFeeds&amp;<?php 
        e(CSRF_TOKEN . '=' . $data[CSRF_TOKEN] . '&amp;just_thread=' . $data['DISCUSS_THREAD']);
        ?>
" ><?php 
        e(tl('wiki_element_discuss'));
        ?>
</a>]
        </div>
        <form id="editpageForm" method="post"
            onsubmit="elt('caret-pos').value =
            (elt('wiki-page').selectionStart) ?
            elt('wiki-page').selectionStart : 0;
            elt('scroll-top').value= (elt('wiki-page').scrollTop) ?
            elt('wiki-page').scrollTop : 0;" >
            <input type="hidden" name="c" value="<?php 
        e($data['CONTROLLER']);
        ?>
" />
            <input type="hidden" name="<?php 
        e(CSRF_TOKEN);
        ?>
" value="<?php 
        e($data[CSRF_TOKEN]);
        ?>
" />
            <input type="hidden" name="a" value="wiki" />
            <input type="hidden" name="arg" value="edit" />
            <?php
        if (isset($data['BACK_PARAMS'])) {
            foreach ($data["BACK_PARAMS"] as $back_param_key =>
                $back_param_value) {
                e('<input type="hidden" name="' . $back_param_key .
                    '" value="' . $back_param_value . '" />');
            }
        }
        ?>
            <input type="hidden" name="group_id" value="<?php
                e($data['GROUP']['GROUP_ID']); ?>" />
            <input type="hidden" name="page_name" value="<?php
                e($data['PAGE_NAME']); ?>" />
            <input type="hidden" name="caret" id="caret-pos"/>
            <input type="hidden" name="scroll_top" id="scroll-top"/>
            <input type="hidden" id="p-settings" name="settings" value="<?php
                e($data['settings']); ?>"/>
            <div class="top-margin">
                <b><?php e(tl('wiki_element_locale_name',
                    $data['CURRENT_LOCALE_TAG'])); ?></b><br />
                <label for="page-data"><b><?php
        $human_page_name = str_replace("_", " ", $data['PAGE_NAME']);
        e(tl('wiki_element_page', $human_page_name));
                ?></b></label> <span id="toggle-settings"
                >[<a href="javascript:toggleSettings()"><?php
                    e(tl('configure_element_toggle_page_settings')); ?></a>]</span>
            </div>
            <div id='page-settings'>
            <div class="top-margin">
            <label for="page-type"><b><?php e(tl('wiki_element_page_type'));
                ?></b></label><?php
        $this->view->helper("options")->render("page-type", "page_type",
            $data['page_types'], $data['current_page_type']);
        ?>
            </div>
            <div id='alias-type'>
            <div class="top-margin">
            <label for="page-alias"><b><?php e(tl('wiki_element_page_alias'));
                ?></b></label><input type="text" id='page-alias'
                name="page_alias" value="<?php e($data['page_alias']); ?>"
                maxlength="<?php e(SHORT_TITLE_LEN); ?>" class="wide-field"/>
            </div>
            </div>
            <div id='non-alias-type'>
            <div class="top-margin">
            <label for="page-border"><b><?php e(tl('wiki_element_page_border'));
                ?></b></label><?php
        $this->view->helper("options")->render("page-border", "page_border",
            $data['page_borders'], $data['page_border']);
        ?>
            </div>
            <div class="top-margin">
            <label for="page-toc"><b><?php
                e(tl('wiki_element_table_of_contents'));
                ?></b></label><input type="checkbox" name="toc" value="true"
                <?php
        $checked = isset($data['toc']) && $data['toc'] ?
            'checked="checked"' : '';
        e($checked);
                ?> id='page-toc' />
            </div>
            <div class="top-margin">
            <label for="page-title"><b><?php e(tl('wiki_element_title'));
                ?></b></label><input type="text" id='page-title'
                name="title" value="<?php e($data['title']); ?>"
                maxlength="<?php e(SHORT_TITLE_LEN); ?>" class="wide-field"/>
            </div>
            <div class="top-margin">
            <label for="meta-author"><b><?php 
        e(tl('wiki_element_meta_author'));
        ?>
</b></label><input type="text" id='meta-author'
                name="author" value="<?php 
        e($data['author']);
        ?>
"
                maxlength="<?php 
        e(LONG_NAME_LEN);
        ?>
" class="wide-field"/>
            </div>
            <div class="top-margin">
            <label for="meta-robots"><b><?php 
        e(tl('wiki_element_meta_robots'));
        ?>
</b></label><input type="text" id='meta-robots'
                name="robots" value="<?php 
        e($data['robots']);
        ?>
"
                maxlength="<?php 
        e(LONG_NAME_LEN);
        ?>
" class="wide-field"/>
            </div>
            <div class="top-margin">
            <label for="meta-description"><b><?php 
        e(tl('wiki_element_meta_description'));
        ?>
</b></label>
            </div>
            <textarea id="meta-description" class="short-text-area"
                name="description" data-buttons='none'><?php 
        e($data['description']);
        ?>
</textarea>
            <div class="top-margin">
            <label for="page-header"><b><?php 
        e(tl('wiki_element_page_header'));
        ?>
</b></label><input type="text" id='page-header'
                name="page_header" value="<?php 
        e($data['page_header']);
        ?>
"
                maxlength="<?php 
        e(SHORT_TITLE_LEN);
        ?>
" class="wide-field"/>
            </div>
            <div class="top-margin">
            <label for="page-footer"><b><?php 
        e(tl('wiki_element_page_footer'));
        ?>
</b></label><input type="text" id='page-footer'
                name="page_footer" value="<?php 
        e($data['page_footer']);
        ?>
"
                maxlength="<?php 
        e(SHORT_TITLE_LEN);
        ?>
" class="wide-field"/>
            </div>
            </div>
            </div>
            <div id='page-container'><textarea id="wiki-page"
                class="tall-text-area" name="page"
                <?php
        if (!isset($data['page_type']) ||
            $data['page_type'] != 'presentation') {
            $data_buttons = 'all,!wikibtn-slide';
        } else {
            $data_buttons = 'all';
        }
                ?>
                data-buttons='<?php e($data_buttons); ?>' ><?php
                e($data['PAGE']); ?></textarea>
            <div class="green"><?php e(tl('wiki_element_archive_info'));
                ?></div>
            <div class="top-margin">
            <label for="edit-reason"><b><?php e(tl('wiki_element_edit_reason'));
                ?></b></label><input type="text" id='edit-reason'
                name="edit_reason" value=""
                maxlength="<?php e(SHORT_TITLE_LEN); ?>"
                class="wide-field"/></div>
            </div>
            <div id="save-container" class="top-margin center">
            <button class="button-box" type="submit"><?php 
        e(tl('wiki_element_savebutton'));
        ?>
</button>
            </div>
        </form>
        <div class="top-margin" id="media-list-page">
        <h2><?php 
        e(tl('wiki_element_media_list'));
        ?>
</h2>
        <p><?php 
        e(tl('wiki_element_ml_description'));
        ?>
</p>
        </div>
        <form id="resourceUploadForm" method="post"
            enctype="multipart/form-data">
        <input type="hidden" name="c" value="<?php 
        e($data['CONTROLLER']);
        ?>
" />
        <input type="hidden" name="<?php 
        e(CSRF_TOKEN);
        ?>
" value="<?php 
        e($data[CSRF_TOKEN]);
        ?>
" />
        <input type="hidden" name="a" value="wiki" />
        <input type="hidden" name="arg" value="edit" />
        <?php
        if (isset($data['BACK_PARAMS'])) {
            foreach ($data["BACK_PARAMS"] as $back_param_key =>
                $back_param_value) {
                e('<input type="hidden" name="' . $back_param_key .
                    '" value="' . $back_param_value . '" />');
            }
        }
        ?>
        <input type="hidden" name="group_id" value="<?php
            e($data['GROUP']['GROUP_ID']); ?>" />
        <input type="hidden" name="page_name" value="<?php
            e($data['PAGE_NAME']); ?>" />
        <input type="hidden" name="settings" value="<?php
            e($data['settings']); ?>" />
        <div id="page-resources">
        <h3><?php 
        e(tl('wiki_view_page_resources'));
        ?>
</h3>
        <p><?php 
        e(tl('wiki_view_resources_info'));
        ?>
</p>
        <input type="file" class="slight-pad wide-field"
            id='page-resource' name='page_resource' />
            <button class="button-box" type="submit"><?php 
        e(tl('wiki_view_upload'));
        ?>
</button></div>
        </form>
        <h3 id="progress-bar" class="red indent"></h3>
        <?php $this->renderResources($data, false); ?>
        <script type="text/javascript">
        function addToPage(resource_name)
        {
            wikify("((resource:", "|<?php
                e(tl('wiki_element_resource_description')); ?>))",
                resource_name, "wiki-page");
        }
        function checkUploadResource()
        {
            var max_resource_size = <?php
                e(metricToInt(ini_get('upload_max_filesize'))); ?>;
            var page_resource = elt('page-resource').files[0];
            if (page_resource.size > max_resource_size) {
                doMessage('<h1 class=\"red\" ><?php
                    e(tl("wiki_element_file_too_big",
                        metricToInt(ini_get('upload_max_filesize')))); ?></h1>');
                return false;
            }
            return true;
        }
        function renameResource(old_name, id)
        {
            var name_elt = elt("resource-" + id);
            var new_name = "";
            if (name_elt) {
                new_name = name_elt.value;
            }
            if (!name_elt || !new_name) {
                doMessage('<h1 class=\"red\" ><?php
                    e(tl("wiki_element_rename_failed")); ?></h1>');
                return;
            }
            var location = "<?php
                e("{$simple_base_url}&arg=edit&page_name=" .
                    $data['PAGE_NAME']); ?>" +
                "&new_resource_name=" + new_name +
                "&old_resource_name=" + old_name;
            window.location = location;
        }
        function uploadForm(event)
        {
            if(!checkUploadResource()) {
                event.preventDefault();
                return;
            }
            var resource_form = elt('resourceUploadForm');
            var form_data = new FormData(resource_form);
            var request = new XMLHttpRequest();
            request.upload.addEventListener("progress", uploadProgress, false);
            request.addEventListener("load", uploadComplete, false);
            request.addEventListener("error", uploadFailed, false);
            request.addEventListener("abort", uploadCanceled, false);
            request.open("post", "./");
            request.send(form_data);
            event.preventDefault();
        }
        function uploadProgress(event)
        {
            var progress = elt('progress-bar');
            if (event.lengthComputable) {
                var percent_complete =
                    Math.round(event.loaded * 100 / event.total);
                progress.innerHTML = '<?php
                    e(tl('wiki_element_upload_progress')); ?>' +
                    percent_complete.toString() + '%';
            } else {
                progress.innerHTML = '<?php
                    e(tl("wiki_element_progress_meter_disabled")); ?>';
            }
        }
        function uploadComplete(event)
        {
            /* This event is raised when the server sends back a response */
            document.open();
            document.write(event.target.responseText);
            document.close();
        }

        function uploadFailed(event)
        {
            doMessage('<h1 class=\"red\" ><?php
                e(tl("wiki_element_upload_error")); ?></h1>');
        }

        function uploadCanceled(event)
        {
            doMessage('<h1 class=\"red\" ><?php
                e(tl("wiki_element_upload_cancelled")); ?></h1>');
        }
        var resource_form = document.getElementById('resourceUploadForm');
        resource_form.addEventListener('submit', uploadForm);
        </script>
        <?php 
    }
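
A view that has access to this element might render the form as follows; the element() accessor and the "wiki" name are assumptions for illustration, not confirmed by the code above.

 // Hypothetical call site inside a view's renderView($data) method
 $this->element("wiki")->renderEditPageForm($data);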
Example #6
 /**
  * Make multi_curl requests for an array of sites with urls or onion urls
  *
  * @param array $sites  an array containing urls of pages to request
  * @param bool $timer  flag, true means print timing statistics to log
  * @param int $page_range_request maximum number of bytes to download/page
  *     0 means download all
  * @param string $temp_dir folder to store temporary ip header info
  * @param string $key  the component of $sites[$i] that has the value of
  *     a url to get; defaults to CrawlConstants::URL
  * @param string $value component of $sites[$i] in which to store the
  *     page that was gotten
  * @param bool $minimal if true do a faster request of pages by not
  *     doing things like extracting the HTTP headers sent, etc.
  * @param array $post_data data to be POST'd to each site
  * @param bool $follow whether to follow redirects or not
  * @param string $tor_proxy url of a proxy that knows how to download
  *     .onion urls
  * @param array $proxy_servers if not array(), then an array of proxy
  *     servers to use rather than downloading web pages directly from
  *     the current machine
  *
  * @return array an updated array with the contents of those pages
  */
 static function getPages($sites, $timer = false,
     $page_range_request = PAGE_RANGE_REQUEST, $temp_dir = NULL,
     $key = CrawlConstants::URL, $value = CrawlConstants::PAGE,
     $minimal = false, $post_data = NULL, $follow = false,
     $tor_proxy = "", $proxy_servers = array())
 {
     $agent_handler = curl_multi_init();
     $active = NULL;
     $start_time = microtime();
     if (!$minimal && $temp_dir == NULL) {
         $temp_dir = CRAWL_DIR . "/temp";
         if (!file_exists($temp_dir)) {
             mkdir($temp_dir);
         }
     }
     //Set-up requests
     $num_sites = count($sites);
     for ($i = 0; $i < $num_sites; $i++) {
         $is_gopher = false;
         $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
         if (isset($sites[$i][$key])) {
             list($sites[$i][$key], $url, $headers) = self::prepareUrlHeaders($sites[$i][$key], $minimal, $proxy_servers);
             if ($headers == "gopher") {
                 $is_gopher = true;
                 $sites[$i][CrawlConstants::IS_GOPHER_URL] = $is_gopher;
                 $headers = array();
             }
             $sites[$i][0] = curl_init();
             if (!$minimal) {
                 $ip_holder[$i] = fopen("{$temp_dir}/tmp{$i}.txt", 'w+');
                 curl_setopt($sites[$i][0], CURLOPT_STDERR, $ip_holder[$i]);
                 curl_setopt($sites[$i][0], CURLOPT_VERBOSE, true);
             }
             curl_setopt($sites[$i][0], CURLOPT_USERAGENT, USER_AGENT);
             curl_setopt($sites[$i][0], CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
             curl_setopt($sites[$i][0], CURLOPT_URL, $url);
              if (strcmp(substr($url, -10), "robots.txt") == 0) {
                  // Wikipedia redirects its robots page, so force following
                  // redirects for robots.txt pages
                  $sites[$i]['ROBOT'] = true;
                  $follow = true;
              }
             curl_setopt($sites[$i][0], CURLOPT_FOLLOWLOCATION, $follow);
             curl_setopt($sites[$i][0], CURLOPT_SSL_VERIFYHOST, 0);
             curl_setopt($sites[$i][0], CURLOPT_AUTOREFERER, true);
             curl_setopt($sites[$i][0], CURLOPT_RETURNTRANSFER, true);
             curl_setopt($sites[$i][0], CURLOPT_CONNECTTIMEOUT, PAGE_TIMEOUT);
             curl_setopt($sites[$i][0], CURLOPT_TIMEOUT, PAGE_TIMEOUT);
             if (stripos($url, '.onion') !== false && $tor_proxy != "") {
                 curl_setopt($sites[$i][0], CURLOPT_PROXY, $tor_proxy);
                 //CURLPROXY_SOCKS5_HOSTNAME = 7
                 curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, 7);
                 if ($timer) {
                     crawlLog("Using Tor proxy for {$url}..");
                 }
              } elseif ($proxy_servers != array() && !$is_gopher) {
                  $select_proxy = rand(0, count($proxy_servers) - 1);
                  $proxy_server = $proxy_servers[$select_proxy];
                  $proxy_parts = explode(":", $proxy_server);
                  $proxy_ip = $proxy_parts[0];
                  if (!isset($proxy_parts[2]) ||
                      strtolower($proxy_parts[2]) == 'http') {
                      $proxy_type = CURLPROXY_HTTP;
                  } elseif (strtolower($proxy_parts[2]) == 'socks5') {
                      $proxy_type = CURLPROXY_SOCKS5;
                  } else {
                      $proxy_type = $proxy_parts[2];
                  }
                  if (isset($proxy_parts[1])) {
                      $proxy_port = $proxy_parts[1];
                  } else {
                      $proxy_port = "80";
                  }
                  curl_setopt($sites[$i][0], CURLOPT_PROXY,
                      "{$proxy_ip}:{$proxy_port}");
                  curl_setopt($sites[$i][0], CURLOPT_PROXYTYPE, $proxy_type);
                  if ($timer) {
                      crawlLog("Selecting proxy {$select_proxy} for {$url}");
                  }
              }
             if (!$minimal) {
                 curl_setopt($sites[$i][0], CURLOPT_HEADER, true);
             }
             //make lighttpd happier
             if (!$is_gopher) {
                 curl_setopt($sites[$i][0], CURLOPT_HTTPHEADER, $headers);
             }
             curl_setopt($sites[$i][0], CURLOPT_ENCODING, "");
             // ^ need to set for sites like att that use gzip
             if ($page_range_request > 0) {
                 curl_setopt($sites[$i][0], CURLOPT_RANGE, "0-" . $page_range_request);
             }
             if ($post_data != NULL) {
                 curl_setopt($sites[$i][0], CURLOPT_POST, true);
                 curl_setopt($sites[$i][0], CURLOPT_POSTFIELDS, $post_data[$i]);
             }
             curl_multi_add_handle($agent_handler, $sites[$i][0]);
         }
     }
     if ($timer) {
         crawlLog("  Init Get Pages " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     $start = time();
     //Wait for responses
     $running = NULL;
     $memory_limit = metricToInt(ini_get("memory_limit")) * 0.7;
     do {
         $mrc = curl_multi_exec($agent_handler, $running);
         $ready = curl_multi_select($agent_handler, 0.005);
     } while (memory_get_usage() < $memory_limit && time() - $start < PAGE_TIMEOUT && $running > 0);
     if (time() - $start > PAGE_TIMEOUT && $timer) {
         crawlLog("  TIMED OUT!!!");
     }
     if ($timer) {
         crawlLog("  Page Request time " . changeInMicrotime($start_time));
     }
     $start_time = microtime();
     //Process returned pages
     for ($i = 0; $i < $num_sites; $i++) {
         if ($timer) {
             crawlTimeoutLog("fetch_url initial processing of page %s of %s", $i, $num_sites);
         }
         if (!$minimal && isset($ip_holder[$i])) {
             rewind($ip_holder[$i]);
             $header = fread($ip_holder[$i], 8192);
             $ip_addresses = self::getCurlIp($header);
             fclose($ip_holder[$i]);
         }
         $is_gopher = false;
         if (isset($sites[$i][0]) && $sites[$i][0]) {
             // Get Data and Message Code
             $content = @curl_multi_getcontent($sites[$i][0]);
             $is_gopher = $sites[$i][CrawlConstants::IS_GOPHER_URL];
             /*
                If the Transfer-encoding was chunked then the Range header
                we sent was ignored. So we manually truncate the data
                here
             */
             if ($page_range_request > 0) {
                 $content = substr($content, 0, $page_range_request);
             }
             if (isset($content) && !$minimal && !$is_gopher) {
                 $site = self::parseHeaderPage($content, $value);
                 $sites[$i] = array_merge($sites[$i], $site);
                 if (isset($header)) {
                     $header = substr($header, 0, strpos($header, "\r\n\r\n") + 4);
                 } else {
                     $header = "";
                 }
                 $sites[$i][CrawlConstants::HEADER] = $header . $sites[$i][CrawlConstants::HEADER];
                 unset($header);
              } elseif (isset($content) && !$minimal && $is_gopher) {
                  $sites[$i][CrawlConstants::HEADER] = $header;
                  $sites[$i][$value] = $content;
                  unset($header);
              } else {
                  $sites[$i][$value] = $content;
              }
             if (!$minimal) {
                 $sites[$i][self::SIZE] = @curl_getinfo($sites[$i][0], CURLINFO_SIZE_DOWNLOAD);
                 $sites[$i][self::DNS_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_NAMELOOKUP_TIME);
                 $sites[$i][self::TOTAL_TIME] = @curl_getinfo($sites[$i][0], CURLINFO_TOTAL_TIME);
                 $sites[$i][self::HTTP_CODE] = curl_getinfo($sites[$i][0], CURLINFO_HTTP_CODE);
                  if (!$sites[$i][self::HTTP_CODE] && !$is_gopher) {
                      $sites[$i][self::HTTP_CODE] = curl_error($sites[$i][0]);
                  } elseif ($is_gopher) {
                      // gopher has no HTTP status codes, so treat any
                      // completed request as a 200; the unguarded else here
                      // would have overwritten real codes such as 404
                      $sites[$i][self::HTTP_CODE] = 200;
                  }
                 if ($ip_addresses) {
                     $sites[$i][self::IP_ADDRESSES] = $ip_addresses;
                 } else {
                     $sites[$i][self::IP_ADDRESSES] = array("0.0.0.0");
                 }
                 //Get Time, Mime type and Character encoding
                 $sites[$i][self::TIMESTAMP] = time();
                  if ($is_gopher) {
                      $path = UrlParser::getPath($sites[$i][self::URL]);
                      $filename = UrlParser::getDocumentFilename(
                          $sites[$i][self::URL]);
                      // the gopher item type is the character right after
                      // the leading slash of the path
                      $gopher_type = isset($path[1]) ? $path[1] : 1;
                      if ($gopher_type == 1) {
                          $sites[$i][self::TYPE] = "text/gopher";
                      } elseif (in_array($gopher_type, array(0, 3, 6))) {
                          $sites[$i][self::TYPE] = "text/plain";
                          if ($gopher_type == 6) {
                              $sites[$i][$value] = convert_uudecode($content);
                          }
                      } elseif ($gopher_type == 'h') {
                          $sites[$i][self::TYPE] = "text/html";
                      } elseif ($gopher_type == 'g') {
                          $sites[$i][self::TYPE] = "image/gif";
                      }
                      $path_info = pathinfo($filename);
                      if (!isset($sites[$i][self::TYPE]) &&
                          isset($path_info['extension'])) {
                          $sites[$i][self::TYPE] =
                              UrlParser::guessMimeTypeFromFileName($filename);
                      } elseif (!isset($sites[$i][self::TYPE])) {
                          $sites[$i][self::TYPE] = "unknown";
                      }
                  } else {
                      $type_parts = explode(";",
                          curl_getinfo($sites[$i][0], CURLINFO_CONTENT_TYPE));
                      $sites[$i][self::TYPE] = strtolower(trim($type_parts[0]));
                  }
             }
             //curl_multi_remove_handle($agent_handler, $sites[$i][0]);
             curl_close($sites[$i][0]);
              if (isset($sites[$i]['ROBOT']) && $sites[$i]['ROBOT']) {
                  if (isset($sites[$i][self::TYPE]) &&
                      $sites[$i][self::TYPE] != "text/plain" &&
                      isset($sites[$i][CrawlConstants::LOCATION]) &&
                      count($sites[$i][CrawlConstants::LOCATION]) > 0) {
                      /* A robots.txt request that redirected and didn't come
                         back as plain text suggests server misconfiguration,
                         so substitute a disallow-everything robots.txt
                       */
                      $original_code = $sites[$i][self::HTTP_CODE];
                      $sites[$i][self::TYPE] = "text/plain";
                      $sites[$i][self::HTTP_CODE] = "200";
                      $tmp = wordwrap($sites[$i][$value], 80);
                      $tmp_parts = explode("\n", $tmp);
                      $tmp = "# Suspect server misconfiguration\n";
                      $tmp .= "# Assume shouldn't crawl this site.\n";
                      $tmp .= "# Pretending got following robots.txt.\n";
                      $tmp .= "User-agent: *\n";
                      $tmp .= "Disallow: /\n";
                      $tmp .= "# Original error code: " . $original_code . "\n";
                      $tmp .= "# Original content:\n";
                      foreach ($tmp_parts as $part) {
                          $tmp .= "#" . $part . "\n";
                      }
                      $sites[$i][$value] = $tmp;
                      unset($sites[$i][CrawlConstants::LOCATION]);
                  }
              }
         }
         //end big if
     }
     //end for
     if ($timer) {
         crawlLog("  Get Page Content time " . changeInMicrotime($start_time));
     }
     curl_multi_close($agent_handler);
     return $sites;
 }
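
A minimal sketch of calling getPages() for a couple of URLs follows. It assumes FetchUrl is the class containing this method and that its self:: constants come from CrawlConstants; the echo-based inspection is illustrative only.

 // Hypothetical call: download two pages and inspect the results
 $sites = array(
     array(CrawlConstants::URL => "http://www.example.com/"),
     array(CrawlConstants::URL => "http://www.example.org/robots.txt"),
 );
 $sites = FetchUrl::getPages($sites, true); // $timer = true logs timings
 foreach ($sites as $site) {
     echo $site[CrawlConstants::URL] . " returned HTTP code " .
         $site[CrawlConstants::HTTP_CODE] . "\n";
 }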