Example #1
 /**
  * Computes, based on the request, the folder that should be used to
  * find a file during a resource get request. It also checks whether the
  * user has access to the requested folder.
  *
  * @return mixed either a string with the folder name in it or false if
  *      the user does not have access or the folder does not exist
  */
 function getBaseFolder()
 {
     $folder = $this->clean($_REQUEST['f'], 'string');
     $base_dir = APP_DIR . "/{$folder}";
     $add_to_path = false;
     if (isset($_REQUEST['s']) && $folder == "resources") {
         // handle sub-folders of resource (must be numeric)
         $subfolder = $this->clean($_REQUEST['s'], "hash");
         $prefix_folder = substr($subfolder, 0, 3);
         $add_to_path = true;
     } else {
         if (isset($_REQUEST['g'])) {
             $user_id = isset($_SESSION['USER_ID']) ? $_SESSION['USER_ID'] : PUBLIC_USER_ID;
             $page_id = "";
             if (isset($_REQUEST['p'])) {
                 $page_id = $this->clean($_REQUEST['p'], 'int');
             }
             $group_id = $this->clean($_REQUEST['g'], "int");
             $group_model = $this->model('group');
             $group = $group_model->getGroupById($group_id, $user_id);
             if (!$group) {
                 return false;
             }
             $hash_word = isset($_REQUEST['t']) ? 'thumb' : 'group';
             $subfolder = crawlHash($hash_word . $group_id . $page_id . AUTH_KEY);
             $prefix_folder = substr($subfolder, 0, 3);
             $add_to_path = true;
         }
     }
     if ($add_to_path) {
         $base_dir .= "/{$prefix_folder}/{$subfolder}";
     }
     return $base_dir;
 }
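A minimal usage sketch (the request values are hypothetical; assumes this method lives on a controller providing clean() and model(), with APP_DIR, PUBLIC_USER_ID, and AUTH_KEY defined by the Yioop bootstrap):
 // Hypothetical: request a thumb resource for group 42, wiki page 7
 $_REQUEST['f'] = "resources";
 $_REQUEST['g'] = "42";
 $_REQUEST['p'] = "7";
 $_REQUEST['t'] = "true";
 $folder = $controller->getBaseFolder();
 if ($folder === false) {
     // user lacks access or the folder does not exist
 }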
Example #2
 /**
  * Tests how fast insertions and deletions can be done
  */
 function timingTestCase()
 {
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->insert(crawlHash("hi{$i}", true), "0000" . packInt($i));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Insert 10000 into table of size 20000 takes less than 2 seconds");
     $start_time = microtime();
     for ($i = 0; $i < 10000; $i++) {
         $this->test_objects['FILE1']->delete(crawlHash("hi{$i}", true));
     }
     $this->assertTrue(changeInMicrotime($start_time) < 2, "Delete 10000 from table of size 20000 takes less than 2 seconds");
 }
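These assertions rely on Yioop's timing helpers; a plausible sketch of changeInMicrotime(), assuming the "microsec sec" string format that microtime() returns when called without arguments:
 // Returns elapsed seconds (as a float) between two microtime() strings
 function changeInMicrotime($start, $end = null)
 {
     if (!$end) {
         $end = microtime();
     }
     list($start_microseconds, $start_seconds) = explode(" ", $start);
     list($end_microseconds, $end_seconds) = explode(" ", $end);
     return ($end_seconds - $start_seconds) + ($end_microseconds - $start_microseconds);
 }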
Example #3
 /**
  * Implements post processing of recipes: recipes are extracted,
  * ingredients are scrubbed, and the recipes are clustered. The
  * clustered recipes are added back to the index.
  *
  * @param string $index_name index name of the current crawl
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $num_results = 0;
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
     // only cluster if it would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         $recipes_ingredients = array();
         $count = count($recipes);
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $main_ingredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($main_ingredient) != 0) {
                         $recipe[1][$index] = $main_ingredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper',
             'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic',
             'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla',
             'pepper', 'bread', 'sugar', 'vanillaextract', 'celery',
             'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger',
             'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo',
             'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine',
             'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies',
             'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract',
             'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             $count = count($cluster);
             for ($i = 0; $i < $count - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         if (isset($recipe_sites)) {
             crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
             $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         }
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $recipe = $site[self::TITLE];
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
Example #4
 /**
  * Gets the next doc from the iterator
  * @param bool $no_process do not do any processing on page data
  * @return array associative array for doc or string if no_process true
  */
 function nextPage($no_process = false)
 {
     static $minimal_regexes = false;
     static $first_call = true;
     if ($first_call) {
         $this->initializeSubstitutions($this->header['base_address']);
     }
     $page_info = $this->getNextTagData("page");
     if ($no_process) {
         return $page_info;
     }
     $dom = new DOMDocument();
     @$dom->loadXML($page_info);
     $site = array();
     $pre_url = $this->getTextContent($dom, "/page/title");
     $pre_url = str_replace(" ", "_", $pre_url);
     $site[self::URL] = $this->header['base_address'] . $pre_url;
     $site[self::IP_ADDRESSES] = array($this->header['ip_address']);
     $pre_timestamp = $this->getTextContent($dom, "/page/revision/timestamp");
     $site[self::MODIFIED] = date("U", strtotime($pre_timestamp));
     $site[self::TIMESTAMP] = time();
     $site[self::TYPE] = "text/html";
     $site[self::HEADER] = "mediawiki_bundle_iterator extractor";
     $site[self::HTTP_CODE] = 200;
     $site[self::ENCODING] = "UTF-8";
     $site[self::SERVER] = "unknown";
     $site[self::SERVER_VERSION] = "unknown";
     $site[self::OPERATING_SYSTEM] = "unknown";
     $site[self::PAGE] = "<html lang='" . $this->header['lang'] . "' >\n" . "<head><title>{$pre_url}</title>\n" . WIKI_PAGE_STYLES . "\n</head>\n" . "<body><h1>{$pre_url}</h1>\n";
     $pre_page = $this->getTextContent($dom, "/page/revision/text");
     $current_hash = crawlHash($pre_page);
     if ($first_call) {
         $this->saveCheckPoint();
         //ensure we remember to advance one on fail
         $first_call = false;
     }
     $pre_page = $this->parser->parse($pre_page, false, true);
     // the $1 backreference needs a capture group; grab any template argument
     $pre_page = preg_replace("/{{Other uses\|?(.*?)}}/i", "<div class='indent'>\"\$1\". (<a href='" . $site[self::URL] . "_(disambiguation)'>{$pre_url}</a>)</div>", $pre_page);
     $site[self::PAGE] .= $pre_page;
     $site[self::PAGE] .= "\n</body>\n</html>";
     $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
     $site[self::WEIGHT] = ceil(max(log(strlen($site[self::PAGE]) + 1, 2) - 10, 1));
     return $site;
 }
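A usage sketch (assumes $archive_iterator is an already-constructed MediaWiki bundle iterator and that the CrawlConstants field names used above are in scope):
 // Pull raw XML for one page, then a fully processed summary for the next
 $raw_xml = $archive_iterator->nextPage(true);
 $site = $archive_iterator->nextPage();
 crawlLog("Extracted " . $site[CrawlConstants::URL]);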
Example #5
 /**
  * Adds a file with contents $data_string and with name containing
  * $address and $time to a subfolder $day of a folder $dir
  *
  * @param string $schedule_name the name of the kind of schedule being saved
  * @param string& $data_string encoded, compressed, serialized data the
  *     schedule is to contain
  */
 function addScheduleToScheduleDirectory($schedule_name, &$data_string)
 {
     $crawl_time = substr($this->clean($_REQUEST['crawl_time'], "int"), 0, TIMESTAMP_LEN);
     $dir = CRAWL_DIR . "/schedules/" . $schedule_name . $crawl_time;
     $address = str_replace(".", "-", $_SERVER['REMOTE_ADDR']);
     $address = str_replace(":", "_", $address);
     $time = time();
     $day = floor($time / ONE_DAY);
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     $dir .= "/{$day}";
     if (!file_exists($dir)) {
         mkdir($dir);
         chmod($dir, 0777);
     }
     $data_hash = crawlHash($data_string);
     file_put_contents($dir . "/At" . $time . "From" . $address . "WithHash{$data_hash}.txt", $data_string);
 }
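A sketch of the resulting on-disk layout (the schedule name, data, and encoding pipeline shown are illustrative assumptions):
 // CRAWL_DIR/schedules/<schedule_name><crawl_time>/<day>/
 //     At<time>From<ip-with-dots-and-colons-replaced>WithHash<hash>.txt
 $schedule_data = array("http://example.com/" => 1.0);
 $data = webencode(gzcompress(serialize($schedule_data))); // assumed encoding
 $fetch_controller->addScheduleToScheduleDirectory("ScheduleData", $data);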
Example #6
 /**
  * Produces a schedule.txt file of url data for a fetcher to crawl next.
  *
  * The hard part of scheduling is to make sure that the overall crawl
  * process obeys robots.txt files. This involves checking the url is in
  * an allowed path for that host and it also involves making sure the
  * Crawl-delay directive is respected. The first fetcher that contacts the
  * server requesting data to crawl will get the schedule.txt
  * produced by produceFetchBatch() at which point it will be unlinked
  * (these latter things are controlled in FetchController).
  *
  * @see FetchController
  */
 function produceFetchBatch()
 {
     $i = 1;
     // array implementation of priority queue starts at 1 not 0
     $fetch_size = 0;
     crawlLog("Scheduler: Start Produce Fetch Batch. Memory usage: " . memory_get_usage());
     $count = $this->web_queue->to_crawl_queue->count;
     $schedule_time = time();
     $first_line = $this->calculateScheduleMetaInfo($schedule_time);
     $sites = array();
     $delete_urls = array();
     $crawl_delay_hosts = array();
     $time_per_request_guess = MINIMUM_FETCH_LOOP_TIME;
     // it would be impressive if we can achieve this speed
     $current_crawl_index = -1;
     crawlLog("Scheduler: Trying to Produce Fetch Batch; Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
     /*
         $delete - array of items we will delete from the queue after
             we have selected all of the items for fetch batch
         $sites - array of urls for fetch batch indices in this array we'll
             call slots. Crawl-delayed host urls are spaced by a certain
             number of slots
     */
     while ($i <= $count && $fetch_size < MAX_FETCH_SIZE) {
         crawlTimeoutLog("..Scheduler: still producing fetch batch. " . "Examining location %s in queue of %s.", $i, $count);
         //look in queue for url and its weight
         $tmp = $this->web_queue->peekQueue($i, $fh);
         // on a queue error, remove the entry and continue the loop
         if ($tmp === false || strcmp($tmp[0], "LOOKUP ERROR") == 0) {
             $delete_urls[$i] = false;
             crawlLog("Scheduler: Removing lookup error at" . " {$i} during produce fetch");
             $i++;
             continue;
         }
         list($url, $weight, $flag, $probe) = $tmp;
         $no_flags = false;
         $hard_coded = false;
         $delay = 0; // avoid a stale crawl delay from a previous iteration
         $host_url = UrlParser::getHost($url);
         if ($flag == WebQueueBundle::NO_FLAGS) {
             $hard_coded_pos = strpos($url, "###!");
             if ($hard_coded_pos > 0) {
                 $has_robots = true;
                 $hard_coded = true;
                 $is_robot = false;
             } else {
                 $has_robots = $this->web_queue->containsGotRobotTxt($host_url);
                 $scheme = UrlParser::getScheme($host_url);
                 if ($scheme == "gopher") {
                     $is_robot = strcmp($host_url . "/0/robots.txt", $url) == 0;
                 } else {
                     $is_robot = strcmp($host_url . "/robots.txt", $url) == 0;
                 }
             }
             $no_flags = true;
         } else {
             $is_robot = $flag == WebQueueBundle::ROBOT;
             if ($flag >= WebQueueBundle::SCHEDULABLE) {
                 $has_robots = true;
                 if ($flag > WebQueueBundle::SCHEDULABLE) {
                     $delay = $flag - WebQueueBundle::SCHEDULABLE;
                 }
             }
         }
         //if $url is a robots.txt url see if we need to schedule or not
         if ($is_robot) {
             if ($has_robots) {
                 $delete_urls[$i] = $url;
                 $i++;
             } else {
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     /* note don't add to seen url filter
                          since check robots every 24 hours as needed
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                     $i++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::ROBOT);
                     }
                 }
             }
             continue;
         }
         //Now handle the non-robots.txt url case
         $robots_okay = true;
         if ($has_robots) {
             if ($no_flags) {
                 if (!$hard_coded) {
                     $robots_okay = $this->web_queue->checkRobotOkay($url);
                 }
                 if (!$robots_okay) {
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     $i++;
                     continue;
                 }
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
                 // we're not allowed to schedule $url till next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
                 continue;
             }
             //each host has two entries in $this->waiting_hosts
             $num_waiting = floor(count($this->waiting_hosts) / 2);
             if ($delay > 0) {
                 // handle adding a url if there is a crawl delay
                 $hash_host = crawlHash($host_url);
                 $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
                 if ((!$is_waiting_host && $num_waiting < MAX_WAITING_HOSTS) || ($is_waiting_host && $this->waiting_hosts[$hash_host] == $schedule_time)) {
                     $this->waiting_hosts[$hash_host] = $schedule_time;
                     $this->waiting_hosts[$schedule_time][] = $hash_host;
                     $request_batches_per_delay = ceil($delay / $time_per_request_guess);
                     if (!isset($crawl_delay_hosts[$hash_host])) {
                         $next_earliest_slot = $current_crawl_index;
                         $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
                         $next_earliest_slot = $crawl_delay_hosts[$hash_host] + $request_batches_per_delay * NUM_MULTI_CURL_PAGES;
                     }
                     if (($next_slot = $this->getEarliestSlot($next_earliest_slot, $sites)) < MAX_FETCH_SIZE) {
                         $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
                         $sites[$next_slot] = array($url, $weight, $delay);
                         $this->web_queue->addSeenUrlFilter($url);
                         /* we might miss some sites by marking them
                              seen after only scheduling them
                            */
                         $fetch_size++;
                     } else {
                         if ($no_flags) {
                             $this->web_queue->setQueueFlag($url, $delay + WebQueueBundle::SCHEDULABLE);
                         }
                     }
                 } else {
                     if (!$is_waiting_host) {
                         // has crawl delay but too many already waiting
                         $delete_urls[$i] = $url;
                         //delete from queue (so no clog) but don't mark seen
                         $i++;
                         continue;
                     }
                 }
             } else {
                 // add a url no crawl delay
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     /* we might miss some sites by marking them
                          seen after only scheduling them
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::SCHEDULABLE);
                     }
                 }
             }
             // end if ($delay > 0) else
         }
         // end if ($has_robots)
         $i++;
     }
     //end while
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting URLS for fetch batch; time " . "so far: " . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
             /*  if there was a hash table look up error still get rid of
                 index from priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} URLS for fetch batch from " . "queue in time: " . changeInMicrotime($new_time));
     $new_time = microtime();
     if (isset($sites) && count($sites) > 0) {
         $dummy_slot = array(self::DUMMY, 0.0, 0);
         /* dummies are used for crawl delays of sites with longer delays
              when we don't have much else to crawl.
            */
         $cnt = 0;
         for ($j = 0; $j < MAX_FETCH_SIZE; $j++) {
             if (isset($sites[$j])) {
                 $cnt++;
                 if ($cnt == $fetch_size) {
                     break;
                 }
             } else {
                 if ($j % NUM_MULTI_CURL_PAGES == 0) {
                     $sites[$j] = $dummy_slot;
                 }
             }
         }
         ksort($sites);
         //write schedule to disk
         $fh = fopen(CRAWL_DIR . "/schedules/" . self::schedule_name . $this->crawl_time . ".txt", "wb");
         fwrite($fh, $first_line);
         $num_sites = count($sites);
         $k = 0;
         foreach ($sites as $site) {
             crawlTimeoutLog("..Scheduler: Still Writing fetch schedule %s" . " of %s.", $k, $num_sites);
             $k++;
             $extracted_etag = NULL;
             list($url, $weight, $delay) = $site;
             $key = crawlHash($url, true);
             if (USE_ETAG_EXPIRES) {
                 /*check if we have cache validation data for a URL. If both
                    ETag and Expires timestamp are found or only an expires
                    timestamp is found, the timestamp is compared with the current
                    time. If the current time is less than the expires timestamp,
                    the URL is not added to the fetch batch. If only an ETag is
                    found, the ETag is appended to the URL so that it can be
                    processed by the fetcher.
                   */
                 $value = $this->web_queue->etag_btree->findValue($key);
                 if ($value !== NULL) {
                     $cache_validation_data = $value[1];
                     if ($cache_validation_data['etag'] !== -1 && $cache_validation_data['expires'] !== -1) {
                         $expires_timestamp = $cache_validation_data['expires'];
                         $current_time = time();
                         if ($current_time < $expires_timestamp) {
                             continue;
                         } else {
                             $etag = $cache_validation_data['etag'];
                             $extracted_etag = "ETag: " . $etag;
                         }
                     } else {
                         if ($cache_validation_data['etag'] !== -1) {
                             $etag = $cache_validation_data['etag'];
                             $extracted_etag = "ETag: " . $etag;
                         } else {
                             if ($cache_validation_data['expires'] !== -1) {
                                 $expires_timestamp = $cache_validation_data['expires'];
                                 $current_time = time();
                                 if ($current_time < $expires_timestamp) {
                                     continue;
                                 }
                             }
                         }
                     }
                 }
             }
             $host_url = UrlParser::getHost($url);
             $dns_lookup = $this->web_queue->dnsLookup($host_url);
             if ($dns_lookup) {
                 $url .= "###" . urlencode($dns_lookup);
             }
             if ($extracted_etag !== NULL) {
                 $url .= $extracted_etag;
             }
             $out_string = base64_encode(packFloat($weight) . packInt($delay) . $url) . "\n";
             fwrite($fh, $out_string);
         }
         fclose($fh);
         crawlLog("...Scheduler: Sort URLS and write schedule time: " . changeInMicrotime($new_time));
         crawlLog("Scheduler: End Produce Fetch Batch. Memory usage: " . memory_get_usage());
         crawlLog("Scheduler: Created fetch batch of size {$num_sites}." . " {$num_deletes} urls were deleted." . " Queue size is now " . $this->web_queue->to_crawl_queue->count . "...Total Time to create batch: " . changeInMicrotime($start_time));
     } else {
         crawlLog("Scheduler: No fetch batch created!! " . "Time failing to make a fetch batch:" . changeInMicrotime($start_time) . ". Loop properties:{$i} {$count}" . " {$num_deletes} urls were deleted in failed attempt.");
         $max_links = max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP);
         if ($num_deletes < 5 && $i >= $count && $count >= NUM_URLS_QUEUE_RAM - SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             crawlLog("Scheduler: Queue Full and Couldn't produce Fetch " . "Batch!! Or Delete any URLS!!!");
             crawlLog("Scheduler: Rescheduling Queue Contents " . "(not marking seen) to try to unjam!");
             $this->dumpQueueToSchedules(true);
             $this->clearWebQueue();
         }
     }
 }
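Each non-header line of schedule.txt packs a weight, a delay, and the url; a fetcher-side decoding sketch, assuming unpackFloat()/unpackInt() mirror the packFloat()/packInt() calls used above on 4-byte values:
 // Decode one base64 schedule line back into (weight, delay, url)
 $decoded = base64_decode(trim($line));
 $weight = unpackFloat(substr($decoded, 0, 4));
 $delay = unpackInt(substr($decoded, 4, 4));
 $url = substr($decoded, 8);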
Example #7
 /**
  * Get a summary of a document by the generation it is in
  * and its offset into the corresponding WebArchive.
  *
  * @param string $url url of the summary we are trying to look up
  * @param array $machine_urls an array of urls of yioop queue servers
  * @param string $index_name timestamp of the index to do the lookup in
  * @return array summary data of the matching document
  */
 function getCrawlItem($url, $machine_urls = NULL, $index_name = "")
 {
     $hash_url = crawlHash($url, true);
     if ($index_name == "") {
         $index_name = $this->index_name;
     }
     $results = $this->getCrawlItems(array($hash_url => array($url, $index_name)), $machine_urls);
     if (isset($results[$hash_url])) {
         return $results[$hash_url];
     }
     return $results;
 }
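A usage sketch ($crawl_model stands in for whatever model object defines this method):
 // Look up the stored summary for a url in the default index
 $summary = $crawl_model->getCrawlItem("http://example.com/");
 if (isset($summary[CrawlConstants::TITLE])) {
     echo $summary[CrawlConstants::TITLE];
 }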
Example #8
 /**
  * Saves/updates/deletes an override of a search engine result summary
  * page. The information stored will be used instead of what was actually
  * in the index when it comes to displaying search results for a page.
  * It will not be used for looking up results.
  *
  * @param string $url url of a result page
  * @param string $title the title to be used on SERP pages
  * @param string $description the description from which snippets will
  *     be generated.
  */
 function updateResultPage($url, $title, $description)
 {
     $result_pages = array();
     $file_name = $this->dir_name . "/result_pages.txt";
     if (file_exists($file_name)) {
         $result_pages = unserialize(file_get_contents($file_name));
     }
     $hash_url = crawlHash($url, true);
     if ($title == "" && $description == "") {
         unset($result_pages[$hash_url]);
     } else {
         $result_pages[$hash_url] = array(self::URL => $url, self::TITLE => $title, self::DESCRIPTION => $description);
     }
     file_put_contents($file_name, serialize($result_pages));
 }
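A usage sketch ($crawl_model is illustrative):
 // Override how a result renders on search result pages
 $crawl_model->updateResultPage("http://example.com/", "Example Title", "Hand-written description from which snippets are generated.");
 // Passing empty title and description deletes the override
 $crawl_model->updateResultPage("http://example.com/", "", "");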
Example #9
 /**
  * Computes for each word in an array of words a count of the total number
  * of times it occurs in this crawl model's default index.
  *
  * @param array $words words to find the counts for
  * @param array $machine_urls machines to invoke this command on
  * @return array associative array of word => counts
  */
 function countWords($words, $machine_urls = NULL)
 {
     if ($machine_urls != NULL && !$this->isSingleLocalhost($machine_urls)) {
         $count_strings = $this->execMachines("countWords", $machine_urls, serialize(array($words, $this->index_name)));
         $word_counts = array();
         foreach ($count_strings as $count_string) {
             $a_word_counts = unserialize(webdecode($count_string[self::PAGE]));
             if (is_array($a_word_counts)) {
                 foreach ($a_word_counts as $word => $count) {
                     $word_counts[$word] = isset($word_counts[$word]) ? $word_counts[$word] + $count : $count;
                 }
             }
         }
         return $word_counts;
     }
     $index_archive = IndexManager::getIndex($this->index_name);
     $hashes = array();
     $lookup = array();
     foreach ($words as $word) {
         $tmp = crawlHash($word);
         $hashes[] = $tmp;
         $lookup[$tmp] = $word;
     }
     $word_key_counts = $index_archive->countWordKeys($hashes);
     $phrases = array();
     $word_counts = array();
     if (is_array($word_key_counts) && count($word_key_counts) > 0) {
         foreach ($word_key_counts as $word_key => $count) {
             $word_counts[$lookup[$word_key]] = $count;
         }
     }
     return $word_counts;
 }
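A usage sketch for the local, single-machine case ($crawl_model is illustrative):
 // Count occurrences of two words in the default index
 $counts = $crawl_model->countWords(array("yioop", "crawl"));
 foreach ($counts as $word => $count) {
     echo "{$word}: {$count}\n";
 }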
Example #10
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * words from the summaries' web_archives into an inverted index.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to a IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = isset($tmp[self::VIDEO_SOURCES]) ? $tmp[self::VIDEO_SOURCES] : array();
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "\x00\x00\x00\x00"; // pad doc keys to an 8-byte boundary
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
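A usage sketch ($arc_tool and the bundle path are illustrative):
 // Rebuild the posting shards and dictionary of a crawl's archive bundle
 $arc_tool->rebuildIndexArchive(CRAWL_DIR . "/cache/IndexData1699990000");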
Example #11
 /**
  * Formats a cache of a web page (adds history UI and highlights keywords)
  *
  * @param array $cache_item meta information about the cached page
  * @param string $cache_file contains current web page before formatting
  * @param string $url that cached web page was originally from
  * @param string $summary_string summary data that was extracted from the
  *     web page to be put in the actual inverted index
  * @param int $crawl_time timestamp of crawl the cached page was from
  * @param array $all_crawl_times timestamps of all crawl times currently
  *     in the Yioop system
  * @param string $terms from original query responsible for cache request
  * @param array $ui_flags array of ui features which
  *     should be added to the cache page. For example, "highlight"
  *     says search terms should be highlighted, "history"
  *     says add history navigation for all copies of this cache page in
  *     the Yioop system.
  * @return string of formatted cached page
  */
 function formatCachePage($cache_item, $cache_file, $url, $summary_string, $crawl_time, $all_crawl_times, $terms, $ui_flags)
 {
     // check whether the history UI should start open
     $hist_ui_open = in_array("hist_ui_open", $ui_flags);
     $date = date("F d Y H:i:s", $cache_item[self::TIMESTAMP]);
     $meta_words = PhraseParser::$meta_words_list;
     foreach ($meta_words as $meta_word) {
         $pattern = "/(\\b)({$meta_word}(\\S)+)/";
         $terms = preg_replace($pattern, "", $terms);
     }
     $terms = str_replace("'", " ", $terms);
     $terms = str_replace('"', " ", $terms);
     $terms = str_replace('\\', " ", $terms);
     $terms = str_replace('|', " ", $terms);
     $terms = $this->clean($terms, "string");
     $phrase_string = mb_ereg_replace("[[:punct:]]", " ", $terms);
     $words = mb_split(" ", $phrase_string);
     if (!in_array("highlight", $ui_flags)) {
         $words = array();
     }
     $dom = new DOMDocument();
     restore_error_handler();
     $did_dom = @$dom->loadHTML('<?xml encoding="UTF-8">' . $cache_file);
     set_error_handler("yioop_error_handler");
     foreach ($dom->childNodes as $item) {
         if ($item->nodeType == XML_PI_NODE) {
             // remove the <?xml encoding declaration hack node added above
             $dom->removeChild($item);
         }
     }
     // now set the proper encoding
     $dom->encoding = "UTF-8";
     $head = $dom->getElementsByTagName('head')->item(0);
     if (is_object($head)) {
         // add a noindex nofollow robot directive to page
         $head_first_child = $head->firstChild;
         $robot_node = $dom->createElement('meta');
         $robot_node = $head->insertBefore($robot_node, $head_first_child);
         $robot_node->setAttribute("name", "ROBOTS");
         $robot_node->setAttribute("content", "NOINDEX,NOFOLLOW");
         $comment = $dom->createComment(tl('search_controller_cache_comment'));
         $comment = $head->insertBefore($comment, $robot_node);
         // make link and script links absolute
         $head = $this->canonicalizeLinks($head, $url);
     } else {
         $body_tags = "<frameset><frame><noscript><img><span><b><i><em>" . "<strong><h1><h2><h3><h4><h5><h6><p><div>" . "<a><table><tr><td><th><dt><dir><dl><dd><pre>";
         $cache_file = strip_tags($cache_file, $body_tags);
         $cache_file = wordwrap($cache_file, 80);
         $cache_file = "<html><head><title>" . tl('search_controller_yioop_cache') . "</title></head>" . "<body>" . $cache_file . "</body></html>";
         $dom = new DOMDocument();
         restore_error_handler();
         @$dom->loadHTML($cache_file);
         set_error_handler("yioop_error_handler");
     }
     $body = $dom->getElementsByTagName('body')->item(0);
     //make tags in body absolute
     $body = $this->canonicalizeLinks($body, $url);
     $first_child = $body->firstChild;
     $text_align = getLocaleDirection() == 'ltr' ? "left" : "right";
     // add information about what was extracted from page
     if (in_array("summaries", $ui_flags)) {
         $summary_toggle_node = $this->createSummaryAndToggleNodes($dom, $text_align, $body, $summary_string, $cache_item);
     } else {
         $summary_toggle_node = $first_child;
     }
     if (isset($cache_item[self::KEYWORD_LINKS]) && count($cache_item[self::KEYWORD_LINKS]) > 0) {
         $keyword_node = $this->createDomBoxNode($dom, $text_align, "zIndex: 1");
         $text_node = $dom->createTextNode("Z@key_links@Z");
         $keyword_node->appendChild($text_node);
         $keyword_node = $body->insertBefore($keyword_node, $summary_toggle_node);
         $set_key_links = true;
     } else {
         $keyword_node = $summary_toggle_node;
         $set_key_links = false;
     }
     if (in_array("version", $ui_flags)) {
         $version_node = $this->createDomBoxNode($dom, $text_align, "zIndex: 1");
         $textNode = $dom->createTextNode(tl('search_controller_cached_version', "Z@url@Z", $date));
         $version_node->appendChild($textNode);
         $brNode = $dom->createElement('br');
         $version_node->appendChild($brNode);
         $this->addCacheJavascriptTags($dom, $version_node);
         $version_node = $body->insertBefore($version_node, $keyword_node);
     } else {
         $version_node = $keyword_node;
     }
     //UI for showing history
     if (in_array("history", $ui_flags)) {
         $history_node = $this->historyUI($crawl_time, $all_crawl_times, $version_node, $dom, $terms, $hist_ui_open, $url);
     } else {
         $history_node = $dom->createElement('div');
     }
     if ($history_node) {
         $version_node->appendChild($history_node);
     }
     $body = $this->markChildren($body, $words, $dom);
     $new_doc = $dom->saveHTML();
     if (substr($url, 0, 7) != "record:") {
         $url = "<a href='{$url}'>{$url}</a>";
     }
     $new_doc = str_replace("Z@url@Z", $url, $new_doc);
     $colors = array("yellow", "orange", "gray", "cyan");
     $color_count = count($colors);
     $i = 0;
     foreach ($words as $word) {
         //only mark string of length at least 2
         if (mb_strlen($word) > 1) {
             $mark_prefix = crawlHash($word);
             if (stristr($mark_prefix, $word) !== false) {
                 $mark_prefix = preg_replace("/{$word}/i", '', $mark_prefix);
             }
             $match = $mark_prefix . $word;
             $new_doc = preg_replace("/{$match}/i", '<span style="background-color:' . $colors[$i] . '">$0</span>', $new_doc);
             $i = ($i + 1) % $color_count;
             $new_doc = preg_replace("/" . $mark_prefix . "/", "", $new_doc);
         }
     }
     if ($set_key_links) {
         $new_doc = $this->addKeywordLinks($new_doc, $cache_item);
     }
     return $new_doc;
 }
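A usage sketch (the cache item, file contents, and crawl times would come from the surrounding cache lookup code; the query and flags shown are hypothetical):
 // Render a cached page with highlighting, history UI, and summary toggle
 $new_page = $search_controller->formatCachePage($cache_item, $cache_file, $url, $summary_string, $crawl_time, $all_crawl_times, "chess opening", array("highlight", "history", "summaries"));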
Example #12
 /**
  * Adds $item to the FEED_ITEM table in the db if it isn't already there
  *
  * @param array $item data from a single news feed item
  * @param string $source_name name of the news feed $item was found on
  * @param string $lang locale-tag of the news feed
  * @param int $age how many seconds old records should be ignored
  * @return bool whether an item was added
  */
 function addFeedItemIfNew($item, $source_name, $lang, $age)
 {
     if (!isset($item["link"]) || !isset($item["title"]) || !isset($item["description"])) {
         return false;
     }
     if (!isset($item["guid"]) || $item["guid"] == "") {
         $item["guid"] = crawlHash($item["link"]);
     } else {
         $item["guid"] = crawlHash($item["guid"]);
     }
     $raw_guid = unbase64Hash($item["guid"]);
     if (!isset($item["pubDate"]) || $item["pubDate"] == "") {
         $item["pubDate"] = time();
     } else {
         $item["pubDate"] = strtotime($item["pubDate"]);
     }
     if (time() - $item["pubDate"] > $age) {
         return false;
     }
     $sql = "SELECT COUNT(*) AS NUMBER FROM FEED_ITEM WHERE GUID = ?";
     $db = $this->db;
     $result = $db->execute($sql, array($item["guid"]));
     if ($result) {
         $row = $db->fetchArray($result);
         if ($row["NUMBER"] > 0) {
             return false;
         }
     } else {
         return true;
     }
     $sql = "INSERT INTO FEED_ITEM VALUES (?, ?, ?, ?, ?, ?)";
     $result = $db->execute($sql, array($item['guid'], $item['title'], $item['link'], $item['description'], $item['pubDate'], $source_name));
     if (!$result) {
         return false;
     }
     return true;
 }
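A usage sketch ($source_model and the feed data are illustrative; ONE_WEEK is assumed to be a seconds constant from the Yioop configuration):
 // Add one RSS item, ignoring anything older than a week
 $item = array(
     "link" => "http://news.example.com/story1",
     "title" => "Example Story",
     "description" => "An example feed item.",
     "pubDate" => "Tue, 14 Nov 2023 10:00:00 GMT",
 );
 $added = $source_model->addFeedItemIfNew($item, "Example News", "en-US", ONE_WEEK);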
Example #13
 /**
  * Computes a hash of a string containing page data for use in
  * deduplication of pages with similar content
  *
  * @param string& $page reference to web page data
  * @return string 8 byte hash to identify page contents
  */
 static function computePageHash(&$page)
 {
     /* to do dedup we strip script, noscript, and style tags
          as well as their content, then we strip tags, get rid
          of whitespace and hash
        */
     $strip_array = array('@<script[^>]*?>.*?</script>@si', '@<noscript[^>]*?>.*?</noscript>@si', '@<style[^>]*?>.*?</style>@si');
     $dedup_string = preg_replace($strip_array, '', $page);
     $dedup_string_old = preg_replace('/\\W+/', '', $dedup_string);
     $dedup_string = strip_tags($dedup_string_old);
     if ($dedup_string == "") {
         $dedup_string = $dedup_string_old;
     }
     $dedup_string = preg_replace('/\\W+/', '', $dedup_string);
     return crawlHash($dedup_string, true);
 }
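A quick sanity-check sketch of the dedup behavior: pages differing only in scripts and whitespace hash identically.
 $page_a = "<html><body><p>Hello   World</p><script>track();</script></body></html>";
 $page_b = "<html><body><p>Hello World</p></body></html>";
 var_dump(FetchUrl::computePageHash($page_a) === FetchUrl::computePageHash($page_b)); // bool(true)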
Example #14
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "\x00\x00\x00\x00"; // pad doc keys to an 8-byte boundary
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || (isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS]))) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             // store inlinks so they can be searched by their link text
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
         /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already appear
            in what is being indexed, so don't index them again.
            $site[self::JUST_METAS] is set when we have a sitemap or
            robots.txt (this case set later). In this case link info
            is not particularly useful for indexing and can greatly
            slow building the inverted index.
         */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                 /* this mysterious check means won't index links from
                     robots.txt. Sitemap will still be in TO_CRAWL, but that's
                     done elsewhere
                    */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
             $elink_flag = ($link_host != $host);
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
        $interim_elapse = changeInMicrotime($interim_time);
        if ($interim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
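The partition a link gets routed to above comes from calculatePartition(), whose implementation is not part of this excerpt. As a rough sketch of the idea (an assumption, not Yioop's actual code): hash the host, unpack the first four bytes as an integer, and reduce modulo the number of queue servers, the same scheme getNewsSources() in Beispiel #27 uses to spread news feeds over machines; md5 stands in for crawlHash here.

     // minimal sketch of hash-based partitioning; md5 is a stand-in for
     // crawlHash and the modulus scheme is an assumption
     function sketchCalculatePartition($link_host, $num_partitions)
     {
         // interpret the first 4 bytes of the raw hash as a big-endian int
         $hash_int = unpack("N", md5($link_host, true));
         return $hash_int[1] % $num_partitions;
     }
     // links from the same host always map to the same queue server
     echo sketchCalculatePartition("www.example.com", 3); // a value in 0..2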
Beispiel #15
0
 /**
  * Given an array of page summaries, for each summary extracts snippets
  * related to a set of search words, boldfaces the search terms in each
  * snippet, and then creates a new summary array.
  *
  * @param array $results web pages summaries (these in turn are
  *     arrays!)
  * @param array $words keywords (typically what was searched on)
  * @param int $description_length length of the description
  * @return array summaries which have been snippified and bold faced
  */
 function formatPageResults($results, $words = NULL, $description_length = self::DEFAULT_DESCRIPTION_LENGTH)
 {
     if (isset($results['PAGES'])) {
         $pages = $results['PAGES'];
         $num_pages = count($pages);
      } else {
          $output['TOTAL_ROWS'] = 0;
          $output['PAGES'] = NULL;
          return $output;
      }
     for ($i = 0; $i < $num_pages; $i++) {
         $page = $pages[$i];
         if ($this->edited_page_summaries != NULL) {
             $url_parts = explode("|", $page[self::URL]);
             if (count($url_parts) > 1) {
                 $url = trim($url_parts[1]);
             } else {
                 $url = $page[self::URL];
             }
             $hash_url = crawlHash($url, true);
             if (isset($this->edited_page_summaries[$hash_url])) {
                 $summary = $this->edited_page_summaries[$hash_url];
                 $page[self::URL] = $url;
                 foreach (array(self::TITLE, self::DESCRIPTION) as $field) {
                     if (isset($summary[$field])) {
                         $page[$field] = $summary[$field];
                     }
                 }
             }
         }
         if (!isset($page[self::TITLE])) {
             $page[self::TITLE] = "";
         }
         $page[self::TITLE] = strip_tags($page[self::TITLE]);
         if (strlen($page[self::TITLE]) == 0) {
             $offset = min(mb_strlen($page[self::DESCRIPTION]), TITLE_LENGTH);
              $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset);
              if ($end_title === false) {
                  // no space found after the offset, so cut at the offset
                  $end_title = $offset;
              }
              $ellipsis = "";
              if ($end_title > TITLE_LENGTH) {
                 $ellipsis = "...";
                 if ($end_title > MAX_TITLE_LENGTH) {
                     $end_title = MAX_TITLE_LENGTH;
                 }
             }
             $page[self::TITLE] = mb_substr(strip_tags($page[self::DESCRIPTION]), 0, $end_title) . $ellipsis;
             //still no text revert to url
             if (strlen($page[self::TITLE]) == 0 && isset($page[self::URL])) {
                 $page[self::TITLE] = $page[self::URL];
             }
         }
         // do a little cleaning on text
         if ($words != NULL) {
             $page[self::TITLE] = $this->boldKeywords($page[self::TITLE], $words);
             if (!isset($page[self::IS_FEED])) {
                 $page[self::DESCRIPTION] = $this->getSnippets(strip_tags($page[self::DESCRIPTION]), $words, $description_length);
             }
             $page[self::DESCRIPTION] = $this->boldKeywords($page[self::DESCRIPTION], $words);
         } else {
             $page[self::DESCRIPTION] = mb_substr(strip_tags($page[self::DESCRIPTION]), 0, $description_length);
         }
         $page[self::SCORE] = mb_substr($page[self::SCORE], 0, SCORE_PRECISION);
         $pages[$i] = $page;
     }
     $output['TOTAL_ROWS'] = $results['TOTAL_ROWS'];
     $output['PAGES'] = $pages;
     return $output;
 }
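As a usage sketch: assuming the call is made from within the controller that defines this method, and that boldKeywords() wraps matched terms in bold tags, made-up input like the following illustrates the expected shape.

     // hypothetical page summaries, for illustration only
     $results = array("TOTAL_ROWS" => 1, "PAGES" => array(array(
         self::URL => "http://www.example.com/",
         self::TITLE => "Example Domain",
         self::DESCRIPTION => "This domain is for use in examples.",
         self::SCORE => "4.5000000")));
     $output = $this->formatPageResults($results, array("example"), 100);
     // $output["PAGES"][0] should now have "example" boldfaced in its
     // title and description, and its score cut to SCORE_PRECISION digits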
Beispiel #16
0
 /**
  * Used to compute all the meta ids for a given link with $url
  * and $link_text that appeared on a site with $site_url.
  *
  * @param string $url url of the link
  * @param string $link_host host name of the link's url
  * @param string $link_text text of the anchor tag the link came from
  * @param string $site_url url of the page the link was on
  * @return array meta ids to associate with the link
 static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
 {
     global $IMAGE_TYPES;
     $link_meta_ids = array();
      if (strlen($link_host) == 0) {
          return $link_meta_ids;
      }
     if (substr($link_text, 0, 9) == "location:") {
         $location_link = true;
         $link_meta_ids[] = $link_text;
         $link_meta_ids[] = "location:all";
         $link_meta_ids[] = "location:" . crawlHash($site_url);
     }
     $link_type = UrlParser::getDocumentType($url);
     $link_meta_ids[] = "media:all";
     $link_meta_ids[] = "safe:all";
     if (in_array($link_type, $IMAGE_TYPES)) {
         $link_meta_ids[] = "media:image";
         if (isset($safe) && !$safe) {
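            /* note: $safe is never set within this static method, so this
               branch is dead code unless a safe flag is provided by some
               other means */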
             $link_meta_ids[] = "safe:false";
         }
     } else {
         $link_meta_ids[] = "media:text";
     }
     $link_meta_ids[] = "link:all";
     return $link_meta_ids;
 }
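For a concrete feel for the output: assuming png is among the extensions in the global $IMAGE_TYPES, a call such as the following should yield roughly the meta ids in the final comment.

     $metas = PhraseParser::calculateLinkMetas(
         "http://other.example.org/pic.png",  // $url of the link
         "http://other.example.org",          // $link_host
         "a picture",                         // $link_text
         "http://www.example.com/page.html"); // $site_url link appeared on
     // roughly: array("media:all", "safe:all", "media:image", "link:all")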
Beispiel #17
0
 /**
  * Since offsets into the url archive are stored as integers, they can
  * eventually overflow; so even if the queue is kept relatively small, the
  * archive used to store urls periodically needs to be rebuilt.
  */
 function rebuildUrlTable()
 {
     crawlLog("Rebuilding URL table");
     $dir_name = $this->dir_name;
     $count = $this->to_crawl_queue->count;
     $tmp_archive_name = $dir_name . "/tmp_archive" . NonCompressor::fileExtension();
     $url_archive_name = $dir_name . "/url_archive" . NonCompressor::fileExtension();
     $tmp_archive = new WebArchive($tmp_archive_name, new NonCompressor(), false, true);
     for ($i = 1; $i <= $count; $i++) {
         list($url, $weight, $flag, $probe) = $this->peekQueue($i);
         $url_container = array(array($url));
         $objects = $tmp_archive->addObjects("offset", $url_container);
         if (isset($objects[0]['offset'])) {
             $offset = $objects[0]['offset'];
         } else {
             crawlLog("Error inserting {$url} into rebuild url archive file");
             continue;
         }
         $hash_url = crawlHash($url, true);
         $data = packInt($offset) . packInt($i) . packInt($flag);
         $this->insertHashTable($hash_url, $data, $probe);
     }
     $this->to_crawl_archive = NULL;
     gc_collect_cycles();
     $tmp_archive->filename = $url_archive_name;
     $this->to_crawl_archive = $tmp_archive;
 }
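Each hash table entry written above packs three 32-bit integers: the archive offset, the position in the queue, and the flag. Assuming packInt() is a thin wrapper around PHP's pack() with the big-endian "N" format (an assumption about its implementation), the record layout can be checked in isolation:

     // assumed stand-ins for Yioop's packInt and its inverse
     function sketchPackInt($i)   { return pack("N", $i); }
     function sketchUnpackInt($s) { $tmp = unpack("N", $s); return $tmp[1]; }
     $data = sketchPackInt(1024) . sketchPackInt(7) . sketchPackInt(1);
     // 12 bytes total: offset, queue position, flag
     echo sketchUnpackInt(substr($data, 0, 4)); // 1024 (archive offset)
     echo sketchUnpackInt(substr($data, 4, 4)); // 7 (queue position)
     echo sketchUnpackInt(substr($data, 8, 4)); // 1 (flag)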
Beispiel #18
0
 /**
  * Returns the folder and thumb folder associated with the resources of
  * a wiki page, creating them if they do not exist
  *
  * @param int $group_id group identifier of the group the wiki page
  *     belongs to
  * @param int $page_id identifier of the page to get folder paths for
  * @return mixed array (page_folder, thumb_folder) on success, false if
  *     the folders could not be created
  */
 function getGroupPageResourcesFolders($group_id, $page_id)
 {
     $group_page_folder = crawlHash("group" . $group_id . $page_id . AUTH_KEY);
     $thumb_page_folder = crawlHash("thumb" . $group_id . $page_id . AUTH_KEY);
     $group_prefix = substr($group_page_folder, 0, 3);
     $thumb_prefix = substr($thumb_page_folder, 0, 3);
     $resource_path = APP_DIR . "/resources";
     $group_prefix_path = $resource_path . "/{$group_prefix}";
     $thumb_prefix_path = $resource_path . "/{$thumb_prefix}";
     $group_path = "{$group_prefix_path}/{$group_page_folder}";
     $thumb_path = "{$thumb_prefix_path}/{$thumb_page_folder}";
     if (file_exists($group_path) && file_exists($thumb_path)) {
         return array($group_path, $thumb_path);
     }
     if (!file_exists(APP_DIR) && !mkdir(APP_DIR)) {
         return false;
     }
     if (!file_exists($resource_path) && !mkdir($resource_path)) {
         return false;
     }
     if (!file_exists($group_prefix_path) && !mkdir($group_prefix_path)) {
         return false;
     }
     if (!file_exists($thumb_prefix_path) && !mkdir($thumb_prefix_path)) {
         return false;
     }
      if ((file_exists($group_path) || mkdir($group_path)) && (file_exists($thumb_path) || mkdir($thumb_path))) {
          return array($group_path, $thumb_path);
      }
      return false;
  }
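The three-character prefix directory is a simple sharding trick that keeps any one resources/ directory from accumulating thousands of subfolders. A standalone sketch of the resulting layout, with sha1 standing in for crawlHash and made-up constants:

     // made-up values, for illustration only
     $app_dir = "/var/www/app";
     $auth_key = "SECRET";  // stand-in for AUTH_KEY
     $group_id = 5;
     $page_id = 11;
     $folder = sha1("group" . $group_id . $page_id . $auth_key);
     $prefix = substr($folder, 0, 3);
     echo "{$app_dir}/resources/{$prefix}/{$folder}";
     // something like /var/www/app/resources/9f2/9f2c61...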
Beispiel #19
0
 /**
  * Returns the statuses of the fetchers and queue_servers of the machines
  * in the machine table, as well as the names and urls of these machines
  *
  * @param array $machines an array of machines to check the status for
  * @return array  a list of machines, together with all their properties
  * and the statuses of their fetchers and queue_servers
  */
 function getMachineStatuses($machines = array())
 {
     $num_machines = count($machines);
     $time = time();
     $session = md5($time . AUTH_KEY);
     for ($i = 0; $i < $num_machines; $i++) {
         $hash_url = crawlHash($machines[$i]["URL"]);
         $machines[$i][CrawlConstants::URL] = $machines[$i]["URL"] . "?c=machine&a=statuses&time={$time}" . "&session={$session}&arg={$hash_url}";
     }
     $statuses = FetchUrl::getPages($machines);
     for ($i = 0; $i < $num_machines; $i++) {
         foreach ($statuses as $status) {
             if ($machines[$i][CrawlConstants::URL] == $status[CrawlConstants::URL]) {
                 $pre_status = json_decode($status[CrawlConstants::PAGE], true);
                 if (is_array($pre_status)) {
                     $machines[$i]["STATUSES"] = $pre_status;
                 } else {
                     $machines[$i]["STATUSES"] = "NOT_CONFIGURED_ERROR";
                 }
             }
         }
     }
     $sql = "SELECT * FROM ACTIVE_FETCHER";
     $result = $this->db->execute($sql);
     if (!$result) {
         return $machines;
     }
     $active_fetchers = array();
     while ($row = $this->db->fetchArray($result)) {
         for ($i = 0; $i < $num_machines; $i++) {
             if ($machines[$i]['NAME'] == $row['NAME']) {
                 if (!isset($machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']])) {
                     $machines[$i]["STATUSES"]["fetcher"][$row['FETCHER_ID']] = 0;
                 }
             }
         }
     }
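      // prime stringROrderCallback's comparison field ("NAME") before it
      // is used with usort below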
     stringROrderCallback("", "", "NAME");
     if ($machines != array()) {
         usort($machines, "stringROrderCallback");
     }
     $name_server_statuses = CrawlDaemon::statuses();
     $machines['NAME_SERVER']['news_updater'] = 0;
     if (isset($name_server_statuses['news_updater'])) {
         $machines['NAME_SERVER']['news_updater'] = 1;
     }
     return $machines;
 }
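The time-based session token above is what lets a contacted machine verify that a statuses request came from a holder of the shared AUTH_KEY. A sketch of the validation the receiving side would presumably perform (a real check should also reject stale timestamps):

     // sketch of validating the session token on the receiving machine
     function sketchValidSession($time, $session, $auth_key)
     {
         return md5($time . $auth_key) == $session;
     }
     $auth_key = "SECRET"; // stand-in for the shared AUTH_KEY constant
     $time = time();
     $session = md5($time . $auth_key);
     var_dump(sketchValidSession($time, $session, $auth_key)); // bool(true)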
Beispiel #20
0
 /**
  * For a collection of grouped pages, generates a grouped summary for
  * each group and returns an array of out pages consisting of a single
  * summarized document per group. These summarized documents have
  * aggregated scores.
  *
  * @param array& $pre_out_pages array of groups of pages for which out
  *     pages are to be generated
  * @return array $out_pages array of single summarized documents
  */
 function computeOutPages(&$pre_out_pages)
 {
     $out_pages = array();
     foreach ($pre_out_pages as $hash_url => $group_infos) {
         $out_pages[$hash_url] = $pre_out_pages[$hash_url][0];
         $add_lookup = false;
         if ($this->network_flag) {
             $hash = $out_pages[$hash_url][self::HASH];
             $is_location = crawlHash($hash_url . "LOCATION", true) == $hash;
             if (!$out_pages[$hash_url][self::IS_DOC] || $is_location) {
                 $add_lookup = true;
             }
         }
         $out_pages[$hash_url][self::SUMMARY_OFFSET] = array();
         unset($out_pages[$hash_url][self::GENERATION]);
         $hash_count = $out_pages[$hash_url][self::HASH_URL_COUNT];
         for ($i = 0; $i < $hash_count; $i++) {
             $doc_info = $group_infos[$i];
             if (isset($doc_info[self::GENERATION])) {
                 if (is_int($doc_info[self::SUMMARY_OFFSET])) {
                     $machine_id = isset($doc_info[self::MACHINE_ID]) ? $doc_info[self::MACHINE_ID] : $this->current_machine;
                     $out_pages[$hash_url][self::SUMMARY_OFFSET][] = array($machine_id, $doc_info[self::KEY], $doc_info[self::CRAWL_TIME], $doc_info[self::GENERATION], $doc_info[self::SUMMARY_OFFSET]);
                 } else {
                     if (is_array($doc_info[self::SUMMARY_OFFSET])) {
                         $out_pages[$hash_url][self::SUMMARY_OFFSET] = array_merge($out_pages[$hash_url][self::SUMMARY_OFFSET], $doc_info[self::SUMMARY_OFFSET]);
                     }
                 }
             }
         }
         $out_pages[$hash_url][self::SCORE] = $out_pages[$hash_url][self::HASH_SUM_SCORE];
         if ($add_lookup) {
             $prefix = $is_location ? "location:" : "info:";
             $word_key = $prefix . base64Hash($hash_url);
             array_unshift($out_pages[$hash_url][self::SUMMARY_OFFSET], array($word_key, $group_infos[0][self::CRAWL_TIME]));
         }
     }
     return $out_pages;
 }
Beispiel #21
0
 /**
  * Used to clean strings that might be tainted, as they originate from
  * the user
  *
  * @param mixed $value tainted data
  * @param string $type type of data in value: one of bool, color, double,
  *     float, hash, int, or string
  * @param mixed $default if $value is not set, the default value to
  *     return; this isn't used much, since if error_reporting is E_ALL
  *     or -1 you would still get a Notice
  * @return string the clean input matching the type provided
  */
 function clean($value, $type, $default = NULL)
 {
     $clean_value = NULL;
     switch ($type) {
         case "boolean":
         case "bool":
              if (isset($value)) {
                  if (is_bool($value)) {
                      $clean_value = $value;
                  } else {
                      $clean_value = false;
                      if ($value == "true" || $value != 0) {
                          $clean_value = true;
                      }
                  }
              }
             } else {
                 if ($default != NULL) {
                     $clean_value = $default;
                 } else {
                     $clean_value = false;
                 }
             }
             break;
         case "color":
             if (isset($value)) {
                 $colors = array("black", "silver", "gray", "white", "maroon", "red", "purple", "fuchsia", "green", "lime", "olive", "yellow", "navy", "blue", "teal", "aqua", "orange", "aliceblue", "antiquewhite", "aquamarine", "azure", "beige", "bisque", "blanchedalmond", "blueviolet", "brown", "burlywood", "cadetblue", "chartreuse", "chocolate", "coral", "cornflowerblue", "cornsilk", "crimson", "darkblue", "darkcyan", "darkgoldenrod", "darkgray", "darkgreen", "darkgrey", "darkkhaki", "darkmagenta", "darkolivegreen", "darkorange", "darkorchid", "darkred", "darksalmon", "darkseagreen", "darkslateblue", "darkslategray", "darkslategrey", "darkturquoise", "darkviolet", "deeppink", "deepskyblue", "dimgray", "dodgerblue", "firebrick", "floralwhite", "forestgreen", "gainsboro", "ghostwhite", "gold", "goldenrod", "greenyellow", "grey", "honeydew", "hotpink", "indianred", "indigo", "ivory", "khaki", "lavender", "lavenderblush", "lawngreen", "lemonchiffon", "lightblue", "lightcoral", "lightcyan", "lightgoldenrodyellow", "lightgray", "lightgreen", "lightgrey", "lightpink", "lightsalmon", "lightseagreen", "lightskyblue", "lightslategray", "lightslategrey", "lightsteelblue", "lightyellow", "limegreen", "linen", "mediumaquamarine", "mediumblue", "mediumorchid", "mediumpurple", "mediumseagreen", "mediumslateblue", "mediumspringgreen", "mediumturquoise", "mediumvioletred", "midnightblue", "mintcream", "mistyrose", "moccasin", "navajowhite", "oldlace", "olivedrab", "orangered", "orchid", "palegoldenrod", "palegreen", "paleturquoise", "palevioletred", "papayawhip", "peachpuff", "peru", "pink", "plum", "powderblue", "rosybrown", "royalblue", "saddlebrown", "salmon", "sandybrown", "seagreen", "seashell", "sienna", "skyblue", "slateblue", "slategray", "slategrey", "snow", "springgreen", "steelblue", "tan", "thistle", "tomato", "turquoise", "violet", "wheat", "whitesmoke", "yellowgreen", "rebeccapurple");
                 if (in_array($value, $colors) || preg_match('/^#[a-fA-F0-9][a-fA-F0-9][a-fA-F0-9]' . '([a-fA-F0-9][a-fA-F0-9][a-fA-F0-9])?$/', trim($value))) {
                     $clean_value = trim($value);
                 } else {
                     $clean_value = "#FFF";
                 }
             } else {
                 if ($default != NULL) {
                     $clean_value = $default;
                 } else {
                     $clean_value = "#FFF";
                 }
             }
             break;
         case "double":
             if (isset($value)) {
                 $clean_value = doubleval($value);
             } else {
                 if ($default != NULL) {
                     $clean_value = $default;
                 } else {
                     $clean_value = 0;
                 }
             }
             break;
         case "float":
             if (isset($value)) {
                 $clean_value = floatval($value);
             } else {
                 if ($default != NULL) {
                     $clean_value = $default;
                 } else {
                     $clean_value = 0;
                 }
             }
             break;
         case "hash":
             if (isset($value)) {
                 if (strlen($value) == strlen(crawlHash("A")) && base64_decode($value)) {
                     $clean_value = $value;
                 }
             } else {
                 $clean_value = $default;
             }
             break;
         case "int":
             if (isset($value)) {
                 $clean_value = intval($value);
             } else {
                 if ($default != NULL) {
                     $clean_value = $default;
                 } else {
                     $clean_value = 0;
                 }
             }
             break;
         case "string":
             if (isset($value)) {
                 $value2 = str_replace("&amp;", "&", $value);
                 $clean_value = @htmlentities($value2, ENT_QUOTES, "UTF-8");
             } else {
                 $clean_value = $default;
             }
             break;
     }
     return $clean_value;
 }
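As a usage sketch (the calls assume the controller context in which clean() is defined; all inputs are made up):

     $n = $this->clean("17 apples", "int");    // 17
     $b = $this->clean("true", "bool");        // true
     $c = $this->clean("#ABC", "color");       // "#ABC"
     $s = $this->clean("<b>hi</b>", "string"); // "&lt;b&gt;hi&lt;/b&gt;"
     $d = $this->clean(NULL, "int", 42);       // 42, the supplied default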
Beispiel #22
0
 /**
  * Handles admin requests related to the search filter activity
  *
  * This activity allows a user to specify hosts whose web pages are to be
  * filtered out of the search results
  *
  * @return array $data fields used to display the results editor
  *     activity, together with messages from processing any sub activity
  */
 function resultsEditor()
 {
     $parent = $this->parent;
     $filters_model = $parent->model("searchfilters");
     $data["ELEMENT"] = "resultseditor";
     $data['SCRIPT'] = "";
     if (isset($_REQUEST['disallowed_sites'])) {
         $sites = $parent->convertStringCleanArray($_REQUEST['disallowed_sites']);
         $disallowed_sites = array();
         foreach ($sites as $site) {
             $site = UrlParser::getHost($site);
             if (strlen($site) > 0) {
                 $disallowed_sites[] = $site . "/";
             }
         }
         $data['disallowed_sites'] = implode("\n", $disallowed_sites);
         $filters_model->set($disallowed_sites);
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_update') . "</h1>')";
     }
     if (!isset($data['disallowed_sites'])) {
         $data['disallowed_sites'] = implode("\n", $filters_model->getUrls());
     }
     foreach (array("URL", "TITLE", "DESCRIPTION") as $field) {
         $data[$field] = isset($_REQUEST[$field]) ? $parent->clean($_REQUEST[$field], "string") : (isset($data[$field]) ? $data[$field] : "");
     }
     if ($data["URL"] != "") {
         $data["URL"] = UrlParser::canonicalLink($data["URL"], "");
     }
     $tmp = tl('crawl_component_edited_pages');
     $data["URL_LIST"] = array($tmp => $tmp);
     $summaries = $filters_model->getEditedPageSummaries();
     foreach ($summaries as $hash => $summary) {
         $data["URL_LIST"][$summary[self::URL]] = $summary[self::URL];
     }
     if (isset($_REQUEST['arg'])) {
         switch ($_REQUEST['arg']) {
             case "save_page":
                  $missing_page_field = ($data["URL"] == "");
                 if ($missing_page_field) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_need_url') . "</h1>')";
                 } else {
                     $filters_model->updateResultPage($data["URL"], $data["TITLE"], $data["DESCRIPTION"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_page_updated') . "</h1>')";
                 }
                 break;
             case "load_url":
                 $hash_url = crawlHash($_REQUEST['LOAD_URL'], true);
                 if (isset($summaries[$hash_url])) {
                     $data["URL"] = $parent->clean($_REQUEST['LOAD_URL'], "string");
                     $data["TITLE"] = $summaries[$hash_url][self::TITLE];
                     $data["DESCRIPTION"] = $summaries[$hash_url][self::DESCRIPTION];
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_page_loaded') . "</h1>')";
                 }
                 break;
         }
     }
     return $data;
 }
Beispiel #23
0
 /**
  * Using the supplied $word_structs, constructs an iterator for getting
  * results to a query
  *
  * @param array $word_structs an array of word_structs. Here a word_struct
  *     is an associative array with at least the following fields
  *     KEYS -- an array of word keys
  *     QUOTE_POSITIONS -- an array of positions of words that appeared in
  *         quotes (so need to be matched exactly)
  *     DISALLOW_KEYS -- an array of keys of words the document must not
  *         contain
  *     WEIGHT -- a weight to multiply scores returned from this iterator
  *         by
  *     INDEX_NAME -- an index timestamp to get results from
  * @param array& $filter an array of hashes of domains to filter from
  *     results and then potentially restore in cache
  * @param int $raw ($raw == 0) normal grouping, ($raw == 1)
  *     no grouping done on data and no summaries returned (only lookup
  *     info), $raw > 1 return summaries but no grouping
  * @param int $to_retrieve number of items to retrieve from the current
  *     location in the iterator
  * @param array $queue_servers a list of urls of yioop machines which
  *     might be used during lookup
  * @param string $original_query if set, the original query that
  *     corresponds to $word_structs
  * @param string $save_timestamp_name if this timestamp is non empty, then
  *     when making the iterator, get sub-iterators to advance to the
  *     gen/doc_offset stored with respect to this save timestamp, if it
  *     exists
  * @param bool $limit_news if true the number of media:news items to
  *     allow in search results is limited to WordIterator::LIMIT_NEWS_COUNT
  *
  * @return object an iterator for iterating through results to the query
  */
 function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve, $queue_servers = array(), $original_query = "", $save_timestamp_name = "", $limit_news = true)
 {
     $iterators = array();
     $total_iterators = 0;
     $network_flag = false;
     $min_group_flag = false;
     $min_group_override = false;
     if ($queue_servers != array() && !$this->isSingleLocalhost($queue_servers)) {
         $network_flag = true;
         $total_iterators = 1;
         if (!in_array(NAME_SERVER, $queue_servers)) {
             $queue_servers[] = NAME_SERVER;
             //name server might still have news
         }
         $num_servers = count($queue_servers);
         if ((!isset($this->index_name) || !$this->index_name) && isset($word_structs[0]["INDEX_NAME"])) {
             $index_name = $word_structs[0]["INDEX_NAME"];
         } else {
             $index_name = $this->index_name;
         }
         $iterators[0] = new NetworkIterator($original_query, $queue_servers, $index_name, $filter, $save_timestamp_name, $limit_news);
     }
     if (!$network_flag) {
         $doc_iterate_hashes = array(substr(crawlHashWord("site:any"), 0, 9), substr(crawlHash("site:any"), 0, 9), substr(crawlHashWord("site:doc"), 0, 9), substr(crawlHash("site:doc"), 0, 9));
         if ($save_timestamp_name != "") {
             // used for archive crawls of crawl mixes
             $save_file = CRAWL_DIR . '/schedules/' . self::save_point . $save_timestamp_name . ".txt";
             if (file_exists($save_file)) {
                 $save_point = unserialize(file_get_contents($save_file));
             }
             $save_count = 0;
         }
         foreach ($word_structs as $word_struct) {
             if (!is_array($word_struct)) {
                 continue;
             }
             $word_keys = $word_struct["KEYS"];
             $distinct_word_keys = array();
             $seen_keys = array();
             foreach ($word_keys as $wkey) {
                 if (is_string($wkey) || is_string($wkey[0])) {
                     $tmp_key = is_string($wkey) ? $wkey : $wkey[0];
                     if (!isset($seen_keys[$tmp_key])) {
                         $seen_keys[$tmp_key] = true;
                         $distinct_word_keys[] = $wkey;
                     }
                 } else {
                     $distinct_word_keys[] = $wkey;
                 }
             }
             $quote_positions = $word_struct["QUOTE_POSITIONS"];
             $disallow_keys = $word_struct["DISALLOW_KEYS"];
             $index_name = $word_struct["INDEX_NAME"];
             $weight = $word_struct["WEIGHT"];
             $num_word_keys = count($word_keys);
             $total_iterators = count($distinct_word_keys);
             $word_iterators = array();
             $word_iterator_map = array();
             if ($num_word_keys < 1) {
                 continue;
             }
             $sum = 0;
             for ($i = 0; $i < $total_iterators; $i++) {
                 $current_key = is_string($distinct_word_keys[$i]) ? $distinct_word_keys[$i] : (is_string($distinct_word_keys[$i][0]) ? $distinct_word_keys[$i][0] : $distinct_word_keys[$i][0][0]);
                 if (!is_string($current_key)) {
                     $current_key = $current_key[0];
                 }
                 if (in_array(substr($current_key, 0, 9), $doc_iterate_hashes)) {
                     $word_iterators[$i] = new DocIterator($index_name, $filter, $to_retrieve);
                     $min_group_override = true;
                 } else {
                     //can happen if exact phrase search suffix approach used
                     if (isset($distinct_word_keys[$i][0][0]) && is_array($distinct_word_keys[$i][0][0])) {
                         $distinct_keys = array($distinct_word_keys[$i][0][1]);
                     } else {
                         if (isset($distinct_word_keys[$i][0]) && is_array($distinct_word_keys[$i][0])) {
                             $distinct_keys = $distinct_word_keys[$i];
                         } else {
                             $distinct_keys = array($distinct_word_keys[$i]);
                         }
                     }
                     $out_keys = array();
                     $old_distinct_key_id = "";
                     foreach ($distinct_keys as $distinct_key) {
                         if (is_array($distinct_key)) {
                             if (!isset($distinct_key[2]) && isset($distinct_key[1])) {
                                 $distinct_keys[] = $distinct_key[1];
                             }
                             $shift = isset($distinct_key[1]) ? $distinct_key[1] : 0;
                            $mask = isset($distinct_key[2]) ? $distinct_key[2] : "";
                             if (isset($distinct_key[3])) {
                                 $old_distinct_key_id = unbase64Hash($distinct_key[3]);
                             }
                             $distinct_key_id = unbase64Hash($distinct_key[0]);
                         } else {
                             $shift = 0;
                             $mask = "" . "";
                             $distinct_key_id = unbase64Hash($distinct_key);
                         }
                         $lookup_cutoff = max(MIN_RESULTS_TO_GROUP, $to_retrieve);
                         $info = IndexManager::getWordInfo($index_name, $distinct_key_id, $shift, $mask);
                         if ($old_distinct_key_id != "") {
                             $old_info = IndexManager::getWordInfo($index_name, $old_distinct_key_id, $shift, $mask);
                             if ($info !== false && $old_info !== false) {
                                 $info = array_merge($info, $old_info);
                             } else {
                                 if ($old_info !== false) {
                                     $info = $old_info;
                                 }
                             }
                         }
                         if ($info != array()) {
                             $tmp_keys = arrayColumnCount($info, 4, 3);
                             $sum += array_sum($tmp_keys);
                             $out_keys = array_merge($out_keys, $tmp_keys);
                         }
                         if ($sum > $lookup_cutoff) {
                             break;
                         }
                     }
                     arsort($out_keys);
                     $out_keys = array_keys(array_slice($out_keys, 0, 50));
                     $tmp_word_iterators = array();
                     $m = 0;
                     foreach ($out_keys as $distinct_key) {
                         $tmp_word_iterators[$m] = new WordIterator($distinct_key, $index_name, true, $filter, $to_retrieve, $limit_news);
                         if ($tmp_word_iterators[$m]->dictionary_info != array() || $tmp_word_iterators[$m]->feed_count > 0) {
                             $min_group_override = true;
                             $m++;
                         } else {
                             unset($tmp_word_iterators[$m]);
                         }
                     }
                     if ($m == 1) {
                         $word_iterators[$i] = $tmp_word_iterators[0];
                     } else {
                         $word_iterators[$i] = new DisjointIterator($tmp_word_iterators);
                     }
                 }
                 foreach ($word_keys as $index => $key) {
                     if (isset($distinct_word_keys[$i]) && $key == $distinct_word_keys[$i]) {
                         $word_iterator_map[$index] = $i;
                     }
                 }
             }
             $num_disallow_keys = count($disallow_keys);
             if ($num_disallow_keys > 0) {
                 for ($i = 0; $i < $num_disallow_keys; $i++) {
                     $disallow_iterator = new WordIterator($disallow_keys[$i], $index_name, false, $filter);
                     $word_iterators[$num_word_keys + $i] = new NegationIterator($disallow_iterator);
                 }
             }
             $num_word_keys += $num_disallow_keys;
             if ($num_word_keys == 1 && $weight == 1) {
                 $base_iterator = $word_iterators[0];
             } else {
                 $base_iterator = new IntersectIterator($word_iterators, $word_iterator_map, $quote_positions, $weight);
                 $min_group_flag = true;
                 if ($save_timestamp_name == "") {
                     $base_iterator->sync_timer_on = true;
                 } else {
                     $base_iterator->sync_timer_on = false;
                 }
             }
             if ($save_timestamp_name != "") {
                 if (isset($save_point[$save_count]) && $save_point[$save_count] != -1) {
                     $base_iterator->advance($save_point[$save_count]);
                 }
                 $save_count++;
             }
             $iterators[] = $base_iterator;
         }
     }
     $num_iterators = count($iterators);
      //if network_flag is set, $num_iterators should be 1
     if ($num_iterators < 1) {
         return NULL;
     } else {
         if ($num_iterators == 1) {
             $union_iterator = $iterators[0];
         } else {
             $union_iterator = new UnionIterator($iterators);
         }
     }
     $raw = intval($raw);
     if ($raw > 0) {
         $group_iterator = $union_iterator;
     } else {
         $group_iterator = new GroupIterator($union_iterator, $total_iterators, $this->current_machine, $network_flag);
     }
     if ($network_flag) {
         $union_iterator->results_per_block = ceil(SERVER_ALPHA * $group_iterator->results_per_block / $num_servers);
     } else {
         if ($save_timestamp_name != "") {
             $group_iterator->save_iterators = $iterators;
         } else {
             if ($min_group_flag && !$min_group_override) {
                 $group_iterator->results_per_block = max(MIN_RESULTS_TO_GROUP / 20, 1);
                 $to_retrieve = -1;
             }
         }
     }
     return $group_iterator;
 }
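To make the expected $word_structs shape concrete, a minimal single-term query might be passed in roughly as follows; the key string and timestamp are placeholders, not real values.

     $word_structs = array(array(
         "KEYS" => array("dGVzdF9rZXk"),  // placeholder base64 word key
         "QUOTE_POSITIONS" => array(),    // no quoted terms
         "DISALLOW_KEYS" => array(),      // no excluded terms
         "WEIGHT" => 1,
         "INDEX_NAME" => "1429542103",    // placeholder crawl timestamp
     ));
     $filter = array();
     $to_retrieve = 100;  // passed by reference
     $iterator = $this->getQueryIterator($word_structs, $filter, 0,
         $to_retrieve);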
Beispiel #24
0
    /**
     * Responsible for handling admin requests related to the configure activity
     *
     * The configure activity allows a user to set the work directory for
     * storing data local to this SeekQuarry/Yioop instance. It also allows one
     * to set the default language of the installation, dbms info, robot info,
     * test info, as well as which machine acts as the queue server.
     *
     * @return array $data fields for available language, dbms, etc as well as
     *     results of processing sub activity if any
     */
    function configure()
    {
        $parent = $this->parent;
        $profile_model = $parent->model("profile");
        $group_model = $parent->model("group");
        $data = array();
        $profile = array();
        $data['SYSTEM_CHECK'] = $this->systemCheck();
        $languages = $parent->model("locale")->getLocaleList();
        foreach ($languages as $language) {
            $data['LANGUAGES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME'];
        }
        if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
            $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
            $profile['DEFAULT_LOCALE'] = $data['lang'];
            setLocaleObject($data['lang']);
        }
        $data["ELEMENT"] = "configure";
        $data['SCRIPT'] = "";
        $data['PROFILE'] = false;
        if (isset($_REQUEST['WORK_DIRECTORY']) || defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER) {
            if (defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER && !isset($_REQUEST['WORK_DIRECTORY'])) {
                $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $_REQUEST['arg'] = "directory";
                @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
            }
            $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
            $data['PROFILE'] = true;
            if (strstr(PHP_OS, "WIN")) {
                //convert to forward slashes so consistent with rest of code
                $dir = str_replace("\\", "/", $dir);
                if ($dir[0] != "/" && $dir[1] != ":") {
                    $data['PROFILE'] = false;
                }
            } else {
                if ($dir[0] != "/") {
                    $data['PROFILE'] = false;
                }
            }
            if ($data['PROFILE'] == false) {
                $data["MESSAGE"] = tl('system_component_configure_use_absolute_path');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            if (strstr($dir . "/", BASE_DIR . "/")) {
                $data['PROFILE'] = false;
                $data["MESSAGE"] = tl('system_component_configure_configure_diff_base_dir');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            $data['WORK_DIRECTORY'] = $dir;
        } else {
            if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 && strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 && (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
                $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $data['PROFILE'] = true;
            }
        }
        $arg = "";
        if (isset($_REQUEST['arg'])) {
            $arg = $_REQUEST['arg'];
        }
        switch ($arg) {
            case "directory":
                if (!isset($data['WORK_DIRECTORY'])) {
                    break;
                }
                if ($data['PROFILE'] && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                    $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                    $data["MESSAGE"] = tl('system_component_configure_work_dir_set');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');setTimeout(" . "'window.location.href=window.location.href', 3000);";
                } else {
                    if ($data['PROFILE'] && strlen($data['WORK_DIRECTORY']) > 0) {
                        if ($profile_model->makeWorkDirectory($data['WORK_DIRECTORY'])) {
                            $profile['DBMS'] = 'sqlite3';
                            $data['DBMS'] = 'sqlite3';
                            $profile['DB_NAME'] = 'default';
                            $data['DB_NAME'] = 'default';
                            $profile['USER_AGENT_SHORT'] = tl('system_component_name_your_bot');
                            $data['USER_AGENT_SHORT'] = $profile['USER_AGENT_SHORT'];
                            $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                            $http = isset($_SERVER['HTTPS']) ? "https://" : "http://";
                            $profile['NAME_SERVER'] = $http . $_SERVER['SERVER_NAME'] . $uri;
                            $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                            $profile['AUTH_KEY'] = crawlHash($data['WORK_DIRECTORY'] . time());
                            $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                            $profile['FIAT_SHAMIR_MODULUS'] = generateFiatShamirModulus();
                            $robot_instance = str_replace(".", "_", $_SERVER['SERVER_NAME']) . "-" . time();
                            $profile['ROBOT_INSTANCE'] = $robot_instance;
                            $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                            if ($profile_model->updateProfile($data['WORK_DIRECTORY'], array(), $profile)) {
                                if (defined('WORK_DIRECTORY') && $data['WORK_DIRECTORY'] == WORK_DIRECTORY || $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY'])) {
                                    $data["MESSAGE"] = tl('system_component_configure_work_profile_made');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                                    $data['PROFILE'] = true;
                                } else {
                                    $data['PROFILE'] = false;
                                    $data["MESSAGE"] = tl('system_component_configure_no_set_config');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                }
                            } else {
                                $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                                $data['PROFILE'] = false;
                                $data["MESSAGE"] = tl('system_component_configure_no_create_profile');
                                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>'); setTimeout('window.location.href=" . "window.location.href', 3000);";
                            }
                        } else {
                            $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                            $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                            $data['PROFILE'] = false;
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                        $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                        $data['PROFILE'] = false;
                    }
                }
                break;
            case "profile":
                $parent->updateProfileFields($data, $profile, array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS', 'LANDING_PAGE'));
                $data['DEBUG_LEVEL'] = 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
                $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                $folder = APP_DIR . "/resources";
                if (!file_exists(APP_DIR) && !mkdir(APP_DIR) || !file_exists($folder) && !mkdir($folder)) {
                    $data["MESSAGE"] = tl('system_component_no_resource_folder');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH') as $field) {
                    if (isset($_FILES[$field]['name']) && $_FILES[$field]['name'] != "") {
                        if ((!in_array($_FILES[$field]['type'], array('image/png', 'image/gif', 'image/jpeg', 'image/x-icon')) && $field != 'SEARCHBAR_PATH') || ($_FILES[$field]['type'] != 'text/xml' && $field == 'SEARCHBAR_PATH')) {
                            $data["MESSAGE"] = tl('system_component_invalid_filetype');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        if ($_FILES[$field]['size'] > THUMB_SIZE) {
                            $data["MESSAGE"] = tl('system_component_file_too_big');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        $profile[$field] = array();
                        $profile[$field]['name'] = $_FILES[$field]['name'];
                        $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                        $data[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $profile[$field]['name'];
                    }
                }
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile)) {
                    $data['MESSAGE'] = tl('system_component_configure_profile_change');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                    if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                        $data['SCRIPT'] .= "setTimeout('window.location.href=\"" . "?c=admin&amp;a=configure&amp;" . CSRF_TOKEN . "=" . $_REQUEST[CSRF_TOKEN] . "\"', 3*sec);";
                    }
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                    break;
                }
                break;
            case "reset":
                $base_url = NAME_SERVER;
                if (defined("BASE_URL")) {
                    $base_url = BASE_URL;
                }
                $profile = array('LANDING_PAGE' => false, 'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                foreach ($old_profile as $key => $value) {
                    $data[$key] = $value;
                }
                $tmp_image = $old_profile['BACKGROUND_IMAGE'];
                $old_profile['BACKGROUND_IMAGE'] = "";
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile, true)) {
                    $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                    foreach ($profile as $key => $value) {
                        $data[$key] = $value;
                        if (in_array($key, array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH')) && $old_profile[$key] != "") {
                            $resource_name = APP_DIR . "/resources/" . $old_profile[$key];
                            if (file_exists($resource_name)) {
                                unlink($resource_name);
                            }
                        }
                    }
                    $data['MESSAGE'] = tl('system_component_configure_reset_completed');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                    break;
                }
                break;
            default:
                if (isset($data['WORK_DIRECTORY']) && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                } else {
                    $data['WORK_DIRECTORY'] = "";
                    $data['PROFILE'] = false;
                }
        }
        $data['advanced'] = "false";
        if ($data['PROFILE']) {
            $locale_tag = getLocaleTag();
            $not_null_fields = array('LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => BASE_URL . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN");
            foreach ($not_null_fields as $field => $default) {
                if (!$data[$field]) {
                    $data[$field] = $default;
                }
            }
            if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
                $robot_description = substr($parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"), 0, MAX_GROUP_PAGE_LEN);
                $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot", $robot_description, $locale_tag, "", "", "", "");
            }
            $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID, "bot", $locale_tag, "edit");
            $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ? $robot_info["PAGE"] : tl('system_component_describe_robot');
            if (isset($_REQUEST['advanced']) && $_REQUEST['advanced'] == 'true') {
                $data['advanced'] = "true";
            }
            $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true')
            ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
        }
        $data['SCRIPT'] .= "\nelt('locale').onchange = " . "function () { elt('configureProfileForm').submit();};\n";
        return $data;
    }
Beispiel #25
0
 /**
  * Returns the path to a user's icon resource folder (where their
  * uploaded files will be stored). It creates the folder if it does
  * not exist
  *
  * @param int $user_id user id of the user to get the path for
  * @return mixed the folder path as a string, or false if the folder
  *     could not be created
  */
 function getUserIconFolder($user_id)
 {
     $user_folder = crawlHash("user" . $user_id . AUTH_KEY);
     $user_prefix = substr($user_folder, 0, 3);
     $resource_path = APP_DIR . "/resources";
     $prefix_path = $resource_path . "/{$user_prefix}";
     $user_path = "{$prefix_path}/{$user_folder}";
     if (file_exists($user_path)) {
         return $user_path;
     }
     if (!file_exists(APP_DIR) && !mkdir(APP_DIR)) {
         return false;
     }
     if (!file_exists($resource_path) && !mkdir($resource_path)) {
         return false;
     }
     if (!file_exists($prefix_path) && !mkdir($prefix_path)) {
         return false;
     }
     if (mkdir($user_path)) {
         return $user_path;
     }
     return false;
 }
Beispiel #26
0
 /**
  * Function to check that keys are successfully deleted from the B-Tree
  * Random key-value pairs are first inserted in the B-Tree. From the inserted
  * key-value pairs, key-value pairs are randomly selected and deleted from
  * the B-Tree. The deleted key-value pairs are then looked up using their
  * keys to check if they were successfully deleted.
  */
 function deleteLookupTestCase()
 {
     //Insert values
     $key_value_pairs = array();
     for ($i = 1; $i <= NUM_VALS; $i++) {
         $value = crawlHash(rand(1, 1000), true);
         $key = crawlHash($value, true);
         $this->test_objects['FILE1']->insert(array($key, $value));
          $key_value_pairs[] = array($key, $value);
     }
     //Delete Values
     $deleted = array();
     for ($i = 1; $i <= NUM_VALS; $i++) {
         $index = mt_rand(0, NUM_VALS - 1);
         $key = $key_value_pairs[$index][0];
         $this->test_objects['FILE1']->remove($key);
         $deleted[] = $key;
     }
     //Lookup values
     foreach ($deleted as $deleted_key) {
         $this->assertEqual(NULL, $this->test_objects['FILE1']->findValue($deleted_key), 'Deleted Value not found');
     }
 }
Beispiel #27
0
 /**
  * Handles the request to get the array of news feed sources which hash
  * to a particular value, i.e., those whose hashed name matches the
  * index of the requesting machine's hashed url/name in the array of
  * available machine hashes
  */
 function getNewsSources()
 {
     if (!isset($_REQUEST["arg"])) {
         return;
     }
     $source_model = $this->model("source");
     $current_machine = $this->clean(webdecode($_REQUEST["arg"]), "string");
     $machine_hashes = $source_model->getMachineHashUrls();
     $machine_index_match = array_search($current_machine, $machine_hashes);
     if ($machine_index_match === false) {
         echo webencode(serialize(array()));
         return;
     }
     $num_machines = count($machine_hashes);
     $pre_feeds = $source_model->getMediaSources("rss");
     $pre_feeds = array_merge($pre_feeds, $source_model->getMediaSources("html"));
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $hash_int = unpack("N", crawlHash($pre_feed['NAME']));
         if (!isset($hash_int[1])) {
             continue;
         }
         $hash_index = $hash_int[1] % $num_machines;
         if ($machine_index_match != $hash_index) {
             continue;
         }
         if ($pre_feed['TYPE'] == 'html') {
             list($pre_feed['CHANNEL_PATH'], $pre_feed['ITEM_PATH'], $pre_feed['TITLE_PATH'], $pre_feed['DESCRIPTION_PATH'], $pre_feed['LINK_PATH']) = explode("###", html_entity_decode($pre_feed['AUX_INFO']));
         }
         $feeds[] = $pre_feed;
     }
     echo webencode(serialize($feeds));
 }