Code Example #1
File: recipe_plugin.php  Project: yakar/yioop
 /**
  * Implements post processing of recipes: recipes are extracted,
  * ingredients are scrubbed, and the recipes are clustered. The
  * clustered recipes are then added back to the index.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         if (isset($results["PAGES"]) && ($num_results = count($results["PAGES"])) > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if (isset($results["SAVE_POINT"])) {
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         } else {
             $more_docs = false;
         }
     }
     crawlLog("...Clustering.");
      // only cluster if doing so would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         $recipes_ingredients = array();
         $count = count($recipes);
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     $vector_diff[$vector] = pow($diff, 2);
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += $vector_diff[$vector];
                 }
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             $count = count($cluster);
             for ($i = 0; $i < $count - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         if (isset($recipe_sites)) {
             crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
             $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         }
         $k = 0;
         foreach ($recipe_sites as $site) {
             $recipe = $site[self::TITLE];
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
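The nested loops above boil down to a pairwise edge weight between two recipes: the Euclidean distance between their binary ingredient vectors, scaled by one plus the number of differing ingredients and by a large penalty (1000) when the recipes' main ingredients differ. A minimal standalone sketch of that computation (the function name is hypothetical and not part of recipe_plugin.php):

 <?php
 function recipeEdgeWeight($ingredients1, $ingredients2,
     $main_ingredient1, $main_ingredient2)
 {
     $all = array_unique(array_merge($ingredients1, $ingredients2));
     $edge_weight = 0;
     $matches = 1;
     foreach ($all as $ingredient) {
         // binary presence vectors for the two recipes
         $in1 = in_array($ingredient, $ingredients1) ? 1 : 0;
         $in2 = in_array($ingredient, $ingredients2) ? 1 : 0;
         $diff = $in1 - $in2;
         $edge_weight += pow($diff, 2);
         if (abs($diff) == 1) {
             $matches += 1;
         }
     }
     $main_ingredient_match =
         ($main_ingredient1 != $main_ingredient2) ? 1000 : 1;
     return sqrt($edge_weight) * $matches * $main_ingredient_match;
 }
 // two salads sharing a main ingredient get a small weight
 echo recipeEdgeWeight(array('lettuce', 'tomato', 'feta'),
     array('lettuce', 'tomato', 'olive'), 'lettuce', 'lettuce'), "\n";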
Code Example #2
File: url_parser.php  Project: yakar/yioop
 /**
  * Checks if a $url is on localhost
  *
  * @param string $url the url to check
  * @return bool whether or not it is on localhost
  */
 static function isLocalhostUrl($url)
 {
     $host = UrlParser::getHost($url, false);
     $localhosts = array("localhost", "127.0.0.1", "::1");
     if (isset($_SERVER["SERVER_NAME"])) {
         $localhosts[] = $_SERVER["SERVER_NAME"];
         $localhosts[] = gethostbyname($_SERVER["SERVER_NAME"]);
     }
     if (isset($_SERVER["SERVER_ADDR"])) {
         $localhosts[] = $_SERVER["SERVER_ADDR"];
     }
     foreach ($localhosts as $localhost) {
         if (stristr($host, $localhost)) {
             return true;
         }
     }
     return false;
 }
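A brief usage sketch for the static method above, assuming Yioop's lib/url_parser.php has been loaded (the require path is an assumption):

 <?php
 require_once "lib/url_parser.php";
 var_dump(UrlParser::isLocalhostUrl("http://127.0.0.1/test.html")); // bool(true)
 // false unless the current SERVER_NAME or SERVER_ADDR happens to match
 var_dump(UrlParser::isLocalhostUrl("http://www.example.com/"));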
Code Example #3
File: queue_server.php  Project: yakar/yioop
 /**
  * Produces a schedule.txt file of url data for a fetcher to crawl next.
  *
  * The hard part of scheduling is making sure that the overall crawl
  * process obeys robots.txt files. This involves checking that each url is
  * in an allowed path for its host, and it also involves making sure the
  * Crawl-delay directive is respected. The first fetcher that contacts the
  * server requesting data to crawl will get the schedule.txt
  * produced by produceFetchBatch(), at which point it will be unlinked
  * (this latter step is handled in FetchController).
  *
  * @see FetchController
  */
 function produceFetchBatch()
 {
     $i = 1;
     // array implementation of priority queue starts at 1 not 0
     $fetch_size = 0;
     crawlLog("Scheduler: Start Produce Fetch Batch Memory usage" . memory_get_usage());
     $count = $this->web_queue->to_crawl_queue->count;
     $schedule_time = time();
     $first_line = $this->calculateScheduleMetaInfo($schedule_time);
     $sites = array();
     $delete_urls = array();
     $crawl_delay_hosts = array();
     $time_per_request_guess = MINIMUM_FETCH_LOOP_TIME;
      // it would be impressive if we could achieve this speed
     $current_crawl_index = -1;
     crawlLog("Scheduler: Trying to Produce Fetch Batch; Queue Size {$count}");
     $start_time = microtime();
     $fh = $this->web_queue->openUrlArchive();
      /*
          $delete_urls - array of items we will delete from the queue after
              we have selected all of the items for the fetch batch
          $sites - array of urls for the fetch batch; indices in this array
              we'll call slots. Crawl-delayed host urls are spaced by a
              certain number of slots
      */
     while ($i <= $count && $fetch_size < MAX_FETCH_SIZE) {
         crawlTimeoutLog("..Scheduler: still producing fetch batch. " . "Examining location %s in queue of %s.", $i, $count);
         //look in queue for url and its weight
         $tmp = $this->web_queue->peekQueue($i, $fh);
         list($url, $weight, $flag, $probe) = $tmp;
          // if there was a queue error, remove the entry and continue the loop
         if ($tmp === false || strcmp($url, "LOOKUP ERROR") == 0) {
             $delete_urls[$i] = false;
             crawlLog("Scheduler: Removing lookup error at" . " {$i} during produce fetch");
             $i++;
             continue;
         }
         $no_flags = false;
         $hard_coded = false;
         $host_url = UrlParser::getHost($url);
         if ($flag == WebQueueBundle::NO_FLAGS) {
             $hard_coded_pos = strpos($url, "###!");
             if ($hard_coded_pos > 0) {
                 $has_robots = true;
                 $hard_coded = true;
                 $is_robot = false;
             } else {
                 $has_robots = $this->web_queue->containsGotRobotTxt($host_url);
                 $scheme = UrlParser::getScheme($host_url);
                 if ($scheme == "gopher") {
                     $is_robot = strcmp($host_url . "/0/robots.txt", $url) == 0;
                 } else {
                     $is_robot = strcmp($host_url . "/robots.txt", $url) == 0;
                 }
             }
             $no_flags = true;
         } else {
             $is_robot = $flag == WebQueueBundle::ROBOT;
             if ($flag >= WebQueueBundle::SCHEDULABLE) {
                 $has_robots = true;
                 if ($flag > WebQueueBundle::SCHEDULABLE) {
                     $delay = $flag - WebQueueBundle::SCHEDULABLE;
                 }
             }
         }
         //if $url is a robots.txt url see if we need to schedule or not
         if ($is_robot) {
             if ($has_robots) {
                 $delete_urls[$i] = $url;
                 $i++;
             } else {
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                      /* note: don't add to the seen url filter,
                           since robots.txt files are rechecked every
                           24 hours as needed
                         */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                     $i++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::ROBOT);
                     }
                 }
             }
             continue;
         }
         //Now handle the non-robots.txt url case
         $robots_okay = true;
         if ($has_robots) {
             if ($no_flags) {
                 if (!isset($hard_coded) || !$hard_coded) {
                     $robots_okay = $this->web_queue->checkRobotOkay($url);
                 } else {
                     $robots_okay = true;
                 }
                 if (!$robots_okay) {
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     $i++;
                     continue;
                 }
                 $delay = $this->web_queue->getCrawlDelay($host_url);
             }
             if (!$this->withinQuota($url)) {
                  //we're not allowed to schedule $url till the next hour
                 $delete_urls[$i] = $url;
                 //delete from queue (so no clog) but don't mark seen
                 $i++;
                 continue;
             }
             //each host has two entries in $this->waiting_hosts
             $num_waiting = floor(count($this->waiting_hosts) / 2);
             if ($delay > 0) {
                 // handle adding a url if there is a crawl delay
                 $hash_host = crawlHash($host_url);
                 $is_waiting_host = isset($this->waiting_hosts[$hash_host]);
                 if (!$is_waiting_host && $num_waiting < MAX_WAITING_HOSTS || $is_waiting_host && $this->waiting_hosts[$hash_host] == $schedule_time) {
                     $this->waiting_hosts[$hash_host] = $schedule_time;
                     $this->waiting_hosts[$schedule_time][] = $hash_host;
                     $request_batches_per_delay = ceil($delay / $time_per_request_guess);
                     if (!isset($crawl_delay_hosts[$hash_host])) {
                         $next_earliest_slot = $current_crawl_index;
                         $crawl_delay_hosts[$hash_host] = $next_earliest_slot;
                     } else {
                         $next_earliest_slot = $crawl_delay_hosts[$hash_host] + $request_batches_per_delay * NUM_MULTI_CURL_PAGES;
                     }
                     if (($next_slot = $this->getEarliestSlot($next_earliest_slot, $sites)) < MAX_FETCH_SIZE) {
                         $crawl_delay_hosts[$hash_host] = $next_slot;
                         $delete_urls[$i] = $url;
                         $sites[$next_slot] = array($url, $weight, $delay);
                         $this->web_queue->addSeenUrlFilter($url);
                         /* we might miss some sites by marking them
                              seen after only scheduling them
                            */
                         $fetch_size++;
                     } else {
                         if ($no_flags) {
                             $this->web_queue->setQueueFlag($url, $delay + WebQueueBundle::SCHEDULABLE);
                         }
                     }
                 } else {
                     if (!$is_waiting_host) {
                         // has crawl delay but too many already waiting
                         $delete_urls[$i] = $url;
                         //delete from queue (so no clog) but don't mark seen
                         $i++;
                         continue;
                     }
                 }
             } else {
                 // add a url no crawl delay
                 $next_slot = $this->getEarliestSlot($current_crawl_index, $sites);
                 if ($next_slot < MAX_FETCH_SIZE) {
                     $sites[$next_slot] = array($url, $weight, 0);
                     $delete_urls[$i] = $url;
                     $this->web_queue->addSeenUrlFilter($url);
                     /* we might miss some sites by marking them
                          seen after only scheduling them
                        */
                     $current_crawl_index = $next_slot;
                     $fetch_size++;
                 } else {
                     //no more available slots so prepare to bail
                     $i = $count;
                     if ($no_flags) {
                         $this->web_queue->setQueueFlag($url, WebQueueBundle::SCHEDULABLE);
                     }
                 }
             }
             //if delay else
         }
         // if containsGotRobotTxt
         // handle robots.txt urls
         $i++;
     }
     //end while
     $this->web_queue->closeUrlArchive($fh);
     $new_time = microtime();
     crawlLog("...Scheduler: Done selecting URLS for fetch batch time " . "so far:" . changeInMicrotime($start_time));
     $num_deletes = count($delete_urls);
     $k = 0;
     foreach ($delete_urls as $delete_url) {
         $k++;
         crawlTimeoutLog("..Removing selected url %s of %s " . "from queue.", $k, $num_deletes);
         if ($delete_url) {
             $this->web_queue->removeQueue($delete_url);
         } else {
             /*  if there was a hash table look up error still get rid of
                 index from priority queue */
             $this->web_queue->to_crawl_queue->poll($k);
         }
     }
     crawlLog("...Scheduler: Removed {$k} URLS for fetch batch from " . "queue in time: " . changeInMicrotime($new_time));
     $new_time = microtime();
     if (isset($sites) && count($sites) > 0) {
         $dummy_slot = array(self::DUMMY, 0.0, 0);
          /* dummies are used for crawl delays of sites with longer delays
              when we don't have much else to crawl.
            */
         $cnt = 0;
         for ($j = 0; $j < MAX_FETCH_SIZE; $j++) {
             if (isset($sites[$j])) {
                 $cnt++;
                 if ($cnt == $fetch_size) {
                     break;
                 }
             } else {
                 if ($j % NUM_MULTI_CURL_PAGES == 0) {
                     $sites[$j] = $dummy_slot;
                 }
             }
         }
         ksort($sites);
         //write schedule to disk
         $fh = fopen(CRAWL_DIR . "/schedules/" . self::schedule_name . $this->crawl_time . ".txt", "wb");
         fwrite($fh, $first_line);
         $num_sites = count($sites);
         $k = 0;
         foreach ($sites as $site) {
             crawlTimeoutLog("..Scheduler: Still Writing fetch schedule %s" . " of %s.", $k, $num_sites);
             $k++;
             $extracted_etag = NULL;
             list($url, $weight, $delay) = $site;
             $key = crawlHash($url, true);
             if (USE_ETAG_EXPIRES) {
                 /*check if we have cache validation data for a URL. If both
                    ETag and Expires timestamp are found or only an expires
                    timestamp is found, the timestamp is compared with the current
                    time. If the current time is less than the expires timestamp,
                    the URL is not added to the fetch batch. If only an ETag is
                    found, the ETag is appended to the URL so that it can be
                    processed by the fetcher.
                   */
                 $value = $this->web_queue->etag_btree->findValue($key);
                 if ($value !== NULL) {
                     $cache_validation_data = $value[1];
                     if ($cache_validation_data['etag'] !== -1 && $cache_validation_data['expires'] !== -1) {
                         $expires_timestamp = $cache_validation_data['expires'];
                         $current_time = time();
                         if ($current_time < $expires_timestamp) {
                             continue;
                         } else {
                             $etag = $cache_validation_data['etag'];
                             $extracted_etag = "ETag: " . $etag;
                         }
                     } else {
                         if ($cache_validation_data['etag'] !== -1) {
                             $etag = $cache_validation_data['etag'];
                             $extracted_etag = "ETag: " . $etag;
                         } else {
                             if ($cache_validation_data['expires'] !== -1) {
                                 $expires_timestamp = $cache_validation_data['expires'];
                                 $current_time = time();
                                 if ($current_time < $expires_timestamp) {
                                     continue;
                                 }
                             }
                         }
                     }
                 }
             }
             $host_url = UrlParser::getHost($url);
             $dns_lookup = $this->web_queue->dnsLookup($host_url);
             if ($dns_lookup) {
                 $url .= "###" . urlencode($dns_lookup);
             }
             if ($extracted_etag !== NULL) {
                 $url .= $extracted_etag;
             }
             $out_string = base64_encode(packFloat($weight) . packInt($delay) . $url) . "\n";
             fwrite($fh, $out_string);
         }
         fclose($fh);
         crawlLog("...Scheduler: Sort URLS and write schedule time: " . changeInMicrotime($new_time));
         crawlLog("Scheduler: End Produce Fetch Batch Memory usage" . memory_get_usage());
         crawlLog("Scheduler: Created fetch batch of size {$num_sites}." . " {$num_deletes} urls were deleted." . " Queue size is now " . $this->web_queue->to_crawl_queue->count . "...Total Time to create batch: " . changeInMicrotime($start_time));
     } else {
         crawlLog("Scheduler: No fetch batch created!! " . "Time failing to make a fetch batch:" . changeInMicrotime($start_time) . ". Loop properties:{$i} {$count}" . " {$num_deletes} urls were deleted in failed attempt.");
         $max_links = max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP);
         if ($num_deletes < 5 && $i >= $count && $count >= NUM_URLS_QUEUE_RAM - SEEN_URLS_BEFORE_UPDATE_SCHEDULER * $max_links) {
             crawlLog("Scheduler: Queue Full and Couldn't produce Fetch " . "Batch!! Or Delete any URLS!!!");
             crawlLog("Scheduler: Rescheduling Queue Contents " . "(not marking seen) to try to unjam!");
             $this->dumpQueueToSchedules(true);
             $this->clearWebQueue();
         }
     }
 }
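Each line of the schedule written above is base64_encode(packFloat($weight) . packInt($delay) . $url). A sketch of decoding such a line, under the assumption that packFloat() and packInt() are thin wrappers around PHP's pack("f", ...) and pack("N", ...) as in Yioop's utility library (verify against lib/utility.php):

 <?php
 function decodeScheduleLine($line)
 {
     $raw = base64_decode(trim($line));
     $weight_parts = unpack("f", substr($raw, 0, 4)); // 4-byte float weight
     $delay_parts = unpack("N", substr($raw, 4, 4));  // 4-byte unsigned delay
     $url = substr($raw, 8);                          // the rest is the url
     return array($url, $weight_parts[1], $delay_parts[1]);
 }
 // round-trip example using plain pack() in place of packFloat()/packInt()
 $line = base64_encode(pack("f", 1.5) . pack("N", 10) .
     "http://www.example.com/") . "\n";
 list($url, $weight, $delay) = decodeScheduleLine($line);
 echo "$url weight=$weight delay=$delay\n";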
Code Example #4
File: crawl_component.php  Project: yakar/yioop
 /**
  * Handles admin request related to the search filter activity
  *
  * This activity allows a user to specify hosts whose web pages are to be
  * filtered out of the search results.
  *
  * @return array $data info about the groups and their contents for a
  *     particular crawl mix
  */
 function resultsEditor()
 {
     $parent = $this->parent;
     $filters_model = $parent->model("searchfilters");
     $data["ELEMENT"] = "resultseditor";
     $data['SCRIPT'] = "";
     if (isset($_REQUEST['disallowed_sites'])) {
         $sites = $parent->convertStringCleanArray($_REQUEST['disallowed_sites']);
         $disallowed_sites = array();
         foreach ($sites as $site) {
             $site = UrlParser::getHost($site);
             if (strlen($site) > 0) {
                 $disallowed_sites[] = $site . "/";
             }
         }
         $data['disallowed_sites'] = implode("\n", $disallowed_sites);
         $filters_model->set($disallowed_sites);
         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_update') . "</h1>')";
     }
     if (!isset($data['disallowed_sites'])) {
         $data['disallowed_sites'] = implode("\n", $filters_model->getUrls());
     }
     foreach (array("URL", "TITLE", "DESCRIPTION") as $field) {
         $data[$field] = isset($_REQUEST[$field]) ? $parent->clean($_REQUEST[$field], "string") : (isset($data[$field]) ? $data[$field] : "");
     }
     if ($data["URL"] != "") {
         $data["URL"] = UrlParser::canonicalLink($data["URL"], "");
     }
     $tmp = tl('crawl_component_edited_pages');
     $data["URL_LIST"] = array($tmp => $tmp);
     $summaries = $filters_model->getEditedPageSummaries();
     foreach ($summaries as $hash => $summary) {
         $data["URL_LIST"][$summary[self::URL]] = $summary[self::URL];
     }
     if (isset($_REQUEST['arg'])) {
         switch ($_REQUEST['arg']) {
             case "save_page":
                 $missing_page_field = $data["URL"] == "" ? true : false;
                 if ($missing_page_field) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_need_url') . "</h1>')";
                 } else {
                     $filters_model->updateResultPage($data["URL"], $data["TITLE"], $data["DESCRIPTION"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_page_updated') . "</h1>')";
                 }
                 break;
             case "load_url":
                 $hash_url = crawlHash($_REQUEST['LOAD_URL'], true);
                 if (isset($summaries[$hash_url])) {
                     $data["URL"] = $parent->clean($_REQUEST['LOAD_URL'], "string");
                     $data["TITLE"] = $summaries[$hash_url][self::TITLE];
                     $data["DESCRIPTION"] = $summaries[$hash_url][self::DESCRIPTION];
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_results_editor_page_loaded') . "</h1>')";
                 }
                 break;
         }
     }
     return $data;
 }
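The disallowed_sites handling above reduces each submitted line to its host plus a trailing slash. A rough standalone sketch of that normalization; plain parse_url() is only a stand-in for UrlParser::getHost() (which in Yioop also keeps the scheme), and convertStringCleanArray() is assumed to split the textarea into trimmed, non-empty lines:

 <?php
 $raw = "http://spam.example.com/page.html\n\nhttps://ads.example.net/";
 $disallowed_sites = array();
 foreach (preg_split('/\R/', $raw, -1, PREG_SPLIT_NO_EMPTY) as $site) {
     $host = parse_url(trim($site), PHP_URL_HOST);
     if ($host !== null && $host !== "") {
         $disallowed_sites[] = $host . "/";
     }
 }
 echo implode("\n", $disallowed_sites), "\n";
 // spam.example.com/
 // ads.example.net/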
Code Example #5
File: fetcher.php  Project: yakar/yioop
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
          for the fetcher we are not saving the index shards, so
          the name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             //store inlinks so they can be searched by
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
          /*
             $this->no_process_links is set when doing things like
             mix recrawls. In this case, the links likely already appear
             in what is being indexed, so don't index them again.
             $site[self::JUST_METAS] is set when we have a sitemap or
             robots.txt (this case is set later). In that case link info
             is not particularly useful for indexing and can greatly slow
             building the inverted index.
          */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                  /* this mysterious check means we won't index links from
                      robots.txt files. The sitemap will still be in
                      TO_CRAWL, but that's handled elsewhere
                     */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
         $iterim_elapse = changeInMicrotime($interim_time);
         if ($iterim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
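For a non-link page the code above assembles a 24-byte document key: an 8-byte hash of the url, the 8-byte content hash already stored in $site[self::HASH], then "d" followed by the last 7 bytes of the host hash. A sketch of that assembly, assuming Yioop's crawlHash() is loaded and returns an 8-byte raw hash when its second argument is true; the content hash below is only a stand-in value:

 <?php
 $site_url = "http://www.example.com/page.html";
 $host = UrlParser::getHost($site_url);
 $content_hash = crawlHash("page body text", true); // stand-in for $site[self::HASH]
 $doc_keys = crawlHash($site_url, true) . $content_hash . "d" .
     substr(crawlHash($host . "/", true), 1);
 echo strlen($doc_keys), "\n"; // 24, if crawlHash() indeed returns 8 raw bytes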
Code Example #6
File: phrase_parser.php  Project: yakar/yioop
 /**
   * Calculates the meta words (for example, server:apache) to be
   * associated with a given downloaded document. These words will be
   * associated with the document in the index even if the document
   * itself did not contain them.
  *
   * @param array& $site associative array containing info about a downloaded
  *     (or read from archive) document.
  * @param array $video_sources used to check if a page should be marked as
  *      having meta media:video
  * @return array of meta words to be associate with this document
  */
 static function calculateMetas(&$site, $video_sources = array())
 {
     $meta_ids = array();
     // handles user added meta words
     if (isset($site[CrawlConstants::META_WORDS])) {
         $meta_ids = $site[CrawlConstants::META_WORDS];
     }
     /*
         Handle the built-in meta words. For example
         store the sites the doc_key belongs to,
         so you can search by site
     */
     $url_sites = UrlParser::getHostPaths($site[CrawlConstants::URL]);
     $url_sites = array_merge($url_sites, UrlParser::getHostSubdomains($site[CrawlConstants::URL]));
     $meta_ids[] = 'site:all';
     foreach ($url_sites as $url_site) {
         if (strlen($url_site) > 0) {
             $meta_ids[] = 'site:' . $url_site;
         }
     }
     $path = UrlParser::getPath($site[CrawlConstants::URL]);
     if (strlen($path) > 0) {
         $path_parts = explode("/", $path);
         $pre_path = "";
         $meta_ids[] = 'path:all';
         $meta_ids[] = 'path:/';
         foreach ($path_parts as $part) {
             if (strlen($part) > 0) {
                 $pre_path .= "/{$part}";
                 $meta_ids[] = 'path:' . $pre_path;
             }
         }
     }
     $meta_ids[] = 'info:' . $site[CrawlConstants::URL];
     $meta_ids[] = 'info:' . crawlHash($site[CrawlConstants::URL]);
     $meta_ids[] = 'code:all';
     $meta_ids[] = 'code:' . $site[CrawlConstants::HTTP_CODE];
     if (UrlParser::getHost($site[CrawlConstants::URL]) . "/" == $site[CrawlConstants::URL]) {
         $meta_ids[] = 'host:all';
         //used to count number of distinct hosts
     }
     if (isset($site[CrawlConstants::SIZE])) {
         $meta_ids[] = "size:all";
         $interval = DOWNLOAD_SIZE_INTERVAL;
         $size = floor($site[CrawlConstants::SIZE] / $interval) * $interval;
         $meta_ids[] = "size:{$size}";
     }
     if (isset($site[CrawlConstants::TOTAL_TIME])) {
         $meta_ids[] = "time:all";
         $interval = DOWNLOAD_TIME_INTERVAL;
         $time = floor($site[CrawlConstants::TOTAL_TIME] / $interval) * $interval;
         $meta_ids[] = "time:{$time}";
     }
     if (isset($site[CrawlConstants::DNS_TIME])) {
         $meta_ids[] = "dns:all";
         $interval = DOWNLOAD_TIME_INTERVAL;
         $time = floor($site[CrawlConstants::DNS_TIME] / $interval) * $interval;
         $meta_ids[] = "dns:{$time}";
     }
     if (isset($site[CrawlConstants::LINKS])) {
         $num_links = count($site[CrawlConstants::LINKS]);
         $meta_ids[] = "numlinks:all";
         $meta_ids[] = "numlinks:{$num_links}";
         $link_urls = array_keys($site[CrawlConstants::LINKS]);
         $meta_ids[] = "link:all";
         foreach ($link_urls as $url) {
             $meta_ids[] = 'link:' . $url;
             $meta_ids[] = 'link:' . crawlHash($url);
         }
     }
     if (isset($site[CrawlConstants::LOCATION]) && is_array($site[CrawlConstants::LOCATION])) {
         foreach ($site[CrawlConstants::LOCATION] as $location) {
             $meta_ids[] = 'info:' . $location;
             $meta_ids[] = 'info:' . crawlHash($location);
             $meta_ids[] = 'location:all';
             $meta_ids[] = 'location:' . $location;
         }
     }
     if (isset($site[CrawlConstants::IP_ADDRESSES])) {
         $meta_ids[] = 'ip:all';
         foreach ($site[CrawlConstants::IP_ADDRESSES] as $address) {
             $meta_ids[] = 'ip:' . $address;
         }
     }
     $meta_ids[] = 'media:all';
     if ($video_sources != array()) {
         if (UrlParser::isVideoUrl($site[CrawlConstants::URL], $video_sources)) {
             $meta_ids[] = "media:video";
         } else {
             $meta_ids[] = stripos($site[CrawlConstants::TYPE], "image") !== false ? 'media:image' : 'media:text';
         }
     }
     // store the filetype info
     $url_type = UrlParser::getDocumentType($site[CrawlConstants::URL]);
     if (strlen($url_type) > 0) {
         $meta_ids[] = 'filetype:all';
         $meta_ids[] = 'filetype:' . $url_type;
     }
     if (isset($site[CrawlConstants::SERVER])) {
         $meta_ids[] = 'server:all';
         $meta_ids[] = 'server:' . strtolower($site[CrawlConstants::SERVER]);
     }
     if (isset($site[CrawlConstants::SERVER_VERSION])) {
         $meta_ids[] = 'version:all';
         $meta_ids[] = 'version:' . $site[CrawlConstants::SERVER_VERSION];
     }
     if (isset($site[CrawlConstants::OPERATING_SYSTEM])) {
         $meta_ids[] = 'os:all';
         $meta_ids[] = 'os:' . strtolower($site[CrawlConstants::OPERATING_SYSTEM]);
     }
     if (isset($site[CrawlConstants::MODIFIED])) {
         $modified = $site[CrawlConstants::MODIFIED];
         $meta_ids[] = 'modified:all';
         $meta_ids[] = 'modified:' . date('Y', $modified);
         $meta_ids[] = 'modified:' . date('Y-m', $modified);
         $meta_ids[] = 'modified:' . date('Y-m-d', $modified);
     }
     if (isset($site[CrawlConstants::TIMESTAMP])) {
         $date = $site[CrawlConstants::TIMESTAMP];
         $meta_ids[] = 'date:all';
         $meta_ids[] = 'date:' . date('Y', $date);
         $meta_ids[] = 'date:' . date('Y-m', $date);
         $meta_ids[] = 'date:' . date('Y-m-d', $date);
         $meta_ids[] = 'date:' . date('Y-m-d-H', $date);
         $meta_ids[] = 'date:' . date('Y-m-d-H-i', $date);
         $meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date);
     }
     if (isset($site[CrawlConstants::LANG])) {
         $meta_ids[] = 'lang:all';
         $lang_parts = explode("-", $site[CrawlConstants::LANG]);
         $meta_ids[] = 'lang:' . $lang_parts[0];
         if (isset($lang_parts[1])) {
             $meta_ids[] = 'lang:' . $site[CrawlConstants::LANG];
         }
     }
     if (isset($site[CrawlConstants::AGENT_LIST])) {
         foreach ($site[CrawlConstants::AGENT_LIST] as $agent) {
             $meta_ids[] = 'robot:' . strtolower($agent);
         }
     }
      //Add the subdoctype:all meta word for the subdoctype
     if (isset($site[CrawlConstants::SUBDOCTYPE])) {
         $meta_ids[] = $site[CrawlConstants::SUBDOCTYPE] . ':all';
     }
     return $meta_ids;
 }
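An illustrative call, assuming Yioop's bootstrap has loaded the CrawlConstants and PhraseParser classes. For a page like the one below, the returned meta words would include entries along the lines of site:all, path:/blog, code:200, filetype:html, server:apache, media:all, and lang:en (the exact site: and path: forms depend on UrlParser):

 <?php
 $site = array(
     CrawlConstants::URL => "http://www.example.com/blog/post.html",
     CrawlConstants::HTTP_CODE => 200,
     CrawlConstants::SERVER => "Apache",
     CrawlConstants::LANG => "en",
 );
 $meta_ids = PhraseParser::calculateMetas($site);
 print_r($meta_ids);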
Code Example #7
File: robot_processor.php  Project: yakar/yioop
 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * @param string $page text string of a document
  * @param string $url location the document came from, not used by
  *     TextProcessor at this point. Some of its subclasses override
  *     this method and use url to produce complete links for
  *     relative links within a document
  *
  * @return array a summary of (title, description, links, and content) of
  *     the information in $page
  */
 function process($page, $url)
 {
     $summary = NULL;
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     $add_rule_state = false;
     $rule_added_flag = false;
     $delay_flag = false;
     $delay = 0;
     foreach ($lines as $pre_line) {
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         $specificness = 0;
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 $summary[self::AGENT_LIST][] = $value;
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     $offset = $new_offset;
                 }
                 break;
             case "sitemap":
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if (!UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 if ($add_rule_state) {
                     $delay_flag = true;
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }
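A usage sketch for process(), assuming Yioop's robot_processor.php and its configuration constants (USER_AGENT_SHORT, MAX_URL_LEN, MAXIMUM_CRAWL_DELAY) are loaded and that the processor can be constructed with its default arguments:

 <?php
 $robots_txt = "User-agent: *\n" .
     "Disallow: /private/\n" .
     "Allow: /private/public.html\n" .
     "Crawl-delay: 10\n" .
     "Sitemap: http://www.example.com/sitemap.xml\n";
 $processor = new RobotProcessor();
 $summary = $processor->process($robots_txt,
     "http://www.example.com/robots.txt");
 print_r($summary[CrawlConstants::ROBOT_PATHS]);   // allowed/disallowed paths
 print_r($summary[CrawlConstants::LINKS]);         // the sitemap url
 echo $summary[CrawlConstants::CRAWL_DELAY], "\n"; // 10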
Code Example #8
File: register_controller.php  Project: yakar/yioop
 /**
  * Used to handle data from the suggest-a-url to crawl form
  * (suggest_view.php). Basically, it saves any data submitted to
  * a file which can then be imported in manageCrawls
  *
  * @return array $data contains fields with the current value for
  *     the url (if set but not submitted) as well as for a captcha
  */
 function suggestUrl()
 {
     $data["REFRESH"] = "suggest";
     $visitor_model = $this->model("visitor");
     $clear = false;
     if (CAPTCHA_MODE != IMAGE_CAPTCHA) {
         unset($_SESSION["captcha_text"]);
     }
     if (CAPTCHA_MODE != TEXT_CAPTCHA) {
         unset($_SESSION['CAPTCHA']);
         unset($_SESSION['CAPTCHA_ANSWERS']);
     }
     if (CAPTCHA_MODE != HASH_CAPTCHA) {
         $num_captchas = self::NUM_CAPTCHA_QUESTIONS;
         unset($_SESSION["request_time"]);
         unset($_SESSION["level"]);
         unset($_SESSION["random_string"]);
     } else {
         $data['INCLUDE_SCRIPTS'] = array("sha1", "hash_captcha");
     }
     if (!isset($_SESSION['BUILD_TIME']) || !isset($_REQUEST['build_time']) || $_SESSION['BUILD_TIME'] != $_REQUEST['build_time'] || $this->clean($_REQUEST['build_time'], "int") <= 0) {
         if (CAPTCHA_MODE == HASH_CAPTCHA) {
             $time = time();
             $_SESSION["request_time"] = $time;
             $_SESSION["level"] = self::HASH_CAPTCHA_LEVEL;
             $_SESSION["random_string"] = md5($time . AUTH_KEY);
         }
         $clear = true;
         if (isset($_REQUEST['url'])) {
             unset($_REQUEST['url']);
         }
         if (isset($_REQUEST['arg'])) {
             unset($_REQUEST['arg']);
         }
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
     } else {
         $data['build_time'] = $_SESSION['BUILD_TIME'];
     }
     $data['url'] = "";
     if (isset($_REQUEST['url'])) {
         $data['url'] = $this->clean($_REQUEST['url'], "string");
     }
     $missing = array();
     $save = isset($_REQUEST['arg']) && $_REQUEST['arg'];
     if (CAPTCHA_MODE == TEXT_CAPTCHA) {
         for ($i = 0; $i < $num_captchas; $i++) {
             $data["question_{$i}"] = "-1";
             if ($clear && isset($_REQUEST["question_{$i}"])) {
                 unset($_REQUEST["question_{$i}"]);
             }
         }
         if (!isset($_SESSION['CAPTCHA']) || !isset($_SESSION['CAPTCHA_ANSWERS'])) {
             list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
             $data['CAPTCHA'] = $captchas;
             $data['build_time'] = time();
             $_SESSION['BUILD_TIME'] = $data['build_time'];
             $_SESSION['CAPTCHA_ANSWERS'] = $answers;
             $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
         } else {
             $data['CAPTCHA'] = $_SESSION['CAPTCHA'];
         }
         for ($i = 0; $i < $num_captchas; $i++) {
             $field = "question_{$i}";
             $captchas = isset($_SESSION['CAPTCHA'][$i]) ? $_SESSION['CAPTCHA'][$i] : array();
             if ($save) {
                 if (!isset($_REQUEST[$field]) || $_REQUEST[$field] == "-1" || !in_array($_REQUEST[$field], $captchas)) {
                     $missing[] = $field;
                 } else {
                     $data[$field] = $_REQUEST[$field];
                 }
             }
         }
     }
     $data['MISSING'] = $missing;
     $fail = false;
     if (CAPTCHA_MODE == IMAGE_CAPTCHA && !$save) {
         $this->setupGraphicalCaptchaViewData($data);
     }
     if ($save && isset($_REQUEST['url'])) {
         $url = $this->clean($_REQUEST['url'], "string");
         $url_parts = @parse_url($url);
         if (!isset($url_parts['scheme'])) {
             $url = "http://" . $url;
         }
         $suggest_host = UrlParser::getHost($url);
         $scheme = UrlParser::getScheme($url);
         if (strlen($suggest_host) < 12 || !$suggest_host || !in_array($scheme, array("http", "https"))) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_invalid_url') . "</h1>');";
             $fail = true;
         } else {
             if ($missing != array()) {
                 $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_error_fields') . "</h1>');";
                 $fail = true;
             }
         }
         if (CAPTCHA_MODE == IMAGE_CAPTCHA && $fail) {
             $this->setupGraphicalCaptchaViewData($data);
         }
         if ($fail) {
             return $data;
         }
         switch (CAPTCHA_MODE) {
             case HASH_CAPTCHA:
                 if (!$this->validateHashCode()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_hashcode') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 break;
             case TEXT_CAPTCHA:
                 $fail = false;
                 if (!$this->checkCaptchaAnswers()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_human') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     $data['build_time'] = time();
                     $_SESSION['BUILD_TIME'] = $data['build_time'];
                     $fail = true;
                 }
                 for ($i = 0; $i < $num_captchas; $i++) {
                     $data["question_{$i}"] = "-1";
                 }
                 list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
                 $data['CAPTCHA'] = $captchas;
                 $_SESSION['CAPTCHA_ANSWERS'] = $answers;
                 $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
                 if ($fail) {
                     return $data;
                 }
                 break;
             case IMAGE_CAPTCHA:
                 $user_captcha_text = isset($_REQUEST['user_captcha_text']) ? $this->clean($_REQUEST['user_captcha_text'], "string") : "";
                 if (isset($_SESSION['captcha_text']) && $_SESSION['captcha_text'] != trim($user_captcha_text)) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_graphical_human') . "</h1>');";
                     unset($_SESSION['captcha_text']);
                     $this->setupGraphicalCaptchaViewData($data);
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 $this->setupGraphicalCaptchaViewData($data);
                 break;
         }
         // Handle cases where captcha was okay
         if (!$this->model("crawl")->appendSuggestSites($url)) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_suggest_full') . "</h1>');";
             return $data;
         }
         $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_url_submitted') . "</h1>');";
         $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "suggest_day_exceeded", ONE_DAY, ONE_DAY, MAX_SUGGEST_URLS_ONE_DAY);
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
         $data['url'] = "";
     }
     return $data;
 }
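The URL sanity check in the middle of suggestUrl() can be summarized by the following standalone sketch. The helper name is hypothetical; parse_url() stands in for UrlParser::getHost()/getScheme(), and the original additionally requires the getHost() value (which includes the scheme) to be at least 12 characters long:

 <?php
 function looksLikeSuggestableUrl($url)
 {
     $parts = @parse_url($url);
     if (!isset($parts['scheme'])) {
         $url = "http://" . $url;   // default the scheme, as suggestUrl() does
         $parts = @parse_url($url);
     }
     $host = isset($parts['host']) ? $parts['host'] : "";
     $scheme = isset($parts['scheme']) ? $parts['scheme'] : "";
     return $host != "" && in_array($scheme, array("http", "https"));
 }
 var_dump(looksLikeSuggestableUrl("www.example.com/page")); // bool(true)
 var_dump(looksLikeSuggestableUrl("ftp://example.com/"));   // bool(false)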
Code Example #9
File: arc_tool.php  Project: yakar/yioop
 /**
  * Used to recompute both the index shards and the dictionary
   * of an index archive. The first step involves re-extracting the
   * words into an inverted index from the summaries' web_archives.
  * Then a reindex is done.
  *
  * @param string $archive_path file path to an IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     $seen = 0;
     $generation = 0;
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             $num_to_get = min($partition->count - $seen_partition, 8000);
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
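                      // doc key: 8-byte hash of the url, then the page's
                      // content hash, then the host hash with its first byte
                      // replaced by "d" to mark a document (not link) record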
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 $offset = $object[0];
             }
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
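A minimal invocation sketch, assuming the ArcTool class defined in arc_tool.php can be instantiated directly and pointed at an IndexArchiveBundle directory; the path below is an illustrative assumption, not a value from the tool itself.

 // Hypothetical usage sketch of rebuildIndexArchive().
 $archive_path = "/path/to/work_directory/cache/IndexData1369754208";
 $arc_tool = new ArcTool();
 $arc_tool->rebuildIndexArchive($archive_path);
 // After the per-generation shards are rewritten, reindexIndexArchive()
 // rebuilds the dictionary from the new posting_doc_shards.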
Code Example #10
0
File: search_controller.php Project: yakar/yioop
 /**
  * Used to get and render a cached web page
  *
  * @param string $url the url of the page to find the cached version of
  * @param array $ui_flags array of UI features which
  *     should be added to the cache page. For example, "highlight"
  *     says search terms should be highlighted, "history"
  *     says add history navigation for all copies of this cache page in
  *     the Yioop system, "summaries" says add a link to toggle between
  *     headers and the extracted summary, and "cache_link_referrer" says
  *     a link on a cache page referred us to the current cache request
  * @param string $terms terms from the original query responsible for the
  *     cache request
  * @param int $crawl_time the timestamp of the crawl to look up the cached
  *     page in
  */
 function cacheRequestAndOutput($url, $ui_flags = array(), $terms = "", $crawl_time = 0)
 {
     global $CACHE, $IMAGE_TYPES;
     $crawl_model = $this->model("crawl");
     $flag = 0;
     $crawl_item = NULL;
     $all_future_times = array();
     $all_crawl_times = array();
     $all_past_times = array();
     //Check if the URL is from a cached page
      $cached_link = in_array("cache_link_referrer", $ui_flags);
     $hash_key = crawlHash($terms . $url . serialize($ui_flags) . serialize($crawl_time));
     if (USE_CACHE) {
         if ($new_doc = $CACHE->get($hash_key)) {
             echo $new_doc;
             return;
         }
     }
     $queue_servers = $this->model("machine")->getQueueServerUrls();
     if ($crawl_time == 0) {
         $crawl_time = $crawl_model->getCurrentIndexDatabaseName();
     }
     //Get all crawl times
     $crawl_times = array();
     $all_crawl_details = $crawl_model->getCrawlList(false, true, $queue_servers);
     foreach ($all_crawl_details as $crawl_details) {
         if ($crawl_details['CRAWL_TIME'] != 0) {
             array_push($crawl_times, $crawl_details['CRAWL_TIME']);
         }
     }
     for ($i = 0; $i < count($crawl_times); $i++) {
         $crawl_times[$i] = intval($crawl_times[$i]);
     }
     asort($crawl_times);
     //Get int value of current crawl time for comparison
     $crawl_time_int = intval($crawl_time);
     /* Search for all crawl times containing the cached
        version of the page for $url for multiple and single queue servers */
     list($network_crawl_times, $network_crawl_items) = $this->getCrawlItems($url, $crawl_times, $queue_servers);
     $nonnet_crawl_times = array_diff($crawl_times, $network_crawl_times);
     if (count($nonnet_crawl_times) > 0) {
         list($nonnet_crawl_times, $nonnet_crawl_items) = $this->getCrawlItems($url, $nonnet_crawl_times, NULL);
     } else {
         $nonnet_crawl_items = array();
     }
     $nonnet_crawl_times = array_diff($nonnet_crawl_times, $network_crawl_times);
     $all_crawl_times = array_values(array_merge($nonnet_crawl_times, $network_crawl_times));
     sort($all_crawl_times, SORT_STRING);
     //Get past and future crawl times
     foreach ($all_crawl_times as $time) {
         if ($time >= $crawl_time_int) {
             array_push($all_future_times, $time);
         } else {
             array_push($all_past_times, $time);
         }
     }
     /*Get the nearest timestamp (future or past)
      *Check in future first and if not found, check in past
      */
     if (!empty($all_future_times)) {
         $crawl_time = array_shift($all_future_times);
         array_push($all_future_times, $crawl_time);
         sort($all_future_times, SORT_STRING);
         if (in_array($crawl_time, $network_crawl_times)) {
             $queue_servers = $network_crawl_items['queue_servers'];
         } else {
             $queue_servers = $nonnet_crawl_items['queue_servers'];
         }
     } else {
         if (!empty($all_past_times)) {
             $crawl_time = array_pop($all_past_times);
             array_push($all_past_times, $crawl_time);
             sort($all_past_times, SORT_STRING);
             if (in_array($crawl_time, $network_crawl_times)) {
                 $queue_servers = $network_crawl_items['queue_servers'];
             } else {
                 $queue_servers = $nonnet_crawl_items['queue_servers'];
             }
         }
     }
     $this->model("phrase")->index_name = $crawl_time;
     $crawl_model->index_name = $crawl_time;
     $crawl_item = $crawl_model->getCrawlItem($url, $queue_servers);
     // A crawl item is able to override the default UI_FLAGS
     if (isset($crawl_item[self::UI_FLAGS]) && is_string($crawl_item[self::UI_FLAGS])) {
         $ui_flags = explode(",", $crawl_item[self::UI_FLAGS]);
     }
     $data = array();
     if ($crawl_item == NULL) {
         if ($cached_link == true) {
             header("Location: {$url}");
         } else {
             $data["URL"] = $url;
             $this->displayView("nocache", $data);
             return;
         }
     }
     $check_fields = array(self::TITLE, self::DESCRIPTION, self::LINKS);
     foreach ($check_fields as $field) {
         $crawl_item[$field] = isset($crawl_item[$field]) ? $crawl_item[$field] : "";
     }
     $summary_string = $this->crawlItemSummary($crawl_item);
     $robot_instance = $crawl_item[self::ROBOT_INSTANCE];
     $robot_table_name = CRAWL_DIR . "/" . self::robot_table_name;
     $robot_table = array();
     if (file_exists($robot_table_name)) {
         $robot_table = unserialize(file_get_contents($robot_table_name));
     }
     if (isset($robot_table[$robot_instance])) {
         $machine = $robot_table[$robot_instance][0];
         $machine_uri = $robot_table[$robot_instance][1];
     } else {
         //guess we are in a single machine setting
         $machine = UrlParser::getHost(NAME_SERVER);
         if ($machine[4] == 's') {
             // start with https://
             $machine = substr($machine, 8);
         } else {
             // start with http://
             $machine = substr($machine, 7);
         }
         $machine_uri = WEB_URI;
     }
     $instance_parts = explode("-", $robot_instance);
     $instance_num = false;
     if (count($instance_parts) > 1) {
         $instance_num = intval($instance_parts[0]);
     }
     $offset = $crawl_item[self::OFFSET];
     $cache_partition = $crawl_item[self::CACHE_PAGE_PARTITION];
     $cache_item = $crawl_model->getCacheFile($machine, $machine_uri, $cache_partition, $offset, $crawl_time, $instance_num);
     if (!isset($cache_item[self::PAGE])) {
         $data["URL"] = $url;
         $data["SUMMARY_STRING"] = "\n\n" . tl('search_controller_download_fetcher', $robot_instance) . "\n\n" . $summary_string;
         $this->displayView("nocache", $data);
         return;
     }
     if (isset($crawl_item[self::ROBOT_METAS]) && (in_array("NOARCHIVE", $crawl_item[self::ROBOT_METAS]) || in_array("NONE", $crawl_item[self::ROBOT_METAS]))) {
         $cache_file = "<div>'.\n                tl('search_controller_no_archive_page').'</div>";
     } else {
         $cache_file = $cache_item[self::PAGE];
     }
     if (isset($crawl_item[self::THUMB])) {
         $cache_file = $this->imageCachePage($url, $cache_item, $cache_file, $queue_servers);
         unset($ui_flags["highlight"]);
     }
     if (isset($crawl_item[self::KEYWORD_LINKS])) {
         $cache_item[self::KEYWORD_LINKS] = $crawl_item[self::KEYWORD_LINKS];
     }
     if (!isset($cache_item[self::ROBOT_INSTANCE])) {
         $cache_item[self::ROBOT_INSTANCE] = $robot_instance;
     }
     if (in_array('yioop_nav', $ui_flags) && !(isset($_SERVER['_']) && stristr($_SERVER['_'], 'hhvm') || isset($_SERVER['SERVER_SOFTWARE']) && $_SERVER['SERVER_SOFTWARE'] == "HPHP")) {
         $new_doc = $this->formatCachePage($cache_item, $cache_file, $url, $summary_string, $crawl_time, $all_crawl_times, $terms, $ui_flags);
     } else {
         $new_doc = $cache_file;
     }
     if (USE_CACHE) {
         $CACHE->set($hash_key, $new_doc);
     }
     echo $new_doc;
 }
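A minimal call sketch, assuming an existing SearchController instance; the URL, query terms, and crawl timestamp below are illustrative values, not literals from the Yioop source.

 // Hypothetical call; echoes the cached copy of the page directly.
 $search_controller->cacheRequestAndOutput(
     "http://www.example.com/",
     array("highlight", "history", "summaries"),
     "example query terms",
     1369754208   // crawl timestamp to look in; 0 means use the current index
 );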
Code Example #11
0
File: source_model.php Project: yakar/yioop
 /**
  * Copies all feed items newer than $age to a new shard, then deletes the
  * old index shard and database entries older than $age. Finally, sets the
  * copied shard to be active. If this method is going to take more than
  * max_execution_time/2 it returns false, so an additional job can be
  * scheduled; otherwise it returns true
  *
  * @param int $age how many seconds old a record should be before it is
  *     deleted
  * @return bool whether the job executed to completion
  */
 function rebuildFeedShard($age)
 {
     $time = time();
     $feed_shard_name = WORK_DIRECTORY . "/feeds/index";
     $prune_shard_name = WORK_DIRECTORY . "/feeds/prune_index";
     $prune_shard = new IndexShard($prune_shard_name);
     $too_old = $time - $age;
     if (!$prune_shard) {
         return false;
     }
     $pre_feeds = $this->getNewsSources();
     if (!$pre_feeds) {
         return false;
     }
     $feeds = array();
     foreach ($pre_feeds as $pre_feed) {
         if (!isset($pre_feed['NAME'])) {
             continue;
         }
         $feeds[$pre_feed['NAME']] = $pre_feed;
     }
     $db = $this->db;
     // we now rebuild the inverted index with the remaining items
     $sql = "SELECT * FROM FEED_ITEM " . "WHERE PUBDATE >= ? " . "ORDER BY PUBDATE DESC";
     $result = $db->execute($sql, array($too_old));
     if ($result) {
         $completed = true;
         crawlLog("..still deleting. Making new index of non-pruned items.");
         $i = 0;
         while ($item = $db->fetchArray($result)) {
             crawlTimeoutLog("..have added %s non-pruned items to index.", $i);
             $i++;
             if (!isset($item['SOURCE_NAME'])) {
                 continue;
             }
             $source_name = $item['SOURCE_NAME'];
             if (isset($feeds[$source_name])) {
                 $lang = $feeds[$source_name]['LANGUAGE'];
             } else {
                 $lang = "";
             }
             $phrase_string = $item["TITLE"] . " " . $item["DESCRIPTION"];
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $raw_guid = unbase64Hash($item["GUID"]);
             $doc_keys = crawlHash($item["LINK"], true) . $raw_guid . "d" . substr(crawlHash(UrlParser::getHost($item["LINK"]) . "/", true), 1);
             $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"]);
             $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
         }
     }
     $prune_shard->save();
     @chmod($prune_shard_name, 0777);
     @chmod($feed_shard_name, 0777);
     @rename($prune_shard_name, $feed_shard_name);
     @chmod($feed_shard_name, 0777);
     $sql = "DELETE FROM FEED_ITEM WHERE PUBDATE < ?";
     $db->execute($sql, array($too_old));
 }