Example #1
 /**
  * Sets contents of passed object to values from current object.
  *
  * If desired, this method can also make copies of all associated (fkey referrers)
  * objects.
  *
  * @param      object $copyObj An object of Meta (or compatible) type.
  * @param      boolean $deepCopy Whether to also copy all rows that refer (by fkey) to the current row.
  * @throws     PropelException
  */
 public function copyInto($copyObj, $deepCopy = false)
 {
     $copyObj->setTitle($this->title);
     $copyObj->setKeywords($this->keywords);
     $copyObj->setDescription($this->description);
     $copyObj->setIsActive($this->is_active);
     $copyObj->setCreatedAt($this->created_at);
     if ($deepCopy) {
         // important: temporarily setNew(false) because this affects the behavior of
         // the getter/setter methods for fkey referrer objects.
         $copyObj->setNew(false);
         foreach ($this->getPartss() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addParts($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getAuthorss() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addAuthors($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getPhotoss() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addPhotos($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getPagess() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addPages($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getFaqs() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addFaq($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getArticless() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addArticles($relObj->copy($deepCopy));
             }
         }
         foreach ($this->getNewss() as $relObj) {
             if ($relObj !== $this) {
                 // ensure that we don't try to copy a reference to ourselves
                 $copyObj->addNews($relObj->copy($deepCopy));
             }
         }
      } // if ($deepCopy)
      $copyObj->setNew(true);
      $copyObj->setId(NULL); // this is an auto-increment column, so set to default value
  }
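Propel also generates a public copy() wrapper that delegates to copyInto(). A minimal usage sketch, assuming a generated Meta model; the retrieveByPK() call and the key value are hypothetical:

  // fetch some existing row (hypothetical lookup)
  $meta = MetaPeer::retrieveByPK(1);
  // shallow copy: column values only
  $shallow = $meta->copy();
  // deep copy: fkey referrer objects (Parts, Authors, Photos, Pages,
  // Faq, Articles, News) are copied as well
  $deep = $meta->copy(true);
  // saving inserts a new row, since the id was reset to NULL above
  $deep->save();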
Example #2
 /**
  * Adds the array of $pages to the summaries WebArchiveBundle. The pages
  * are stored in the partition $generation, and the resulting offsets are
  * recorded in the field named by $offset_field.
  *
  * @param int $generation field used to select partition
  * @param string $offset_field field used to record offsets after storing
  * @param array& $pages data to store
  * @param int $visited_urls_count number to add to the count of visited urls
  *     (visited urls is a smaller number than the total count of objects
  *     stored in the index).
  */
 function addPages($generation, $offset_field, &$pages, $visited_urls_count)
 {
     $this->summaries->setWritePartition($generation);
     $this->summaries->addPages($offset_field, $pages);
     $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
 }
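Example #3 below calls this method with self::SUMMARY_OFFSET as the offset field. A hedged sketch of such a call site; the page array contents and the surrounding variables are assumptions:

  // store one batch of summaries in partition $generation, recording
  // where each was written under CrawlConstants::SUMMARY_OFFSET
  $pages = array(
      array(CrawlConstants::URL => "http://www.example.com/",
          CrawlConstants::TITLE => "Example"),
  );
  $visited_urls_count = 1; // link records would not be counted here
  $index_archive->addPages($generation, CrawlConstants::SUMMARY_OFFSET,
      $pages, $visited_urls_count);
  // $pages was passed by reference, so each entry now carries the
  // offset at which its summary was stored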
Example #3
 /**
  * Adds the summary and index data in $file to the summary bundle and word index
  *
  * @param string $file containing web pages summaries and a mini-inverted
  *     index for their content
  * @param bool $blocking this method might be called by the indexer
  *     subcomponent while a merge tier phase is ongoing, to allow
  *     other processing to occur. If so, we don't want a regress
  *     where the indexer calls this code, which calls the indexer,
  *     etc. If the blocking flag is set, the indexer subcomponent
  *     won't be called.
  */
 function processIndexArchive($file, $blocking)
 {
     static $blocked = false;
     if ($blocking && $blocked) {
         crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. B");
         return;
     }
     if (!$blocking) {
         $blocked = false;
     }
     crawlLog("{$this->server_name} is starting to process index data," . " memory usage: " . memory_get_usage() . "...");
     crawlLog("Indexer: Processing index data in {$file}...");
     $start_time = microtime();
     $start_total_time = microtime();
     $pre_sites = webdecode(file_get_contents($file));
     $len_urls = unpackInt(substr($pre_sites, 0, 4));
     $seen_urls_string = substr($pre_sites, 4, $len_urls);
     $pre_sites = substr($pre_sites, 4 + $len_urls);
     $sites[self::SEEN_URLS] = array();
     $pos = 0;
     $num = 0;
     $bad = false;
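      // upper bound on one batch: SEEN_URLS_BEFORE_UPDATE_SCHEDULER pages,
      // each contributing itself plus up to
      // max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) link records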
     $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER * (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
     while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
          crawlTimeoutLog("..Indexer still processing index data at position" . " %s out of %s", $pos, $len_urls);
         $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
         if ($len_site > 2 * $this->page_range_request) {
             crawlLog("Indexer: Site string too long, {$len_site}," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $pos += 4;
         $site_string = substr($seen_urls_string, $pos, $len_site);
         $pos += strlen($site_string);
         $tmp = unserialize(gzuncompress($site_string));
         if (!$tmp || !is_array($tmp)) {
             crawlLog("Compressed array null," . " data file may be corrupted? Skip rest.");
             $bad = true;
             break;
         }
         $sites[self::SEEN_URLS][] = $tmp;
         $num++;
     }
      // $max_batch_sites_and_links already includes the
      // SEEN_URLS_BEFORE_UPDATE_SCHEDULER factor, so compare against it directly
      if ($num > $max_batch_sites_and_links || $bad) {
         crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " . "may be corrupt so skipping this file.");
         crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
         unlink($file);
         return;
     }
     crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard", $pre_sites);
     unset($pre_sites);
     crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     $start_time = microtime();
     //do deduplication of summaries
     if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
         $seen_sites = $sites[self::SEEN_URLS];
         $seen_sites = array_values($seen_sites);
         unset($sites[self::SEEN_URLS]);
         $num_seen = count($seen_sites);
         crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
     } else {
         $num_seen = 0;
     }
     $visited_urls_count = 0;
     $recent_urls_count = 0;
     $recent_urls = array();
     for ($i = 0; $i < $num_seen; $i++) {
         $seen_sites[$i][self::HASH_URL] = crawlHash($seen_sites[$i][self::URL], true);
         $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
         if (strcmp("url", $link_url_parts[0]) == 0) {
             $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
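              // composite key: hash(link target url) . hash(link record url)
              // . reftype ("e" external / "i" internal) . truncated hash of
              // the linking host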
             $seen_sites[$i][self::HASH_URL] = crawlHash($link_url_parts[1], true) . crawlHash($seen_sites[$i][self::URL], true) . $reftype . substr(crawlHash(UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
             $seen_sites[$i][self::IS_DOC] = false;
         } else {
             $seen_sites[$i][self::IS_DOC] = true;
             $visited_urls_count++;
             array_push($recent_urls, $seen_sites[$i][self::URL]);
             if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                 array_shift($recent_urls);
             }
             $recent_urls_count++;
         }
     }
     if (isset($sites[self::INVERTED_INDEX])) {
         $index_shard =& $sites[self::INVERTED_INDEX];
         $generation = $this->index_archive->initGenerationToAdd($index_shard->num_docs, $this, $blocking);
         if ($generation == -1) {
             crawlLog("Indexer waiting for merge tiers to " . "complete before write partition. A");
             $blocked = true;
             return;
         }
         $summary_offsets = array();
         if (isset($seen_sites)) {
             $this->index_archive->addPages($generation, self::SUMMARY_OFFSET, $seen_sites, $visited_urls_count);
             foreach ($seen_sites as $site) {
                 if ($site[self::IS_DOC]) {
                      // a document summary, so not a link record
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $hash = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                 } else {
                     $hash = $site[self::HASH_URL];
                 }
                 $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
             }
             unset($seen_sites);
         }
         crawlLog("C. Indexer init local shard, store " . "Summaries memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         // added summary offset info to inverted index data
         $index_shard->changeDocumentOffsets($summary_offsets);
         crawlLog("D. Indexer Update shard offsets. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
         $start_time = microtime();
         $this->index_archive->addIndexData($index_shard);
         $this->index_dirty = true;
     }
     crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() . " time: " . changeInMicrotime($start_time));
     crawlLog("Indexer Done Index Processing File: {$file}. Total time: " . changeInMicrotime($start_total_time));
     if (isset($recent_urls)) {
         $sites[self::RECENT_URLS] =& $recent_urls;
         $this->writeCrawlStatus($sites);
     }
     if (file_exists($file)) {
          // not tracked down yet, but this can try to delete the file twice, giving a warning
         unlink($file);
     }
 }
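The seen-urls block parsed at the top of this method uses a simple length-prefixed framing: a 4-byte length for the whole block, then a sequence of records, each a 4-byte length followed by a serialized array compressed with gzcompress(). A standalone sketch of that record loop, assuming unpackInt() reads a 4-byte big-endian integer, as the substr(..., 0, 4) usage suggests:

  function readSeenUrlRecords($seen_urls_string)
  {
      $records = array();
      $pos = 0;
      $len = strlen($seen_urls_string);
      while ($pos < $len) {
          // each record: [4-byte big-endian length][compressed payload]
          $unpacked = unpack("N", substr($seen_urls_string, $pos, 4));
          $len_site = $unpacked[1];
          $pos += 4;
          $payload = substr($seen_urls_string, $pos, $len_site);
          $pos += strlen($payload);
          // payload is a gzcompress'd, serialized summary array
          $record = @unserialize(@gzuncompress($payload));
          if (!is_array($record)) {
              break; // corrupted data; mirror the "skip rest" behavior
          }
          $records[] = $record;
      }
      return $records;
  }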
Example #4
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non-robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
              /* we print out errors to std output. We still go ahead and
                 process the page; maybe it is a cool error page, and this
                 also makes sure we don't crawl it again
               */
         }
          // text/robot is my made-up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                      /*
                         disallow crawling if the robots.txt fetch gave
                         any error other than not found
                      */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                  if ($extension == $this->programming_language_extension['java']) {
                      $type = "text/java";
                  } elseif ($extension == $this->programming_language_extension['py']) {
                      $type = "text/py";
                  } else {
                      $type = $site[self::TYPE];
                  }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
          /* deals with short URLs and directs them to the original link;
             for robots.txt we don't want to introduce stuff that can be
             mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
              // note: array_push() returns the new count, so its result is
              // not assigned back to $site[self::LOCATION]
              array_push($site[self::LOCATION], $tmp_loc);
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
          } elseif (isset($PAGE_PROCESSORS[$type])) {
              $page_processor = $PAGE_PROCESSORS[$type];
              $text_data = generalIsA($page_processor, "TextProcessor");
          } else {
              crawlLog("No page processor for mime type: " . $type);
              crawlLog("Not processing: " . $site[self::URL]);
              continue;
          }
         if (!$handled) {
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
              // if not UTF-8, convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                  // if HEBREW WINDOWS-1255, use ISO-8859-8 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
          } elseif (!$handled) {
              $doc_info = false;
          }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were dsallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
      } // end foreach over $site_pages
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
      } elseif ($num_pages > 0) {
          $this->web_archive->addCount(count($filter_stored));
      }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }
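A hedged usage sketch for this method; the page-array shape is inferred from the fields read above, and the $fetcher instance and URL are assumptions:

  $site_pages = array(
      array(
          CrawlConstants::URL => "http://www.example.com/",
          CrawlConstants::HTTP_CODE => 200,
          CrawlConstants::TYPE => "text/html",
          CrawlConstants::ENCODING => "UTF-8",
          CrawlConstants::PAGE => "<html><head><title>Example</title>" .
              "</head><body>Hello</body></html>",
      ),
  );
  $summaries = $fetcher->processFetchPages($site_pages);
  foreach ($summaries as $summary) {
      // each summary carries URL, TITLE, DESCRIPTION, LINKS, etc.
      echo $summary[CrawlConstants::URL] . " => " .
          $summary[CrawlConstants::TITLE] . "\n";
  }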