/**
 * Sets contents of passed object to values from current object.
 *
 * If desired, this method can also make copies of all associated (fkey
 * referrer) objects.
 *
 * @param object $copyObj An object of Meta (or compatible) type.
 * @param boolean $deepCopy Whether to also copy all rows that refer (by fkey)
 *     to the current row.
 * @throws PropelException
 */
public function copyInto($copyObj, $deepCopy = false)
{
    $copyObj->setTitle($this->title);
    $copyObj->setKeywords($this->keywords);
    $copyObj->setDescription($this->description);
    $copyObj->setIsActive($this->is_active);
    $copyObj->setCreatedAt($this->created_at);
    if ($deepCopy) {
        // important: temporarily setNew(false) because this affects the
        // behavior of the getter/setter methods for fkey referrer objects.
        $copyObj->setNew(false);
        foreach ($this->getPartss() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addParts($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getAuthorss() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addAuthors($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getPhotoss() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addPhotos($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getPagess() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addPages($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getFaqs() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addFaq($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getArticless() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addArticles($relObj->copy($deepCopy));
            }
        }
        foreach ($this->getNewss() as $relObj) {
            if ($relObj !== $this) { // don't copy a reference to ourselves
                $copyObj->addNews($relObj->copy($deepCopy));
            }
        }
    } // if ($deepCopy)

    $copyObj->setNew(true);
    $copyObj->setId(NULL); // this is an auto-increment column, so set to default value
}
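/*
 * Illustrative usage sketch, not part of the generated class. It assumes a
 * populated Meta instance $meta and Propel's usual generated copy() helper,
 * which typically delegates to copyInto():
 *
 *   $deep = new Meta();
 *   $meta->copyInto($deep, true); // also copies fkey referrer objects
 *   $deep->save();                // saved as a new row, since id was reset to NULL
 *
 *   $shallow = $meta->copy();     // shallow copy of the column values only
 */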
/**
 * Add the array of $pages to the summaries WebArchiveBundle, storing them in
 * the partition $generation and recording the resulting offsets in the field
 * given by $offset_field.
 *
 * @param int $generation field used to select partition
 * @param string $offset_field field used to record offsets after storing
 * @param array& $pages data to store
 * @param int $visited_urls_count number to add to the count of visited urls
 *     (visited urls is a smaller number than the total count of objects
 *     stored in the index).
 */
function addPages($generation, $offset_field, &$pages, $visited_urls_count)
{
    $this->summaries->setWritePartition($generation);
    $this->summaries->addPages($offset_field, $pages);
    $this->summaries->addCount($visited_urls_count, "VISITED_URLS_COUNT");
}
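/*
 * Illustrative call sketch (the generation number and variable names are
 * hypothetical, for exposition only): store a batch of page summaries in
 * partition 3 and record where each one landed in its SUMMARY_OFFSET field.
 *
 *   $index_archive->addPages(3, CrawlConstants::SUMMARY_OFFSET,
 *       $seen_sites, $num_docs_in_batch);
 */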
/**
 * Adds the summary and index data in $file to the summary bundle and word
 * index
 *
 * @param string $file containing web page summaries and a mini-inverted
 *     index for their content
 * @param bool $blocking this method might be called by the indexer
 *     subcomponent when a merge tier phase is ongoing to allow for other
 *     processing to occur. If so, we don't want a regress where the indexer
 *     calls this code, which calls the indexer, etc. If the blocking flag is
 *     set then the indexer subcomponent won't be called
 */
function processIndexArchive($file, $blocking)
{
    static $blocked = false;
    if ($blocking && $blocked) {
        crawlLog("Indexer waiting for merge tiers to " .
            "complete before write partition. B");
        return;
    }
    if (!$blocking) {
        $blocked = false;
    }
    crawlLog("{$this->server_name} is starting to process index data," .
        " memory usage: " . memory_get_usage() . "...");
    crawlLog("Indexer: Processing index data in {$file}...");
    $start_time = microtime();
    $start_total_time = microtime();
    $pre_sites = webdecode(file_get_contents($file));
    $len_urls = unpackInt(substr($pre_sites, 0, 4));
    $seen_urls_string = substr($pre_sites, 4, $len_urls);
    $pre_sites = substr($pre_sites, 4 + $len_urls);
    $sites[self::SEEN_URLS] = array();
    $pos = 0;
    $num = 0;
    $bad = false;
    $max_batch_sites_and_links = SEEN_URLS_BEFORE_UPDATE_SCHEDULER *
        (max(MAX_LINKS_PER_PAGE, MAX_LINKS_PER_SITEMAP) + 1);
    while ($pos < $len_urls && $num <= $max_batch_sites_and_links) {
        crawlTimeoutLog("..Indexer still processing index data at position" .
            " %s out of %s", $pos, $len_urls);
        $len_site = unpackInt(substr($seen_urls_string, $pos, 4));
        if ($len_site > 2 * $this->page_range_request) {
            crawlLog("Indexer: Site string too long, {$len_site}," .
                " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $pos += 4;
        $site_string = substr($seen_urls_string, $pos, $len_site);
        $pos += strlen($site_string);
        $tmp = unserialize(gzuncompress($site_string));
        if (!$tmp || !is_array($tmp)) {
            crawlLog("Unserialized site data null or not an array," .
                " data file may be corrupted? Skip rest.");
            $bad = true;
            break;
        }
        $sites[self::SEEN_URLS][] = $tmp;
        $num++;
    }
    if ($num > $max_batch_sites_and_links * SEEN_URLS_BEFORE_UPDATE_SCHEDULER
        || $bad) {
        crawlLog("Index data file len_urls was {$len_urls} num was {$num}, " .
            "may be corrupt so skipping this file.");
        crawlLog("Indexer Done Index Processing File: {$file}. Total time: " .
            changeInMicrotime($start_total_time));
        unlink($file);
        return;
    }
    crawlLog("A. Indexer Load SEEN_URLS. Memory usage:" . memory_get_usage() .
        " time: " . changeInMicrotime($start_time));
    $sites[self::INVERTED_INDEX] = IndexShard::load("fetcher_shard",
        $pre_sites);
    unset($pre_sites);
    crawlLog("B. Indexer Load Sent shard. Memory usage:" . memory_get_usage() .
        " time: " . changeInMicrotime($start_time));
    $start_time = microtime();
    // do deduplication of summaries
    if (isset($sites[self::SEEN_URLS]) && count($sites[self::SEEN_URLS]) > 0) {
        $seen_sites = $sites[self::SEEN_URLS];
        $seen_sites = array_values($seen_sites);
        unset($sites[self::SEEN_URLS]);
        $num_seen = count($seen_sites);
        crawlLog("Indexer: SEEN_URLS array had {$num_seen} sites.");
    } else {
        $num_seen = 0;
    }
    $visited_urls_count = 0;
    $recent_urls_count = 0;
    $recent_urls = array();
    for ($i = 0; $i < $num_seen; $i++) {
        $seen_sites[$i][self::HASH_URL] =
            crawlHash($seen_sites[$i][self::URL], true);
        $link_url_parts = explode("|", $seen_sites[$i][self::URL]);
        if (strcmp("url", $link_url_parts[0]) == 0) {
            $reftype = strcmp("eref", $link_url_parts[4]) == 0 ? "e" : "i";
            $seen_sites[$i][self::HASH_URL] =
                crawlHash($link_url_parts[1], true) .
                crawlHash($seen_sites[$i][self::URL], true) .
                $reftype . substr(crawlHash(
                    UrlParser::getHost($link_url_parts[5]) . "/", true), 1);
            $seen_sites[$i][self::IS_DOC] = false;
        } else {
            $seen_sites[$i][self::IS_DOC] = true;
            $visited_urls_count++;
            array_push($recent_urls, $seen_sites[$i][self::URL]);
            if ($recent_urls_count >= NUM_RECENT_URLS_TO_DISPLAY) {
                array_shift($recent_urls);
            }
            $recent_urls_count++;
        }
    }
    if (isset($sites[self::INVERTED_INDEX])) {
        $index_shard =& $sites[self::INVERTED_INDEX];
        $generation = $this->index_archive->initGenerationToAdd(
            $index_shard->num_docs, $this, $blocking);
        if ($generation == -1) {
            crawlLog("Indexer waiting for merge tiers to " .
                "complete before write partition. A");
            $blocked = true;
            return;
        }
        $summary_offsets = array();
        if (isset($seen_sites)) {
            $this->index_archive->addPages($generation, self::SUMMARY_OFFSET,
                $seen_sites, $visited_urls_count);
            foreach ($seen_sites as $site) {
                if ($site[self::IS_DOC]) { // so not a link
                    $site_url = str_replace('|', "%7C", $site[self::URL]);
                    $host = UrlParser::getHost($site_url);
                    $hash = crawlHash($site_url, true) . $site[self::HASH] .
                        "d" . substr(crawlHash($host . "/", true), 1);
                } else {
                    $hash = $site[self::HASH_URL];
                }
                $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
            }
            unset($seen_sites);
        }
        crawlLog("C. Indexer init local shard, store " .
            "Summaries memory usage:" . memory_get_usage() .
            " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        // add summary offset info to inverted index data
        $index_shard->changeDocumentOffsets($summary_offsets);
        crawlLog("D. Indexer Update shard offsets. Memory usage:" .
            memory_get_usage() . " time: " . changeInMicrotime($start_time));
        $start_time = microtime();
        $this->index_archive->addIndexData($index_shard);
        $this->index_dirty = true;
    }
    crawlLog("E. Indexer Add index shard. Memory usage:" . memory_get_usage() .
        " time: " . changeInMicrotime($start_time));
    crawlLog("Indexer Done Index Processing File: {$file}. Total time: " .
        changeInMicrotime($start_total_time));
    if (isset($recent_urls)) {
        $sites[self::RECENT_URLS] =& $recent_urls;
        $this->writeCrawlStatus($sites);
    }
    if (file_exists($file)) {
        // Haven't tracked down why yet, but this file can end up being
        // deleted twice, giving a warning, so check it still exists first
        unlink($file);
    }
}
/**
 * Processes an array of downloaded web pages with the appropriate page
 * processor.
 *
 * Summary data is extracted from each non-robots.txt file in the array.
 * Disallowed paths and crawl-delays are extracted from robots.txt files.
 *
 * @param array $site_pages a collection of web pages to process
 * @return array summary data extracted from these pages
 */
function processFetchPages($site_pages)
{
    $PAGE_PROCESSORS = $this->page_processors;
    crawlLog("Start process pages... Current Memory:" . memory_get_usage());
    $start_time = microtime();
    $prefix = $this->fetcher_num . "-";
    $stored_site_pages = array();
    $summarized_site_pages = array();
    $num_items = $this->web_archive->count;
    $i = 0;
    foreach ($site_pages as $site) {
        $response_code = $site[self::HTTP_CODE];
        $was_error = false;
        if ($response_code < 200 || $response_code >= 300) {
            crawlLog($site[self::URL] . " response code {$response_code}");
            $host = UrlParser::getHost($site[self::URL]);
            if (!isset($this->hosts_with_errors[$host])) {
                $this->hosts_with_errors[$host] = 0;
            }
            if ($response_code >= 400 || $response_code < 100) {
                // < 100 will capture failures to connect which are returned
                // as strings
                $was_error = true;
                $this->hosts_with_errors[$host]++;
            }
            /* we print out errors to std output. We still go ahead and
               process the page. Maybe it is a cool error page; also, this
               makes sure we don't crawl it again */
        }
        // text/robot is my made-up mimetype for robots.txt files
        $was_robot_error = false;
        if (isset($site[self::ROBOT_PATHS])) {
            if (!$was_error) {
                $type = "text/robot";
            } else {
                $type = $site[self::TYPE];
                if ($response_code != 404) {
                    /* disallow crawling if robots.txt gave any error other
                       than not found */
                    $was_robot_error = true;
                    $site[self::ROBOT_PATHS][] = "/";
                }
            }
        } else {
            if (isset($site[self::FILE_NAME])) {
                $extension =
                    UrlParser::getDocumentType($site[self::FILE_NAME]);
                if ($extension ==
                    $this->programming_language_extension['java']) {
                    $type = "text/java";
                } else {
                    if ($extension ==
                        $this->programming_language_extension['py']) {
                        $type = "text/py";
                    } else {
                        $type = $site[self::TYPE];
                    }
                }
            } else {
                $type = $site[self::TYPE];
            }
        }
        $handled = false;
        /* deals with short URLs and directs them to the original link;
           for robots.txt we don't want to introduce stuff that can be
           mis-parsed (we follow redirects in this case anyway) */
        if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0
            && strcmp($type, "text/robot") != 0) {
            array_unshift($site[self::LOCATION], $site[self::URL]);
            $tmp_loc = array_pop($site[self::LOCATION]);
            $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
            // array_push() returns the new element count, not the array, so
            // don't assign its result back to $site[self::LOCATION]
            array_push($site[self::LOCATION], $tmp_loc);
            $doc_info = array();
            $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
            $doc_info[self::LOCATION] = true;
            $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " .
                $tmp_loc;
            $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
            $doc_info[self::TITLE] = $site[self::URL];
            $text_data = true;
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            $handled = true;
        } else {
            if (isset($PAGE_PROCESSORS[$type])) {
                $page_processor = $PAGE_PROCESSORS[$type];
                if (generalIsA($page_processor, "TextProcessor")) {
                    $text_data = true;
                } else {
                    $text_data = false;
                }
            } else {
                crawlLog("No page processor for mime type: " . $type);
                crawlLog("Not processing: " . $site[self::URL]);
                continue;
            }
        }
        if (!$handled) {
            if (isset($this->plugin_processors[$page_processor])) {
                $processor = new $page_processor(
                    $this->plugin_processors[$page_processor],
                    $this->max_description_len, $this->summarizer_option);
            } else {
                $processor = new $page_processor(array(),
                    $this->max_description_len, $this->summarizer_option);
            }
        }
        if (isset($site[self::PAGE]) && !$handled) {
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            // if not UTF-8 convert before doing anything else
            if (isset($site[self::ENCODING]) &&
                $site[self::ENCODING] != "UTF-8" &&
                $site[self::ENCODING] != "" &&
                generalIsA($page_processor, "TextProcessor")) {
                if (!@mb_check_encoding($site[self::PAGE],
                    $site[self::ENCODING])) {
                    crawlLog(" MB_CHECK_ENCODING FAILED!!");
                }
                crawlLog(" Converting from encoding " .
                    $site[self::ENCODING] . "...");
                // if HEBREW WINDOWS-1255 use ISO-8859-8 instead
                if (stristr($site[self::ENCODING], "1255")) {
                    $site[self::ENCODING] = "ISO-8859-8";
                    crawlLog(" using encoding " .
                        $site[self::ENCODING] . "...");
                }
                if (stristr($site[self::ENCODING], "1256")) {
                    $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                    crawlLog(" using Yioop hack encoding ...");
                } else {
                    $site[self::PAGE] = @mb_convert_encoding(
                        $site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                }
            }
            crawlLog(" Using Processor..." . $page_processor);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $tmp_url_store = $site[self::URL];
                $site[self::URL] = $site[self::FILE_NAME];
            }
            $doc_info = $processor->handle($site[self::PAGE],
                $site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $site[self::URL] = $tmp_url_store;
            }
            if (!$doc_info) {
                crawlLog(" Processing Yielded No Data For: " .
                    $site[self::URL]);
            }
            if ($page_processor != "RobotProcessor" &&
                !isset($doc_info[self::JUST_METAS])) {
                $this->pruneLinks($doc_info, CrawlConstants::LINKS,
                    $start_time);
            }
        } else {
            if (!$handled) {
                $doc_info = false;
            }
        }
        $not_loc = true;
        if ($doc_info) {
            $site[self::DOC_INFO] = $doc_info;
            if (isset($doc_info[self::LOCATION])) {
                $site[self::HASH] = crawlHash(crawlHash($site[self::URL],
                    true) . "LOCATION", true);
                $not_loc = false;
            }
            $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
            if (!is_dir(CRAWL_DIR . "/cache")) {
                mkdir(CRAWL_DIR . "/cache");
                $htaccess = "Options None\nphp_flag engine off\n";
                file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
            }
            if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                $site[self::PAGE] = $doc_info[self::PAGE];
            }
            if ($text_data) {
                if (isset($doc_info[self::PAGE])) {
                    $site[self::PAGE] = $doc_info[self::PAGE];
                } else {
                    $site[self::PAGE] = NULL;
                }
                if ($not_loc) {
                    $content = $doc_info[self::DESCRIPTION];
                    $site[self::HASH] = FetchUrl::computePageHash($content);
                }
            } else {
                $site[self::HASH] = FetchUrl::computePageHash(
                    $site[self::PAGE]);
            }
            if (isset($doc_info[self::WORD_CLOUD])) {
                $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
            } else {
                $site[self::WORD_CLOUD] = NULL;
            }
            if (isset($doc_info[self::CRAWL_DELAY])) {
                $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
            }
            if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
            }
            if (!isset($site[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array();
            }
            if (isset($doc_info[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array_merge(
                    $site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
            }
            // here's where we enforce NOFOLLOW
            if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                in_array("NONE", $site[self::ROBOT_METAS])) {
                $site[self::DOC_INFO][self::LINKS] = array();
            }
            if (isset($doc_info[self::AGENT_LIST])) {
                $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
            }
            $this->copySiteFields($i, $site, $summarized_site_pages,
                $stored_site_pages);
            $summarized_site_pages[$i][self::URL] =
                strip_tags($site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $summarized_site_pages[$i][self::TITLE] =
                    $site[self::FILE_NAME];
            } else {
                $summarized_site_pages[$i][self::TITLE] =
                    strip_tags($site[self::DOC_INFO][self::TITLE]);
                    // stripping html to be on the safe side
            }
            if (!isset($site[self::REPOSITORY_TYPE])) {
                if ($was_robot_error) {
                    $site[self::DOC_INFO][self::DESCRIPTION] =
                        "There was an HTTP error in trying to download " .
                        "this robots.txt file, so all paths to this site " .
                        "were disallowed by Yioop.\n" .
                        $site[self::DOC_INFO][self::DESCRIPTION];
                }
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
            } else {
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    $site[self::DOC_INFO][self::DESCRIPTION];
            }
            if (isset($site[self::DOC_INFO][self::JUST_METAS]) ||
                isset($site[self::ROBOT_PATHS])) {
                $summarized_site_pages[$i][self::JUST_METAS] = true;
            }
            if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        $site[self::DOC_INFO][self::META_WORDS];
                } else {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        array_merge(
                            $summarized_site_pages[$i][self::META_WORDS],
                            $site[self::DOC_INFO][self::META_WORDS]);
                }
            }
            if (isset($site[self::DOC_INFO][self::LANG])) {
                if ($site[self::DOC_INFO][self::LANG] == 'en' &&
                    $site[self::ENCODING] != "UTF-8") {
                    $site[self::DOC_INFO][self::LANG] =
                        guessLangEncoding($site[self::ENCODING]);
                }
                $summarized_site_pages[$i][self::LANG] =
                    $site[self::DOC_INFO][self::LANG];
            }
            if (isset($site[self::DOC_INFO][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    $site[self::DOC_INFO][self::LINKS];
            }
            if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                $summarized_site_pages[$i][self::WORD_CLOUD] =
                    $site[self::DOC_INFO][self::WORD_CLOUD];
            }
            if (isset($site[self::DOC_INFO][self::THUMB])) {
                $summarized_site_pages[$i][self::THUMB] =
                    $site[self::DOC_INFO][self::THUMB];
            }
            if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                $this->processSubdocs($i, $site, $summarized_site_pages,
                    $stored_site_pages);
            }
            if (isset($summarized_site_pages[$i][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    UrlParser::cleanRedundantLinks(
                        $summarized_site_pages[$i][self::LINKS],
                        $summarized_site_pages[$i][self::URL]);
            }
            if (!empty($this->classifiers)) {
                Classifier::labelPage($summarized_site_pages[$i],
                    $this->classifiers, $this->active_classifiers,
                    $this->active_rankers);
            }
            if ($this->page_rule_parser != NULL) {
                $this->page_rule_parser->executeRuleTrees(
                    $summarized_site_pages[$i]);
            }
            $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ?
                $summarized_site_pages[$i][self::ROBOT_METAS] : array();
            if (array_intersect($metas, array("NOARCHIVE", "NOINDEX",
                "JUSTFOLLOW", "NONE")) != array()) {
                $stored_site_pages[$i] = false;
            }
            $stored_site_pages[$i][self::INDEX] = $i;
            $i++;
        }
    } // end foreach over $site_pages
    $num_pages = count($stored_site_pages);
    $filter_stored = array_filter($stored_site_pages);
    if ($num_pages > 0 && $this->cache_pages) {
        $cache_page_partition = $this->web_archive->addPages(
            self::OFFSET, $filter_stored);
    } else {
        if ($num_pages > 0) {
            $this->web_archive->addCount(count($filter_stored));
        }
    }
    for ($i = 0; $i < $num_pages; $i++) {
        $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
    }
    foreach ($filter_stored as $stored) {
        $i = $stored[self::INDEX];
        if (isset($stored[self::OFFSET])) {
            $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
            $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
                $cache_page_partition;
        }
    }
    crawlLog(" Process pages time: " . changeInMicrotime($start_time) .
        " Current Memory: " . memory_get_usage());
    return $summarized_site_pages;
}
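/*
 * For reference, a minimal sketch of one entry of the $site_pages argument,
 * limited to fields this method actually reads (all values hypothetical):
 *
 *   $site_pages = array(
 *       array(
 *           self::URL => "http://www.example.com/",
 *           self::HTTP_CODE => 200,
 *           self::TYPE => "text/html",        // selects the page processor
 *           self::ENCODING => "UTF-8",
 *           self::PAGE => "<html>...</html>", // raw downloaded content
 *       ),
 *   );
 *   $summaries = $this->processFetchPages($site_pages);
 */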