/**
 * Checks that a relative link plus a base link is resolved to the expected
 * complete (canonical) link in a variety of situations: root and sub
 * directories, protocol-relative links, query strings, and fragments.
 */
function canonicalLinkTestCase()
{
    // Each row: array(relative_link, base_link, expected_canonical, label)
    $cases = array(
        array(".", "http://www.example.com/",
            "http://www.example.com/", "root dir0"),
        array("/bob.html", "http://www.example.com/",
            "http://www.example.com/bob.html", "root dir1"),
        array("bob.html", "http://www.example.com/",
            "http://www.example.com/bob.html", "root dir2"),
        array("bob", "http://www.example.com/",
            "http://www.example.com/bob", "root dir3"),
        array("bob", "http://www.example.com",
            "http://www.example.com/bob", "root dir4"),
        array("http://print.bob.com/bob", "http://www.example.com",
            "http://print.bob.com/bob", "root dir5"),
        array("/.", "http://www.example.com/",
            "http://www.example.com/", "root dir6"),
        array("//slashdot.org", "http://www.slashdot.org",
            "http://slashdot.org/", "slashdot dir"),
        array("bob", "http://www.example.com/a",
            "http://www.example.com/a/bob", "sub dir1"),
        array("../bob", "http://www.example.com/a",
            "http://www.example.com/bob", "sub dir2"),
        array("../../bob", "http://www.example.com/a",
            NULL, "sub dir3"),
        array("./bob", "http://www.example.com/a",
            "http://www.example.com/a/bob", "sub dir4"),
        array("bob.html?a=1", "http://www.example.com/a",
            "http://www.example.com/a/bob.html?a=1", "query 1"),
        array("bob?a=1&b=2", "http://www.example.com/a",
            "http://www.example.com/a/bob?a=1&b=2", "query 2"),
        array("/?a=1&b=2", "http://www.example.com/a",
            "http://www.example.com/?a=1&b=2", "query 3"),
        array("?a=1&b=2", "http://www.example.com/a",
            "http://www.example.com/a/?a=1&b=2", "query 4"),
        array("b/b.html?a=1&b=2", "http://www.example.com/a/c",
            "http://www.example.com/a/c/b/b.html?a=1&b=2", "query 5"),
        array("b/b.html?a=1&b=2?c=4", "http://www.example.com/a/c",
            "http://www.example.com/a/c/b/b.html?a=1&b=2?c=4", "query 6"),
        array("b#1", "http://www.example.com/",
            "http://www.example.com/b#1", "fragment 1"),
        array("b?a=1#1", "http://www.example.com/",
            "http://www.example.com/b?a=1#1", "fragment 2"),
        array("b?a=1#1#2", "http://www.example.com/",
            "http://www.example.com/b?a=1#1#2", "fragment 3"),
        array("#a", "http://www.example.com/c:d",
            "http://www.example.com/c:d#a", "fragment 4")
    );
    foreach ($cases as $case) {
        list($link, $base, $expected, $label) = $case;
        $this->assertEqual(
            UrlParser::canonicalLink($link, $base, false), $expected, $label);
    }
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information. Links are gathered from anchor tags,
 * frames/iframes, and images with alt text.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object (url => link text)
 */
static function links($dom, $site)
{
    $out_links = array();
    $xpath = new DOMXPath($dom);
    // A non-empty <base href> overrides $site as the base used to
    // resolve relative links
    $base_tags = $xpath->evaluate("/html//base");
    if ($base_tags->item(0)) {
        $base_href = $base_tags->item(0)->getAttribute('href');
        if (strlen($base_href) > 0) {
            $site = UrlParser::canonicalLink($base_href, $site);
        }
    }
    $num_links = 0;
    // Anchor tags: skip rel=nofollow links, record anchor text
    $anchors = $xpath->evaluate("/html/body//a");
    foreach ($anchors as $anchor) {
        if ($num_links >= MAX_LINKS_TO_EXTRACT) {
            continue;
        }
        $rel = $anchor->getAttribute("rel");
        if ($rel != "" && stristr($rel, "nofollow")) {
            continue;
        }
        $url = UrlParser::canonicalLink($anchor->getAttribute('href'),
            $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN ||
            $len <= 4) {
            continue;
        }
        $anchor_text =
            preg_replace("/\s+/", " ", strip_tags($anchor->nodeValue));
        if (isset($out_links[$url])) {
            // Same url seen again: concatenate its anchor texts
            $out_links[$url] .= " .. " . $anchor_text;
        } else {
            $out_links[$url] = $anchor_text;
        }
        $out_links[$url] =
            mb_substr($out_links[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
        $num_links++;
    }
    // Frame/iframe sources share the anchor-tag link budget
    $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
    foreach ($frames as $frame) {
        if ($num_links >= MAX_LINKS_TO_EXTRACT) {
            continue;
        }
        $url = UrlParser::canonicalLink($frame->getAttribute('src'),
            $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN ||
            $len <= 4) {
            continue;
        }
        if (isset($out_links[$url])) {
            $out_links[$url] .= " .. HTMLframe";
        } else {
            $out_links[$url] = "HTMLframe";
        }
        $num_links++;
    }
    /* Images with alt text get their own MAX_LINKS_TO_EXTRACT budget --
       the counter restarts here */
    $images = $xpath->evaluate("/html/body//img[@alt]");
    $num_links = 0;
    foreach ($images as $image) {
        if ($num_links >= MAX_LINKS_TO_EXTRACT) {
            continue;
        }
        $alt = $image->getAttribute('alt');
        if (strlen($alt) < 1) {
            continue;
        }
        $url = UrlParser::canonicalLink($image->getAttribute('src'),
            $site);
        $len = strlen($url);
        if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN ||
            $len <= 4) {
            continue;
        }
        if (isset($out_links[$url])) {
            $out_links[$url] .= " .. " . $alt;
        } else {
            $out_links[$url] = $alt;
        }
        $out_links[$url] =
            mb_substr($out_links[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
        $num_links++;
    }
    return $out_links;
}
/**
 * Handles admin request related to the managing the machines which perform
 * crawls
 *
 * With this activity an admin can add/delete machines to manage. For each
 * managed machine, the admin can stop and start fetchers/queue_servers
 * as well as look at their log files
 *
 * @return array $data MACHINES, their MACHINE_NAMES, data for
 *     FETCHER_NUMBERS drop-down
 */
function manageMachines()
{
    $parent = $this->parent;
    $machine_model = $parent->model("machine");
    $profile_model = $parent->model("profile");
    $data = array();
    $data["ELEMENT"] = "managemachines";
    // the only request arguments this activity responds to
    $possible_arguments = array("addmachine", "deletemachine", "newsmode",
        "log", "update");
    $data['SCRIPT'] = "doUpdate();";
    $data["leftorright"] = getLocaleDirection() == 'ltr' ? "right" : "left";
    $data['MACHINE_NAMES'] = array();
    // allowed choices for the number-of-fetchers drop-down
    $data['FETCHER_NUMBERS'] = array(0 => 0, 1 => 1, 2 => 2, 3 => 3,
        4 => 4, 5 => 5, 6 => 6, 7 => 7, 8 => 8, 16 => 16);
    // NOTE(review): $tmp is assigned but never read in this method --
    // possibly a leftover; verify before removing
    $tmp = tl('system_component_select_machine');
    // a replica machine never runs its own queue server
    if (!isset($_REQUEST["has_queue_server"]) ||
        isset($_REQUEST['is_replica'])) {
        $_REQUEST["has_queue_server"] = false;
    }
    if (isset($_REQUEST['is_replica'])) {
        $_REQUEST['num_fetchers'] = 0;
    } else {
        $_REQUEST['parent'] = "";
    }
    // clean the request fields used to describe a machine; $allset tracks
    // whether enough of them were supplied to add a machine
    $request_fields = array("name" => "string", "url" => "string",
        "has_queue_server" => "bool", "num_fetchers" => "int",
        "parent" => "string");
    $r = array();
    $allset = true;
    foreach ($request_fields as $field => $type) {
        if (isset($_REQUEST[$field])) {
            $r[$field] = $parent->clean($_REQUEST[$field], $type);
            if ($type == "string") {
                $r[$field] = trim($r[$field]);
                // "parent" is the one string field allowed to be empty
                if ($r[$field] == "" && $field != "parent") {
                    $allset = false;
                }
            }
            if ($field == "url") {
                // ensure trailing slash, then canonicalize relative to
                // the name server; a url that fails to canonicalize
                // invalidates the add request
                if (isset($r[$field][strlen($r[$field]) - 1]) &&
                    $r[$field][strlen($r[$field]) - 1] != "/") {
                    $r[$field] .= "/";
                }
                $r[$field] = UrlParser::canonicalLink($r[$field],
                    NAME_SERVER);
                if (!$r[$field]) {
                    $allset = false;
                }
            }
        } else {
            $allset = false;
        }
    }
    // only drop-down-listed fetcher counts are accepted; anything else
    // falls back to 0
    if (isset($r["num_fetchers"]) &&
        in_array($r["num_fetchers"], $data['FETCHER_NUMBERS'])) {
        $data['FETCHER_NUMBER'] = $r["num_fetchers"];
    } else {
        $data['FETCHER_NUMBER'] = 0;
        if (isset($r["num_fetchers"])) {
            $r["num_fetchers"] = 0;
        }
    }
    // a machine "exists" if either its name or its url is already managed
    $machine_exists =
        isset($r["name"]) &&
        $machine_model->checkMachineExists("NAME", $r["name"]) ||
        isset($r["url"]) &&
        $machine_model->checkMachineExists("URL", $r["url"]);
    if (isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        switch ($_REQUEST['arg']) {
            case "addmachine":
                // add only when every field was supplied and the machine
                // is not already managed
                if ($allset == true && !$machine_exists) {
                    $machine_model->addMachine($r["name"], $r["url"],
                        $r["has_queue_server"], $r["num_fetchers"],
                        $r["parent"]);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_added') .
                        "</h1>');";
                    $data['MACHINE_NAMES'][] = $r["name"];
                    $data['DELETABLE_MACHINES'][$r["name"]] = $r["name"];
                    sort($data['MACHINE_NAMES']);
                } else {
                    if ($allset && $machine_exists) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_machine_exists') .
                            "</h1>');";
                    } else {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_machine_incomplete') .
                            "</h1>');";
                    }
                }
                break;
            case "deletemachine":
                if (!$machine_exists) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_doesnt_exists') .
                        "</h1>');";
                } else {
                    // refuse deletion while any of the machine's
                    // services (queue server/fetchers) are running.
                    // NOTE(review): $total_rows is not initialized here;
                    // presumably getRows takes it by reference -- confirm
                    $machines = $machine_model->getRows(0, 1,
                        $total_rows,
                        array(array("name", "=", $r["name"], "")));
                    $service_in_use = false;
                    foreach ($machines as $machine) {
                        if ($machine['NAME'] == $r["name"]) {
                            if (isset($machine['STATUSES']) &&
                                is_array($machine['STATUSES']) &&
                                $machine['STATUSES'] != array()) {
                                $service_in_use = true;
                                break;
                            } else {
                                break;
                            }
                        }
                    }
                    if ($service_in_use) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_stop_service_first') .
                            "</h1>');";
                        break;
                    }
                    $machine_model->deleteMachine($r["name"]);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_deleted') .
                        "</h1>');";
                }
                break;
            case "newsmode":
                // switch how news feeds are updated; the news_updater
                // daemon only runs in "news_process" mode
                $profile = $profile_model->getProfile(WORK_DIRECTORY);
                $news_modes = array("news_off", "news_web",
                    "news_process");
                if (isset($_REQUEST['news_mode']) &&
                    in_array($_REQUEST['news_mode'], $news_modes)) {
                    $profile["NEWS_MODE"] = $_REQUEST['news_mode'];
                    if ($profile["NEWS_MODE"] != "news_process") {
                        CrawlDaemon::stop("news_updater", "", false);
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_news_mode_updated') .
                            "</h1>');";
                    } else {
                        CrawlDaemon::start("news_updater", 'none', "",
                            -1);
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('system_component_news_mode_updated') .
                            "</h1>');";
                    }
                    $profile_model->updateProfile(WORK_DIRECTORY,
                        array(), $profile);
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_news_update_failed') .
                        "</h1>');";
                }
                break;
            case "log":
                // show (and periodically refresh) a log file for a
                // fetcher, mirror, or queue server on a machine
                if (isset($_REQUEST["fetcher_num"])) {
                    $r["fetcher_num"] =
                        $parent->clean($_REQUEST["fetcher_num"], "int");
                }
                if (isset($_REQUEST["mirror_name"])) {
                    $r["mirror_name"] =
                        $parent->clean($_REQUEST["mirror_name"],
                        "string");
                }
                // next refresh is 30s after the requested time value
                if (isset($_REQUEST["time"])) {
                    $data["time"] =
                        $parent->clean($_REQUEST["time"], "int") + 30;
                } else {
                    $data["time"] = 30;
                }
                if (isset($_REQUEST["NO_REFRESH"])) {
                    $data["NO_REFRESH"] =
                        $parent->clean($_REQUEST["NO_REFRESH"], "bool");
                } else {
                    $data["NO_REFRESH"] = false;
                }
                $data["ELEMENT"] = "machinelog";
                $filter = "";
                if (isset($_REQUEST['f'])) {
                    $filter = $parent->clean($_REQUEST['f'], "string");
                }
                $data['filter'] = $filter;
                $data["REFRESH_LOG"] = "&time=" . $data["time"];
                $data["LOG_TYPE"] = "";
                // which log to fetch: fetcher if a fetcher_num was
                // given, else mirror, else queue server
                if (isset($r['fetcher_num']) && isset($r['name'])) {
                    $data["LOG_FILE_DATA"] =
                        $machine_model->getLog($r["name"],
                        $r["fetcher_num"], $filter);
                    $data["LOG_TYPE"] = $r['name'] . " fetcher " .
                        $r["fetcher_num"];
                    $data["REFRESH_LOG"] .= "&arg=log&name=" .
                        $r['name'] . "&fetcher_num=" . $r['fetcher_num'];
                } else {
                    if (isset($r["mirror_name"])) {
                        $data["LOG_TYPE"] = $r['mirror_name'] .
                            " mirror";
                        $data["LOG_FILE_DATA"] =
                            $machine_model->getLog($r["mirror_name"],
                            NULL, $filter, true);
                    } else {
                        if (isset($r['name'])) {
                            $data["LOG_TYPE"] = $r['name'] .
                                " queue_server";
                            // pseudo-machine "news" is the name server's
                            // news updater
                            if ($r['name'] == "news") {
                                $data["LOG_TYPE"] =
                                    "Name Server News Updater";
                            }
                            $data["LOG_FILE_DATA"] =
                                $machine_model->getLog($r["name"], NULL,
                                $filter);
                            $data["REFRESH_LOG"] .= "&arg=log&name=" .
                                $r['name'];
                        }
                    }
                }
                // stop auto-refreshing once the refresh interval has
                // grown to twenty minutes
                if ($data["time"] >= ONE_HOUR / 3) {
                    $data["REFRESH_LOG"] = "";
                }
                if (!isset($data["LOG_FILE_DATA"]) ||
                    $data["LOG_FILE_DATA"] == "") {
                    $data["LOG_FILE_DATA"] =
                        tl('system_component_no_machine_log');
                }
                // display newest log lines first
                $lines =
                    array_reverse(explode("\n", $data["LOG_FILE_DATA"]));
                $data["LOG_FILE_DATA"] = implode("\n", $lines);
                break;
            case "update":
                // start/stop a fetcher, queue server, or mirror on a
                // managed machine
                if (isset($_REQUEST["fetcher_num"])) {
                    $r["fetcher_num"] =
                        $parent->clean($_REQUEST["fetcher_num"], "int");
                } else {
                    $r["fetcher_num"] = NULL;
                }
                $available_actions = array("start", "stop",
                    "mirror_start", "mirror_stop");
                if (isset($r["name"]) && isset($_REQUEST["action"]) &&
                    in_array($_REQUEST["action"], $available_actions)) {
                    $action = $_REQUEST["action"];
                    $is_mirror = false;
                    // mirror_* actions map to start/stop with the
                    // mirror flag set
                    if ($action == "mirror_start") {
                        $action = "start";
                        $is_mirror = true;
                    } else {
                        if ($action == "mirror_stop") {
                            $action = "stop";
                            $is_mirror = true;
                        }
                    }
                    $machine_model->update($r["name"], $action,
                        $r["fetcher_num"], $is_mirror);
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_servers_updated') .
                        "</h1>');";
                } else {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('system_component_machine_no_action') .
                        "</h1>');";
                }
                break;
        }
    }
    $parent->pagingLogic($data, $machine_model, "MACHINE",
        DEFAULT_ADMIN_PAGING_NUM);
    if (!isset($_REQUEST['arg']) || $_REQUEST['arg'] != 'log') {
        $data['SCRIPT'] .= "toggleReplica(false);";
    }
    return $data;
}
/**
 * Returns links from the supplied dom object of a sitemap
 * where links have been canonicalized according to
 * the supplied $site information. We allow more links from a sitemap
 * than from other kinds of documents (up to MAX_LINKS_PER_SITEMAP).
 * For now we are ignoring weighting info
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array links from the $dom object
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $xpath->registerNamespace('s',
        "http://www.sitemaps.org/schemas/sitemap/0.9");
    /* a sitemap is either a flat list of urls or an index pointing at
       further sitemap files */
    $paths = array("/s:urlset/s:url/s:loc",
        "/s:sitemapindex/s:sitemap/s:loc");
    $i = 0;
    foreach ($paths as $path) {
        $nodes = @$xpath->evaluate($path);
        foreach ($nodes as $node) {
            $url = UrlParser::canonicalLink($node->textContent, $site);
            if ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                UrlParser::getDocumentType($url) == "gz" ||
                strlen($url) >= MAX_URL_LEN) {
                //at this point we can't handle gzip'd sitemaps
                continue;
            }
            $sites[$url] = "From sitemap of " . $site;
            $i++;
            /* was ($i > MAX_LINKS_PER_SITEMAP), an off-by-one that let
               one extra link through; >= caps the count at exactly
               MAX_LINKS_PER_SITEMAP */
            if ($i >= MAX_LINKS_PER_SITEMAP) {
                break 2;
            }
        }
    }
    return $sites;
}
/**
 * Handles admin request related to the search sources activity
 *
 * The search sources activity allows a user to add/delete search sources
 * for video and news, it also allows a user to control which subsearches
 * appear on the SearchView page
 *
 * @return array $data info about current search sources, and current
 *     sub-searches
 */
function searchSources()
{
    $parent = $this->parent;
    $crawl_model = $parent->model("crawl");
    $source_model = $parent->model("source");
    // the only request arguments this activity responds to
    $possible_arguments = array("addsource", "deletesource",
        "addsubsearch", "deletesubsearch", "editsource", "editsubsearch");
    $data = array();
    $data["ELEMENT"] = "searchsources";
    $data['SCRIPT'] = "";
    $data['SOURCE_TYPES'] = array(-1 => tl('crawl_component_media_kind'),
        "video" => tl('crawl_component_video'),
        "rss" => tl('crawl_component_rss_feed'),
        "html" => tl('crawl_component_html_feed'));
    $source_type_flag = false;
    if (isset($_REQUEST['type']) && in_array($_REQUEST['type'],
        array_keys($data['SOURCE_TYPES']))) {
        $data['SOURCE_TYPE'] = $_REQUEST['type'];
        $source_type_flag = true;
    } else {
        $data['SOURCE_TYPE'] = -1;
    }
    /* build the drop-down of indexes/mixes a subsearch can be based on:
       crawl indexes get keys "i:timestamp", mixes "m:timestamp" */
    $machine_urls = $parent->model("machine")->getQueueServerUrls();
    $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls);
    $data["SEARCH_LISTS"] = array(-1 =>
        tl('crawl_component_sources_indexes'));
    foreach ($search_lists as $item) {
        $data["SEARCH_LISTS"]["i:" . $item["CRAWL_TIME"]] =
            $item["DESCRIPTION"];
    }
    if (isset($_SESSION['USER_ID'])) {
        $user = $_SESSION['USER_ID'];
    } else {
        $user = $_SERVER['REMOTE_ADDR'];
    }
    $search_lists = $crawl_model->getMixList($user);
    foreach ($search_lists as $item) {
        $data["SEARCH_LISTS"]["m:" . $item["TIMESTAMP"]] = $item["NAME"];
    }
    // allowed results-per-page choices for a subsearch
    $n = NUM_RESULTS_PER_PAGE;
    $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n,
        5 * $n => 5 * $n, 10 * $n => 10 * $n);
    if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'],
        array_keys($data['PER_PAGE']))) {
        $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page'];
    } else {
        $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
    }
    $locales = $parent->model("locale")->getLocaleList();
    $data["LANGUAGES"] = array();
    foreach ($locales as $locale) {
        $data["LANGUAGES"][$locale['LOCALE_TAG']] =
            $locale['LOCALE_NAME'];
    }
    if (isset($_REQUEST['language']) && in_array($_REQUEST['language'],
        array_keys($data["LANGUAGES"]))) {
        $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language'];
    } else {
        $data['SOURCE_LOCALE_TAG'] = DEFAULT_LOCALE;
    }
    // defaults used to (re)populate the add/edit forms
    $data["CURRENT_SOURCE"] = array("name" => "",
        "type" => $data['SOURCE_TYPE'], "source_url" => "",
        "aux_info" => "", 'channel_path' => "", 'item_path' => "",
        'title_path' => "", 'description_path' => "", 'link_path' => "",
        "language" => $data['SOURCE_LOCALE_TAG']);
    $data["CURRENT_SUBSEARCH"] = array("locale_string" => "",
        "folder_name" => "", "index_identifier" => "",
        "per_page" => $data['PER_PAGE_SELECTED']);
    $data['SOURCE_FORM_TYPE'] = "addsource";
    $data["SEARCH_FORM_TYPE"] = "addsubsearch";
    if (isset($_REQUEST['arg']) &&
        in_array($_REQUEST['arg'], $possible_arguments)) {
        switch ($_REQUEST['arg']) {
            case "addsource":
                if (!$source_type_flag) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_source_type') .
                        "</h1>');";
                    break;
                }
                $must_have = array("name", "type", 'source_url');
                // html feeds additionally require the xpaths used to
                // scrape items out of the page
                $is_html_feed = false;
                if (isset($_REQUEST['type']) &&
                    $_REQUEST['type'] == 'html') {
                    $is_html_feed = true;
                    $must_have = array_merge($must_have,
                        array('channel_path', 'item_path', 'title_path',
                        'description_path', 'link_path'));
                }
                $to_clean = array_merge($must_have,
                    array('aux_info', 'language'));
                foreach ($to_clean as $clean_me) {
                    $r[$clean_me] = isset($_REQUEST[$clean_me]) ?
                        trim($parent->clean($_REQUEST[$clean_me],
                        "string")) : "";
                    if ($clean_me == "source_url") {
                        /* removed a leftover debug
                           echo $r[$clean_me] . "\n";
                           which emitted raw output from this admin
                           controller */
                        $r[$clean_me] = UrlParser::canonicalLink(
                            $r[$clean_me], NAME_SERVER);
                        if (!$r[$clean_me]) {
                            $data['SCRIPT'] .=
                                "doMessage('<h1 class=\"red\" >" .
                                tl('crawl_component_invalid_url') .
                                "</h1>');";
                            break 2;
                        }
                    }
                    if (in_array($clean_me, $must_have) &&
                        $r[$clean_me] == "") {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('crawl_component_missing_fields') .
                            "</h1>');";
                        break 2;
                    }
                }
                if ($is_html_feed) {
                    // the five xpaths are packed into aux_info
                    $r['aux_info'] = $r['channel_path'] . "###" .
                        $r['item_path'] . "###" . $r['title_path'] .
                        "###" . $r['description_path'] . "###" .
                        $r['link_path'];
                }
                $source_model->addMediaSource($r['name'], $r['type'],
                    $r['source_url'], $r['aux_info'], $r['language']);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_media_source_added') .
                    "</h1>');";
                break;
            case "addsubsearch":
                $to_clean = array("folder_name", 'index_identifier');
                $must_have = $to_clean;
                foreach ($to_clean as $clean_me) {
                    $r[$clean_me] = isset($_REQUEST[$clean_me]) ?
                        trim($parent->clean($_REQUEST[$clean_me],
                        "string")) : "";
                    // -1 is the "no index selected" drop-down value
                    if (in_array($clean_me, $must_have) &&
                        ($r[$clean_me] == "" || $r[$clean_me] == -1)) {
                        $data['SCRIPT'] .=
                            "doMessage('<h1 class=\"red\" >" .
                            tl('crawl_component_missing_fields') .
                            "</h1>');";
                        break 2;
                    }
                }
                $source_model->addSubsearch($r['folder_name'],
                    $r['index_identifier'], $data['PER_PAGE_SELECTED']);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_subsearch_added') . "</h1>');";
                break;
            case "deletesource":
                if (!isset($_REQUEST['ts'])) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_delete_source') .
                        "</h1>');";
                    break;
                }
                $timestamp = $parent->clean($_REQUEST['ts'], "string");
                $source_model->deleteMediaSource($timestamp);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_media_source_deleted') .
                    "</h1>');";
                break;
            case "deletesubsearch":
                if (!isset($_REQUEST['fn'])) {
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_no_delete_source') .
                        "</h1>');";
                    break;
                }
                $folder_name = $parent->clean($_REQUEST['fn'], "string");
                $source_model->deleteSubsearch($folder_name);
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                    tl('crawl_component_subsearch_deleted') .
                    "</h1>');";
                break;
            case "editsubsearch":
                $data['SEARCH_FORM_TYPE'] = "editsubsearch";
                $subsearch = false;
                $folder_name = isset($_REQUEST['fn']) ?
                    $parent->clean($_REQUEST['fn'], "string") : "";
                if ($folder_name) {
                    $subsearch = $source_model->getSubsearch(
                        $folder_name);
                }
                if (!$subsearch) {
                    /* was SOURCE_FORM_TYPE, which belongs to the media
                       source form; on a missing subsearch it is the
                       subsearch form that should revert to add mode */
                    $data['SEARCH_FORM_TYPE'] = "addsubsearch";
                    break;
                }
                $data['fn'] = $folder_name;
                // overlay any request-supplied fields onto the stored
                // subsearch, tracking whether anything changed
                $update = false;
                foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) {
                    $upper_field = strtoupper($field);
                    if (isset($_REQUEST[$field]) && $field != 'name') {
                        $subsearch[$upper_field] =
                            $parent->clean($_REQUEST[$field], "string");
                        $data['CURRENT_SUBSEARCH'][$field] =
                            $subsearch[$upper_field];
                        $update = true;
                    } else {
                        if (isset($subsearch[$upper_field])) {
                            $data['CURRENT_SUBSEARCH'][$field] =
                                $subsearch[$upper_field];
                        }
                    }
                }
                if ($update) {
                    $source_model->updateSubsearch($subsearch);
                    // use .= (was =) so earlier script content is kept,
                    // consistent with every other branch
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_subsearch_updated') .
                        "</h1>');";
                }
                break;
            case "editsource":
                $data['SOURCE_FORM_TYPE'] = "editsource";
                $source = false;
                $timestamp = isset($_REQUEST['ts']) ?
                    $parent->clean($_REQUEST['ts'], "string") : "";
                if ($timestamp) {
                    $source = $source_model->getMediaSource($timestamp);
                }
                if (!$source) {
                    $data['SOURCE_FORM_TYPE'] = "addsource";
                    break;
                }
                $data['ts'] = $timestamp;
                $update = false;
                $is_html_feed = false;
                if ($source['TYPE'] == 'html') {
                    // unpack the five xpaths stored in aux_info
                    $is_html_feed = true;
                    list($source['CHANNEL_PATH'], $source['ITEM_PATH'],
                        $source['TITLE_PATH'],
                        $source['DESCRIPTION_PATH'],
                        $source['LINK_PATH']) =
                        explode("###", $source['AUX_INFO']);
                }
                // overlay request-supplied fields onto the stored source
                foreach ($data['CURRENT_SOURCE'] as $field => $value) {
                    $upper_field = strtoupper($field);
                    if (isset($_REQUEST[$field]) && $field != 'name') {
                        $source[$upper_field] =
                            $parent->clean($_REQUEST[$field], "string");
                        $data['CURRENT_SOURCE'][$field] =
                            $source[$upper_field];
                        $update = true;
                    } else {
                        if (isset($source[$upper_field])) {
                            $data['CURRENT_SOURCE'][$field] =
                                $source[$upper_field];
                        }
                    }
                }
                if ($update) {
                    if ($is_html_feed) {
                        $source['AUX_INFO'] = $source['CHANNEL_PATH'] .
                            "###" . $source['ITEM_PATH'] . "###" .
                            $source['TITLE_PATH'] . "###" .
                            $source['DESCRIPTION_PATH'] . "###" .
                            $source['LINK_PATH'];
                    }
                    // the unpacked path fields are not database columns
                    unset($source['CHANNEL_PATH']);
                    unset($source['ITEM_PATH']);
                    unset($source['TITLE_PATH']);
                    unset($source['DESCRIPTION_PATH']);
                    unset($source['LINK_PATH']);
                    $source_model->updateMediaSource($source);
                    // use .= (was =) so earlier script content is kept
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" .
                        tl('crawl_component_media_source_updated') .
                        "</h1>');";
                }
                break;
        }
    }
    $data['CAN_LOCALIZE'] = $parent->model("user")->
        isAllowedUserActivity($_SESSION['USER_ID'], "manageLocales");
    $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES",
        DEFAULT_ADMIN_PAGING_NUM / 5,
        array(array("NAME", "", "", "ASC")));
    $parent->pagingLogic($data, $source_model, "SUBSEARCHES",
        DEFAULT_ADMIN_PAGING_NUM / 5,
        array(array("FOLDER_NAME", "", "", "ASC")), "SUB", "SUBSEARCH");
    // drop subsearches whose backing index/mix no longer exists
    foreach ($data["SUBSEARCHES"] as $search) {
        if (!isset(
            $data["SEARCH_LISTS"][trim($search['INDEX_IDENTIFIER'])])) {
            $source_model->deleteSubsearch($search["FOLDER_NAME"]);
        }
    }
    $data['SCRIPT'] .= "source_type = elt('source-type');" .
        "source_type.onchange = switchSourceType;" .
        "switchSourceType()";
    return $data;
}
/**
 * Parses the contents of a robots.txt page extracting allowed,
 * disallowed paths, crawl-delay, and sitemaps. We also extract a
 * list of all user agent strings seen.
 *
 * @param string $page text string of a robots.txt document
 * @param string $url location the document came from; used to
 *     canonicalize sitemap links found in the file
 *
 * @return array a summary of the robot paths, sitemap links,
 *     crawl delay, and user agents in $page
 */
function process($page, $url)
{
    // use array() rather than relying on null auto-vivification
    $summary = array();
    $summary[self::TITLE] = "";
    $summary[self::DESCRIPTION] = "";
    $summary[self::LANG] = NULL;
    $summary[self::ROBOT_PATHS] = array();
    $summary[self::AGENT_LIST] = array();
    $summary[self::LINKS] = array();
    $host_url = UrlParser::getHost($url);
    $lines = explode("\n", $page);
    // whether rules on the current line apply to this crawler
    $add_rule_state = false;
    // NOTE(review): $rule_added_flag is set but never read here
    $rule_added_flag = false;
    $delay_flag = false;
    $delay = 0;
    /* specificness: 1 once a user-agent line exactly matched
       USER_AGENT_SHORT, 0 otherwise. Fixed: this was reset to 0 inside
       the loop for every line, so an exact-match group's rules could be
       wrongly discarded when a later wildcard group was seen; it must
       persist across lines */
    $specificness = 0;
    foreach ($lines as $pre_line) {
        // strip comments; lines without a colon are ignored
        $pre_line_parts = explode("#", $pre_line);
        $line = $pre_line_parts[0];
        $line_parts = explode(":", $line);
        if (!isset($line_parts[1])) {
            continue;
        }
        $field = array_shift($line_parts);
        // value may itself contain colons (e.g. sitemap urls)
        $value = implode(":", $line_parts);
        //notice we lower case field, so switch below is case insensitive
        $field = strtolower(trim($field));
        $value = trim($value);
        if (strlen($value) == 0) {
            continue;
        }
        switch ($field) {
            case "user-agent":
                //we allow * in user agent string
                $summary[self::AGENT_LIST][] = $value;
                $current_specificness =
                    strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                if ($current_specificness < $specificness) {
                    // already in an exact-match group; ignore less
                    // specific agent lines
                    break;
                }
                if ($specificness < $current_specificness) {
                    //Give precedence to exact match on agent string
                    $specificness = $current_specificness;
                    $add_rule_state = true;
                    $summary[self::ROBOT_PATHS] = array();
                    break;
                }
                // wildcard match: every non-* chunk must appear in
                // order within USER_AGENT_SHORT
                $agent_parts = explode("*", $value);
                $offset = 0;
                $add_rule_state = true;
                foreach ($agent_parts as $part) {
                    if ($part == "") {
                        continue;
                    }
                    $new_offset =
                        stripos(USER_AGENT_SHORT, $part, $offset);
                    if ($new_offset === false) {
                        $add_rule_state = false;
                        break;
                    }
                    $offset = $new_offset;
                }
                break;
            case "sitemap":
                $tmp_url = UrlParser::canonicalLink($value, $host_url);
                if (!UrlParser::checkRecursiveUrl($tmp_url) &&
                    strlen($tmp_url) < MAX_URL_LEN) {
                    $summary[self::LINKS][] = $tmp_url;
                }
                break;
            case "allow":
                if ($add_rule_state) {
                    $rule_added_flag = true;
                    $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] =
                        $this->makeCanonicalRobotPath($value);
                }
                break;
            case "disallow":
                if ($add_rule_state) {
                    $rule_added_flag = true;
                    $summary[self::ROBOT_PATHS][
                        self::DISALLOWED_SITES][] =
                        $this->makeCanonicalRobotPath($value);
                }
                break;
            case "crawl-delay":
                if ($add_rule_state) {
                    $delay_flag = true;
                    $delay = max($delay, intval($value));
                }
                break;
        }
    }
    if ($delay_flag) {
        // a crawl delay beyond our maximum means skip the site entirely
        if ($delay > MAXIMUM_CRAWL_DELAY) {
            $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
        } else {
            $summary[self::CRAWL_DELAY] = $delay;
        }
    }
    $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) .
        "</pre></body></html>";
    return $summary;
}
/**
 * Processes an array of downloaded web pages with the appropriate page
 * processor.
 *
 * Summary data is extracted from each non robots.txt file in the array.
 * Disallowed paths and crawl-delays are extracted from robots.txt files.
 *
 * @param array $site_pages a collection of web pages to process
 * @return array summary data extracted from these pages
 */
function processFetchPages($site_pages)
{
    $PAGE_PROCESSORS = $this->page_processors;
    crawlLog("Start process pages... Current Memory:" .
        memory_get_usage());
    $start_time = microtime();
    $prefix = $this->fetcher_num . "-";
    $stored_site_pages = array();
    $summarized_site_pages = array();
    $num_items = $this->web_archive->count;
    $i = 0;
    foreach ($site_pages as $site) {
        $response_code = $site[self::HTTP_CODE];
        $was_error = false;
        if ($response_code < 200 || $response_code >= 300) {
            crawlLog($site[self::URL] .
                " response code {$response_code}");
            $host = UrlParser::getHost($site[self::URL]);
            if (!isset($this->hosts_with_errors[$host])) {
                $this->hosts_with_errors[$host] = 0;
            }
            if ($response_code >= 400 || $response_code < 100) {
                // < 100 will capture failures to connect which are
                // returned as strings
                $was_error = true;
                $this->hosts_with_errors[$host]++;
            }
            /* we print out errors to std output. We still go ahead and
               process the page. Maybe it is a cool error page, also
               this makes sure we don't crawl it again */
        }
        // text/robot is my made up mimetype for robots.txt files
        $was_robot_error = false;
        if (isset($site[self::ROBOT_PATHS])) {
            if (!$was_error) {
                $type = "text/robot";
            } else {
                $type = $site[self::TYPE];
                if ($response_code != 404) {
                    /* disallow crawling if robots.txt was any error
                       other that not found */
                    $was_robot_error = true;
                    $site[self::ROBOT_PATHS][] = "/";
                }
            }
        } else {
            if (isset($site[self::FILE_NAME])) {
                // source-code repository files get pseudo mimetypes
                $extension =
                    UrlParser::getDocumentType($site[self::FILE_NAME]);
                if ($extension ==
                    $this->programming_language_extension['java']) {
                    $type = "text/java";
                } else {
                    if ($extension ==
                        $this->programming_language_extension['py']) {
                        $type = "text/py";
                    } else {
                        $type = $site[self::TYPE];
                    }
                }
            } else {
                $type = $site[self::TYPE];
            }
        }
        $handled = false;
        /*deals with short URLs and directs them to the original link
          for robots.txt don't want to introduce stuff that can be
          mis-parsed (we follow redirects in this case anyway) */
        if (isset($site[self::LOCATION]) &&
            count($site[self::LOCATION]) > 0 &&
            strcmp($type, "text/robot") != 0) {
            array_unshift($site[self::LOCATION], $site[self::URL]);
            $tmp_loc = array_pop($site[self::LOCATION]);
            $tmp_loc = UrlParser::canonicalLink($tmp_loc,
                $site[self::URL]);
            /* fixed: was
               $site[self::LOCATION] = array_push(...);
               which overwrote the LOCATION array with array_push's
               integer return value (the new element count) */
            $site[self::LOCATION][] = $tmp_loc;
            $doc_info = array();
            $doc_info[self::LINKS][$tmp_loc] = "location:" .
                $site[self::URL];
            $doc_info[self::LOCATION] = true;
            $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " .
                $tmp_loc;
            $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
            $doc_info[self::TITLE] = $site[self::URL];
            $text_data = true;
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            $handled = true;
        } else {
            if (isset($PAGE_PROCESSORS[$type])) {
                $page_processor = $PAGE_PROCESSORS[$type];
                if (generalIsA($page_processor, "TextProcessor")) {
                    $text_data = true;
                } else {
                    $text_data = false;
                }
            } else {
                crawlLog("No page processor for mime type: " . $type);
                crawlLog("Not processing: " . $site[self::URL]);
                continue;
            }
        }
        if (!$handled) {
            // instantiate the processor, passing it any plugins
            // registered for it
            if (isset($this->plugin_processors[$page_processor])) {
                $processor = new $page_processor(
                    $this->plugin_processors[$page_processor],
                    $this->max_description_len,
                    $this->summarizer_option);
            } else {
                $processor = new $page_processor(array(),
                    $this->max_description_len,
                    $this->summarizer_option);
            }
        }
        if (isset($site[self::PAGE]) && !$handled) {
            if (!isset($site[self::ENCODING])) {
                $site[self::ENCODING] = "UTF-8";
            }
            //if not UTF-8 convert before doing anything else
            if (isset($site[self::ENCODING]) &&
                $site[self::ENCODING] != "UTF-8" &&
                $site[self::ENCODING] != "" &&
                generalIsA($page_processor, "TextProcessor")) {
                if (!@mb_check_encoding($site[self::PAGE],
                    $site[self::ENCODING])) {
                    crawlLog("  MB_CHECK_ENCODING FAILED!!");
                }
                crawlLog("  Converting from encoding " .
                    $site[self::ENCODING] . "...");
                //if HEBREW WINDOWS-1255 use ISO-8859 instead
                if (stristr($site[self::ENCODING], "1255")) {
                    $site[self::ENCODING] = "ISO-8859-8";
                    crawlLog("  using encoding " .
                        $site[self::ENCODING] . "...");
                }
                if (stristr($site[self::ENCODING], "1256")) {
                    $site[self::PAGE] =
                        w1256ToUTF8($site[self::PAGE]);
                    crawlLog("  using Yioop hack encoding ...");
                } else {
                    $site[self::PAGE] = @mb_convert_encoding(
                        $site[self::PAGE], "UTF-8",
                        $site[self::ENCODING]);
                }
            }
            crawlLog("  Using Processor..." . $page_processor);
            // git repository files are processed under their file name
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $tmp_url_store = $site[self::URL];
                $site[self::URL] = $site[self::FILE_NAME];
            }
            $doc_info = $processor->handle($site[self::PAGE],
                $site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $site[self::URL] = $tmp_url_store;
            }
            if (!$doc_info) {
                crawlLog("  Processing Yielded No Data For: " .
                    $site[self::URL]);
            }
            if ($page_processor != "RobotProcessor" &&
                !isset($doc_info[self::JUST_METAS])) {
                $this->pruneLinks($doc_info, CrawlConstants::LINKS,
                    $start_time);
            }
        } else {
            if (!$handled) {
                $doc_info = false;
            }
        }
        $not_loc = true;
        if ($doc_info) {
            $site[self::DOC_INFO] = $doc_info;
            if (isset($doc_info[self::LOCATION])) {
                $site[self::HASH] = crawlHash(
                    crawlHash($site[self::URL], true) . "LOCATION",
                    true);
                $not_loc = false;
            }
            $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
            if (!is_dir(CRAWL_DIR . "/cache")) {
                mkdir(CRAWL_DIR . "/cache");
                $htaccess = "Options None\nphp_flag engine off\n";
                file_put_contents(CRAWL_DIR . "/cache/.htaccess",
                    $htaccess);
            }
            if ($type == "text/robot" &&
                isset($doc_info[self::PAGE])) {
                $site[self::PAGE] = $doc_info[self::PAGE];
            }
            if ($text_data) {
                if (isset($doc_info[self::PAGE])) {
                    $site[self::PAGE] = $doc_info[self::PAGE];
                } else {
                    $site[self::PAGE] = NULL;
                }
                if ($not_loc) {
                    $content = $doc_info[self::DESCRIPTION];
                    $site[self::HASH] =
                        FetchUrl::computePageHash($content);
                }
            } else {
                $site[self::HASH] =
                    FetchUrl::computePageHash($site[self::PAGE]);
            }
            if (isset($doc_info[self::WORD_CLOUD])) {
                $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
            } else {
                $site[self::WORD_CLOUD] = NULL;
            }
            if (isset($doc_info[self::CRAWL_DELAY])) {
                $site[self::CRAWL_DELAY] =
                    $doc_info[self::CRAWL_DELAY];
            }
            if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                $site[self::ROBOT_PATHS] =
                    $doc_info[self::ROBOT_PATHS];
            }
            if (!isset($site[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array();
            }
            if (isset($doc_info[self::ROBOT_METAS])) {
                $site[self::ROBOT_METAS] = array_merge(
                    $site[self::ROBOT_METAS],
                    $doc_info[self::ROBOT_METAS]);
            }
            //here's where we enforce NOFOLLOW
            if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) ||
                in_array("NONE", $site[self::ROBOT_METAS])) {
                $site[self::DOC_INFO][self::LINKS] = array();
            }
            if (isset($doc_info[self::AGENT_LIST])) {
                $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
            }
            $this->copySiteFields($i, $site, $summarized_site_pages,
                $stored_site_pages);
            $summarized_site_pages[$i][self::URL] =
                strip_tags($site[self::URL]);
            if (isset($site[self::REPOSITORY_TYPE]) &&
                $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                $summarized_site_pages[$i][self::TITLE] =
                    $site[self::FILE_NAME];
            } else {
                $summarized_site_pages[$i][self::TITLE] = strip_tags(
                    $site[self::DOC_INFO][self::TITLE]);
                // stripping html to be on the safe side
            }
            if (!isset($site[self::REPOSITORY_TYPE])) {
                if ($was_robot_error) {
                    // fixed typo in this message: was "dsallowed"
                    $site[self::DOC_INFO][self::DESCRIPTION] =
                        "There was an HTTP error in trying to download " .
                        "this robots.txt file, so all paths to this site " .
                        "were disallowed by Yioop.\n" .
                        $site[self::DOC_INFO][self::DESCRIPTION];
                }
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    strip_tags($site[self::DOC_INFO][
                    self::DESCRIPTION]);
            } else {
                $summarized_site_pages[$i][self::DESCRIPTION] =
                    $site[self::DOC_INFO][self::DESCRIPTION];
            }
            if (isset($site[self::DOC_INFO][self::JUST_METAS]) ||
                isset($site[self::ROBOT_PATHS])) {
                $summarized_site_pages[$i][self::JUST_METAS] = true;
            }
            if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                if (!isset(
                    $summarized_site_pages[$i][self::META_WORDS])) {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        $site[self::DOC_INFO][self::META_WORDS];
                } else {
                    $summarized_site_pages[$i][self::META_WORDS] =
                        array_merge($summarized_site_pages[$i][
                        self::META_WORDS],
                        $site[self::DOC_INFO][self::META_WORDS]);
                }
            }
            if (isset($site[self::DOC_INFO][self::LANG])) {
                if ($site[self::DOC_INFO][self::LANG] == 'en' &&
                    $site[self::ENCODING] != "UTF-8") {
                    $site[self::DOC_INFO][self::LANG] =
                        guessLangEncoding($site[self::ENCODING]);
                }
                $summarized_site_pages[$i][self::LANG] =
                    $site[self::DOC_INFO][self::LANG];
            }
            if (isset($site[self::DOC_INFO][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    $site[self::DOC_INFO][self::LINKS];
            }
            if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                $summarized_site_pages[$i][self::WORD_CLOUD] =
                    $site[self::DOC_INFO][self::WORD_CLOUD];
            }
            if (isset($site[self::DOC_INFO][self::THUMB])) {
                $summarized_site_pages[$i][self::THUMB] =
                    $site[self::DOC_INFO][self::THUMB];
            }
            if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                $this->processSubdocs($i, $site,
                    $summarized_site_pages, $stored_site_pages);
            }
            if (isset($summarized_site_pages[$i][self::LINKS])) {
                $summarized_site_pages[$i][self::LINKS] =
                    UrlParser::cleanRedundantLinks(
                    $summarized_site_pages[$i][self::LINKS],
                    $summarized_site_pages[$i][self::URL]);
            }
            if (!empty($this->classifiers)) {
                Classifier::labelPage($summarized_site_pages[$i],
                    $this->classifiers, $this->active_classifiers,
                    $this->active_rankers);
            }
            if ($this->page_rule_parser != NULL) {
                $this->page_rule_parser->executeRuleTrees(
                    $summarized_site_pages[$i]);
            }
            $metas =
                isset($summarized_site_pages[$i][self::ROBOT_METAS]) ?
                $summarized_site_pages[$i][self::ROBOT_METAS] :
                array();
            if (array_intersect($metas, array("NOARCHIVE", "NOINDEX",
                "JUSTFOLLOW", "NONE")) != array()) {
                $stored_site_pages[$i] = false;
            }
            /* NOTE(review): writing INDEX into an entry that was just
               set to false relies on PHP's implicit false-to-array
               conversion (deprecated in PHP 8.1); confirm intended
               interaction with the array_filter below */
            $stored_site_pages[$i][self::INDEX] = $i;
            $i++;
        }
    } // end for
    $num_pages = count($stored_site_pages);
    $filter_stored = array_filter($stored_site_pages);
    if ($num_pages > 0 && $this->cache_pages) {
        $cache_page_partition = $this->web_archive->addPages(
            self::OFFSET, $filter_stored);
    } else {
        if ($num_pages > 0) {
            $this->web_archive->addCount(count($filter_stored));
        }
    }
    for ($i = 0; $i < $num_pages; $i++) {
        $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
    }
    foreach ($filter_stored as $stored) {
        $i = $stored[self::INDEX];
        if (isset($stored[self::OFFSET])) {
            $summarized_site_pages[$i][self::OFFSET] =
                $stored[self::OFFSET];
            $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] =
                $cache_page_partition;
        }
    }
    crawlLog("  Process pages time: " .
        changeInMicrotime($start_time) . " Current Memory: " .
        memory_get_usage());
    return $summarized_site_pages;
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information. Links are found by scanning
 * Office Open XML Relationship elements whose Type attribute is the
 * official hyperlink relationship URI.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array associative array of canonical url => link target text
 */
static function links($dom, $site)
{
    $sites = array();
    $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" .
        "relationships/hyperlink";
    $num_extracted = 0;
    $relationships = $dom->getElementsByTagName("Relationships");
    foreach ($relationships as $relationship) {
        $relations = $relationship->getElementsByTagName("Relationship");
        foreach ($relations as $relation) {
            // only hyperlink-typed relationships carry external urls
            if (strcmp($relation->getAttribute('Type'), $hyperlink) != 0) {
                continue;
            }
            if ($num_extracted >= MAX_LINKS_TO_EXTRACT) {
                continue;
            }
            $link = $relation->getAttribute('Target');
            $url = UrlParser::canonicalLink($link, $site);
            if (UrlParser::checkRecursiveUrl($url) ||
                strlen($url) >= MAX_URL_LEN) {
                continue;
            }
            // a url seen twice accumulates both link targets as its text
            if (isset($sites[$url])) {
                $sites[$url] .= " " . $link;
            } else {
                $sites[$url] = $link;
            }
            $num_extracted++;
        }
    }
    return $sites;
}
/**
 * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
 * dom object where links have been canonicalized according to
 * the supplied $site information. Links are located via the
 * pptx slide markup's a:hlinkClick elements.
 *
 * @param object $dom a document object with links on it
 * @param string $site a string containing a url
 *
 * @return array associative array of canonical url => link text
 */
static function links($dom, $site)
{
    $sites = array();
    $xpath = new DOMXPath($dom);
    $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
    $num_extracted = 0;
    foreach ($paras as $para) {
        /* Bug fix: the counter used to increment on every hlinkClick
           node, including ones whose url was rejected, so interleaved
           invalid links reduced the number of links extracted below
           the limit. Count only links actually stored, matching the
           convention of the other links() extractors. Once the limit
           is reached no further node can be stored, so we can stop. */
        if ($num_extracted >= MAX_LINKS_TO_EXTRACT) {
            break;
        }
        // the link text is in the a:t node two ancestors up the run
        $hlink = $para->parentNode->parentNode->getElementsByTagName(
            "t")->item(0)->nodeValue;
        $url = UrlParser::canonicalLink($hlink, $site);
        $len = strlen($url); // computed once; was computed twice before
        if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN &&
            $len > 0) {
            // a url seen twice accumulates both link texts
            if (isset($sites[$url])) {
                $sites[$url] .= " " . $hlink;
            } else {
                $sites[$url] = $hlink;
            }
            $num_extracted++;
        }
    }
    return $sites;
}
/**
 * Make relative links canonical with respect to provided $url
 * for links appear within the Dom node. Walks the immediate children
 * of $node, deep-clones each element child, rewrites a/link href
 * attributes and img/object/script src attributes using
 * UrlParser::canonicalLink, recurses into descendants, and swaps each
 * modified clone back in place with replaceChild.
 *
 * @param object $node dom node to fix links for
 * @param string $url url to use to canonicalize links
 * @return object updated dom node
 */
function canonicalizeLinks($node, $url)
{
    // non-elements (text nodes, etc.) have no attributes to rewrite
    if (!isset($node->childNodes->length) ||
        get_class($node) != 'DOMElement') {
        return $node;
    }
    for ($k = 0; $k < $node->childNodes->length; $k++) {
        // child list can shrink/shift as clones are swapped in; stop
        // if the current index no longer yields a node
        if (!$node->childNodes->item($k)) {
            break;
        }
        // operate on a deep clone; it replaces the original child below
        $clone = $node->childNodes->item($k)->cloneNode(true);
        // "-1" marks children without a tagName (e.g. text nodes)
        $tag_name = isset($clone->tagName) ? $clone->tagName : "-1";
        if (in_array($tag_name, array("a", "link"))) {
            if ($clone->hasAttribute("href")) {
                $href = $clone->getAttribute("href");
                // pure in-page fragment links (#...) are left untouched
                if ($href != "" && $href[0] != "#") {
                    $href = UrlParser::canonicalLink($href, $url, false);
                }
                /* Modify non-link tag urls so that they are looked up
                   in the cache before going to the live site */
                if ($tag_name != "link" && ($href == "" ||
                    $href[0] != "#")) {
                    $href = urlencode($href);
                    $href = $href . "&from_cache=true";
                    $crawl_time = $this->getIndexTimestamp();
                    $href = $this->baseLink() . "&a=cache&q&arg" .
                        "={$href}&its={$crawl_time}";
                }
                $clone->setAttribute("href", $href);
                //an anchor might have an img tag within it so recurses
                $clone = $this->canonicalizeLinks($clone, $url);
                $node->replaceChild($clone, $node->childNodes->item($k));
            }
            // NOTE(review): an <a>/<link> without an href is neither
            // replaced nor recursed into -- confirm that is intended
        } else {
            if (in_array($tag_name, array("img", "object", "script"))) {
                if ($clone->hasAttribute("src")) {
                    $src = $clone->getAttribute("src");
                    $src = UrlParser::canonicalLink($src, $url, false);
                    $clone->setAttribute("src", $src);
                    $node->replaceChild($clone,
                        $node->childNodes->item($k));
                }
            } else {
                /* any other real element: canonicalize its subtree.
                   NOTE(review): loose comparison of string $tag_name
                   against int -1 works because "-1" == -1 compares
                   numerically in PHP; "-1" (no tagName) is skipped */
                if ($tag_name != -1) {
                    $clone = $this->canonicalizeLinks($clone, $url);
                    if (is_object($clone)) {
                        $node->replaceChild($clone,
                            $node->childNodes->item($k));
                    }
                }
            }
        }
    }
    return $node;
}
/**
 * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems
 * For each feed source downloads the feeds, checks which items are
 * not in the database, adds them. This method does not update
 * the inverted index shard.
 *
 * @param array $feeds list of feeds to download
 * @param int $age how many seconds old records should be ignored
 */
function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
{
    $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL",
        CrawlConstants::PAGE, true, NULL, true);
    /* Bug fix: $db was used below without ever being assigned, which
       fatals the first time a sniffed feed language must be stored.
       NOTE(review): assuming the object's database handle is what was
       intended here -- confirm against the class's other methods. */
    $db = $this->db;
    $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
    foreach ($feeds as $feed) {
        $is_html = ($feed['TYPE'] == 'html');
        crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
        if (!$feed[CrawlConstants::PAGE]) {
            crawlLog("...No data in feed skipping.");
            continue;
        }
        $dom = new DOMDocument();
        // suppress libxml warnings -- real-world feeds are often
        // not well-formed
        if ($is_html) {
            @$dom->loadHTML($feed[CrawlConstants::PAGE]);
        } else {
            @$dom->loadXML($feed[CrawlConstants::PAGE]);
        }
        crawlLog("...done. Extracting info about whole feed.");
        $lang = "";
        /* Bug fix: condition was `A && !isset(B) || B == ""`, which by
           precedence parses as `(A && !isset(B)) || B == ""` and reads
           $feed["LANGUAGE"] even when unset. The intent: for non-html
           feeds with no stored language, sniff the feed's <language>
           tag and remember it. */
        if ($feed['TYPE'] != 'html' && (!isset($feed["LANGUAGE"]) ||
            $feed["LANGUAGE"] == "")) {
            $languages = $dom->getElementsByTagName('language');
            if ($languages && is_object($languages) &&
                is_object($languages->item(0))) {
                $lang = $languages->item(0)->textContent;
                // persist the sniffed language for future runs
                $db->execute($sql, array($lang, $feed['TIMESTAMP']));
            }
        } else if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
            $lang = $feed["LANGUAGE"];
        } else {
            $lang = DEFAULT_LOCALE;
        }
        crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
        if ($is_html) {
            // html scrape: channel/item/element locations come from
            // the per-feed configured paths
            $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
            if (!$sub_dom) {
                crawlLog("... Scraper couldn't parse channel" .
                    " path so bailing on this feed.");
                continue;
            } else {
                crawlLog("...Channel scraped.");
            }
            $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
            $rss_elements = array("title" => $feed['TITLE_PATH'],
                "description" => $feed['DESCRIPTION_PATH'],
                "link" => $feed['LINK_PATH']);
        } else {
            $nodes = $dom->getElementsByTagName('item');
            $rss_elements = array("title" => "title",
                "description" => "description", "link" => "link",
                "guid" => "guid", "pubDate" => "pubDate");
            if ($nodes->length == 0) {
                // maybe we're dealing with atom rather than rss
                $nodes = $dom->getElementsByTagName('entry');
                $rss_elements = array("title" => "title",
                    "description" => "summary", "link" => "link",
                    "guid" => "id", "pubDate" => "updated");
            }
        }
        crawlLog("...done extracting info. Check for new news " .
            "items in {$feed['NAME']}.");
        $num_added = 0;
        $num_seen = 0;
        foreach ($nodes as $node) {
            $item = array();
            foreach ($rss_elements as $db_element => $feed_element) {
                crawlTimeoutLog("..still adding feed items to index.");
                if ($is_html) {
                    $tag_nodes = $this->getTags($node, $feed_element);
                    $tag_node = isset($tag_nodes[0]) ? $tag_nodes[0] : NULL;
                    $element_text = is_object($tag_node) ?
                        $tag_node->textContent : "";
                } else {
                    $tag_node = $node->getElementsByTagName(
                        $feed_element)->item(0);
                    $element_text = is_object($tag_node) ?
                        $tag_node->nodeValue : "";
                }
                // atom and scraped html keep the url in an href
                // attribute rather than in the node text
                if ($db_element == "link" && $tag_node &&
                    ($element_text == "" || $is_html)) {
                    if ($is_html) {
                        $element_text = $tag_node->documentElement->
                            getAttribute("href");
                    } else {
                        $element_text = $tag_node->getAttribute("href");
                    }
                    $element_text = UrlParser::canonicalLink(
                        $element_text, $feed["SOURCE_URL"]);
                }
                $item[$db_element] = strip_tags($element_text);
            }
            $did_add = $this->addFeedItemIfNew($item, $feed['NAME'],
                $lang, $age);
            if ($did_add) {
                $num_added++;
            }
            $num_seen++;
        }
        crawlLog("...added {$num_added} news items of {$num_seen} " .
            "on rss page.\n Done Processing {$feed['NAME']}.");
    }
}
/**
 * Returns a url text pair where the url comes from the link of
 * the given item node and the text comes from the text data for that node.
 * urls are canonicalized according to site.
 *
 * @param object $item_node the DOMNode to get a link and text from
 * @param string $link_name name of link tag
 * @param string $text_name name of text tag to associate with link
 * @param string $site a string containing a url
 * @param bool $atom if the feed is atom or rss
 *
 * @return array a url,text pair, or false if no acceptable link found
 */
static function linkAndTexts($item_node, $link_name, $text_name, $site,
    $atom = false)
{
    /* Bug fix: $text used to be assigned only when a $text_name child
       existed, so an item with a link but no text node used an
       undefined variable below (an error under PHP 8). Initialize it
       and default it after the scan instead. */
    $text = "";
    foreach ($item_node->childNodes as $node) {
        if ($node->nodeName == $link_name) {
            // rss carries the url as node text; atom in an href attribute
            if (!$atom) {
                $url = UrlParser::canonicalLink($node->textContent, $site);
            } else {
                $url = UrlParser::canonicalLink(
                    $node->getAttribute("href"), $site);
            }
            // reject empty, self-referential, or over-long urls outright
            if ($url === NULL || $url === "" ||
                UrlParser::checkRecursiveUrl($url) ||
                strlen($url) >= MAX_URL_LEN) {
                return false;
            }
        }
        if ($node->nodeName == $text_name) {
            $text = $node->textContent;
        }
    }
    if (!isset($url) || $url == "") {
        return false;
    }
    if ($text == "") {
        // generic label when the item had no (or an empty) text node
        $text = $atom ? "Atom Feed" : "RSS Feed";
    }
    // collapse whitespace runs to single spaces
    $text = mb_ereg_replace("(\\s)+", " ", $text);
    return array($url, $text);
}