Example #1
 /**
  * Checks that a relative link and a base link can be combined into a
  * complete link in a variety of ways
  */
 function canonicalLinkTestCase()
 {
     $test_links = array(
         array(".", "http://www.example.com/", "http://www.example.com/", "root dir0"),
         array("/bob.html", "http://www.example.com/", "http://www.example.com/bob.html", "root dir1"),
         array("bob.html", "http://www.example.com/", "http://www.example.com/bob.html", "root dir2"),
         array("bob", "http://www.example.com/", "http://www.example.com/bob", "root dir3"),
         array("bob", "http://www.example.com", "http://www.example.com/bob", "root dir4"),
         array("http://print.bob.com/bob", "http://www.example.com", "http://print.bob.com/bob", "root dir5"),
         array("/.", "http://www.example.com/", "http://www.example.com/", "root dir6"),
         array("//slashdot.org", "http://www.slashdot.org", "http://slashdot.org/", "slashdot dir"),
         array("bob", "http://www.example.com/a", "http://www.example.com/a/bob", "sub dir1"),
         array("../bob", "http://www.example.com/a", "http://www.example.com/bob", "sub dir2"),
         array("../../bob", "http://www.example.com/a", NULL, "sub dir3"),
         array("./bob", "http://www.example.com/a", "http://www.example.com/a/bob", "sub dir4"),
         array("bob.html?a=1", "http://www.example.com/a", "http://www.example.com/a/bob.html?a=1", "query 1"),
         array("bob?a=1&b=2", "http://www.example.com/a", "http://www.example.com/a/bob?a=1&b=2", "query 2"),
         array("/?a=1&b=2", "http://www.example.com/a", "http://www.example.com/?a=1&b=2", "query 3"),
         array("?a=1&b=2", "http://www.example.com/a", "http://www.example.com/a/?a=1&b=2", "query 4"),
         array("b/b.html?a=1&b=2", "http://www.example.com/a/c", "http://www.example.com/a/c/b/b.html?a=1&b=2", "query 5"),
         array("b/b.html?a=1&b=2?c=4", "http://www.example.com/a/c", "http://www.example.com/a/c/b/b.html?a=1&b=2?c=4", "query 6"),
         array("b#1", "http://www.example.com/", "http://www.example.com/b#1", "fragment 1"),
         array("b?a=1#1", "http://www.example.com/", "http://www.example.com/b?a=1#1", "fragment 2"),
         array("b?a=1#1#2", "http://www.example.com/", "http://www.example.com/b?a=1#1#2", "fragment 3"),
         array("#a", "http://www.example.com/c:d", "http://www.example.com/c:d#a", "fragment 4"));
     foreach ($test_links as $test_link) {
         $result = UrlParser::canonicalLink($test_link[0], $test_link[1], false);
         $this->assertEqual($result, $test_link[2], $test_link[3]);
     }
 }
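Two of the table rows above, exercised directly (a minimal sketch; it assumes the UrlParser class used by the test is already loaded and passes the same third argument the test does):
 // Minimal sketch: expected values come from the "sub dir2" and
 // "slashdot dir" rows of the table above.
 $url = UrlParser::canonicalLink("../bob", "http://www.example.com/a", false);
 // $url should be "http://www.example.com/bob"
 $url = UrlParser::canonicalLink("//slashdot.org", "http://www.slashdot.org", false);
 // $url should be "http://slashdot.org/"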
Example #2
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $base_refs = $xpath->evaluate("/html//base");
     if ($base_refs->item(0)) {
         $tmp_site = $base_refs->item(0)->getAttribute('href');
         if (strlen($tmp_site) > 0) {
             $site = UrlParser::canonicalLink($tmp_site, $site);
         }
     }
     $i = 0;
     $hrefs = $xpath->evaluate("/html/body//a");
     foreach ($hrefs as $href) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $rel = $href->getAttribute("rel");
             if ($rel == "" || !stristr($rel, "nofollow")) {
                 $url = UrlParser::canonicalLink($href->getAttribute('href'), $site);
                 $len = strlen($url);
                 if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                     $text = $href->nodeValue;
                     if (isset($sites[$url])) {
                         $sites[$url] .= " .. " . preg_replace("/\\s+/", " ", strip_tags($text));
                         $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                     } else {
                         $sites[$url] = preg_replace("/\\s+/", " ", strip_tags($text));
                         $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                     }
                     $i++;
                 }
             }
         }
     }
     $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
     foreach ($frames as $frame) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " .. HTMLframe";
                 } else {
                     $sites[$url] = "HTMLframe";
                 }
                 $i++;
             }
         }
     }
     $imgs = $xpath->evaluate("/html/body//img[@alt]");
     $i = 0;
     foreach ($imgs as $img) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $alt = $img->getAttribute('alt');
             if (strlen($alt) < 1) {
                 continue;
             }
             $url = UrlParser::canonicalLink($img->getAttribute('src'), $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " .. " . $alt;
                     $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                 } else {
                     $sites[$url] = $alt;
                     $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                 }
                 $i++;
             }
         }
     }
     return $sites;
 }
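A usage sketch for the extractor above. The class name HtmlProcessor is an assumption for the class holding this links() method, and constants such as MAX_LINKS_TO_EXTRACT, MAX_URL_LEN, and MAX_LINKS_WORD_TEXT must already be defined by the application:
 // Sketch only: extract canonicalized links from a fetched HTML page.
 $html = '<html><body><a href="/about.html">About us</a></body></html>';
 $dom = new DOMDocument();
 @$dom->loadHTML($html);
 $links = HtmlProcessor::links($dom, "http://www.example.com/");
 foreach ($links as $url => $link_text) {
     echo $url . " => " . $link_text . "\n";
 }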
Example #3
 /**
  * Handles admin requests related to managing the machines which perform
  * crawls
  *
  * With this activity an admin can add/delete machines to manage. For each
  * managed machine, the admin can stop and start fetchers/queue_servers
  * as well as look at their log files
  *
  * @return array $data MACHINES, their MACHINE_NAMES, data for
  *     FETCHER_NUMBERS drop-down
  */
 function manageMachines()
 {
     $parent = $this->parent;
     $machine_model = $parent->model("machine");
     $profile_model = $parent->model("profile");
     $data = array();
     $data["ELEMENT"] = "managemachines";
     $possible_arguments = array("addmachine", "deletemachine", "newsmode", "log", "update");
     $data['SCRIPT'] = "doUpdate();";
     $data["leftorright"] = getLocaleDirection() == 'ltr' ? "right" : "left";
     $data['MACHINE_NAMES'] = array();
     $data['FETCHER_NUMBERS'] = array(0 => 0, 1 => 1, 2 => 2, 3 => 3, 4 => 4, 5 => 5, 6 => 6, 7 => 7, 8 => 8, 16 => 16);
     $tmp = tl('system_component_select_machine');
     if (!isset($_REQUEST["has_queue_server"]) || isset($_REQUEST['is_replica'])) {
         $_REQUEST["has_queue_server"] = false;
     }
     if (isset($_REQUEST['is_replica'])) {
         $_REQUEST['num_fetchers'] = 0;
     } else {
         $_REQUEST['parent'] = "";
     }
     $request_fields = array("name" => "string", "url" => "string", "has_queue_server" => "bool", "num_fetchers" => "int", "parent" => "string");
     $r = array();
     $allset = true;
     foreach ($request_fields as $field => $type) {
         if (isset($_REQUEST[$field])) {
             $r[$field] = $parent->clean($_REQUEST[$field], $type);
             if ($type == "string") {
                 $r[$field] = trim($r[$field]);
                 if ($r[$field] == "" && $field != "parent") {
                     $allset = false;
                 }
             }
             if ($field == "url") {
                 if (isset($r[$field][strlen($r[$field]) - 1]) && $r[$field][strlen($r[$field]) - 1] != "/") {
                     $r[$field] .= "/";
                 }
                 $r[$field] = UrlParser::canonicalLink($r[$field], NAME_SERVER);
                 if (!$r[$field]) {
                     $allset = false;
                 }
             }
         } else {
             $allset = false;
         }
     }
     if (isset($r["num_fetchers"]) && in_array($r["num_fetchers"], $data['FETCHER_NUMBERS'])) {
         $data['FETCHER_NUMBER'] = $r["num_fetchers"];
     } else {
         $data['FETCHER_NUMBER'] = 0;
         if (isset($r["num_fetchers"])) {
             $r["num_fetchers"] = 0;
         }
     }
     $machine_exists = isset($r["name"]) && $machine_model->checkMachineExists("NAME", $r["name"]) || isset($r["url"]) && $machine_model->checkMachineExists("URL", $r["url"]);
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addmachine":
                 if ($allset == true && !$machine_exists) {
                     $machine_model->addMachine($r["name"], $r["url"], $r["has_queue_server"], $r["num_fetchers"], $r["parent"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_added') . "</h1>');";
                     $data['MACHINE_NAMES'][] = $r["name"];
                     $data['DELETABLE_MACHINES'][$r["name"]] = $r["name"];
                     sort($data['MACHINE_NAMES']);
                 } else {
                     if ($allset && $machine_exists) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_exists') . "</h1>');";
                     } else {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_incomplete') . "</h1>');";
                     }
                 }
                 break;
             case "deletemachine":
                 if (!$machine_exists) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_doesnt_exists') . "</h1>');";
                 } else {
                     $machines = $machine_model->getRows(0, 1, $total_rows, array(array("name", "=", $r["name"], "")));
                     $service_in_use = false;
                     foreach ($machines as $machine) {
                         if ($machine['NAME'] == $r["name"]) {
                             if (isset($machine['STATUSES']) && is_array($machine['STATUSES']) && $machine['STATUSES'] != array()) {
                                 $service_in_use = true;
                                 break;
                             } else {
                                 break;
                             }
                         }
                     }
                     if ($service_in_use) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_stop_service_first') . "</h1>');";
                         break;
                     }
                     $machine_model->deleteMachine($r["name"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_deleted') . "</h1>');";
                 }
                 break;
             case "newsmode":
                 $profile = $profile_model->getProfile(WORK_DIRECTORY);
                 $news_modes = array("news_off", "news_web", "news_process");
                 if (isset($_REQUEST['news_mode']) && in_array($_REQUEST['news_mode'], $news_modes)) {
                     $profile["NEWS_MODE"] = $_REQUEST['news_mode'];
                     if ($profile["NEWS_MODE"] != "news_process") {
                         CrawlDaemon::stop("news_updater", "", false);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     } else {
                         CrawlDaemon::start("news_updater", 'none', "", -1);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     }
                     $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_update_failed') . "</h1>');";
                 }
                 break;
             case "log":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 }
                 if (isset($_REQUEST["mirror_name"])) {
                     $r["mirror_name"] = $parent->clean($_REQUEST["mirror_name"], "string");
                 }
                 if (isset($_REQUEST["time"])) {
                     $data["time"] = $parent->clean($_REQUEST["time"], "int") + 30;
                 } else {
                     $data["time"] = 30;
                 }
                 if (isset($_REQUEST["NO_REFRESH"])) {
                     $data["NO_REFRESH"] = $parent->clean($_REQUEST["NO_REFRESH"], "bool");
                 } else {
                     $data["NO_REFRESH"] = false;
                 }
                 $data["ELEMENT"] = "machinelog";
                 $filter = "";
                 if (isset($_REQUEST['f'])) {
                     $filter = $parent->clean($_REQUEST['f'], "string");
                 }
                 $data['filter'] = $filter;
                 $data["REFRESH_LOG"] = "&time=" . $data["time"];
                 $data["LOG_TYPE"] = "";
                 if (isset($r['fetcher_num']) && isset($r['name'])) {
                     $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], $r["fetcher_num"], $filter);
                     $data["LOG_TYPE"] = $r['name'] . " fetcher " . $r["fetcher_num"];
                     $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'] . "&fetcher_num=" . $r['fetcher_num'];
                 } else {
                     if (isset($r["mirror_name"])) {
                         $data["LOG_TYPE"] = $r['mirror_name'] . " mirror";
                         $data["LOG_FILE_DATA"] = $machine_model->getLog($r["mirror_name"], NULL, $filter, true);
                     } else {
                         if (isset($r['name'])) {
                             $data["LOG_TYPE"] = $r['name'] . " queue_server";
                             if ($r['name'] == "news") {
                                 $data["LOG_TYPE"] = "Name Server News Updater";
                             }
                             $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], NULL, $filter);
                             $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'];
                         }
                     }
                 }
                 if ($data["time"] >= ONE_HOUR / 3) {
                     $data["REFRESH_LOG"] = "";
                 }
                 if (!isset($data["LOG_FILE_DATA"]) || $data["LOG_FILE_DATA"] == "") {
                     $data["LOG_FILE_DATA"] = tl('system_component_no_machine_log');
                 }
                 $lines = array_reverse(explode("\n", $data["LOG_FILE_DATA"]));
                 $data["LOG_FILE_DATA"] = implode("\n", $lines);
                 break;
             case "update":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 } else {
                     $r["fetcher_num"] = NULL;
                 }
                 $available_actions = array("start", "stop", "mirror_start", "mirror_stop");
                 if (isset($r["name"]) && isset($_REQUEST["action"]) && in_array($_REQUEST["action"], $available_actions)) {
                     $action = $_REQUEST["action"];
                     $is_mirror = false;
                     if ($action == "mirror_start") {
                         $action = "start";
                         $is_mirror = true;
                     } else {
                         if ($action == "mirror_stop") {
                             $action = "stop";
                             $is_mirror = true;
                         }
                     }
                     $machine_model->update($r["name"], $action, $r["fetcher_num"], $is_mirror);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_servers_updated') . "</h1>');";
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_no_action') . "</h1>');";
                 }
                 break;
         }
     }
     $parent->pagingLogic($data, $machine_model, "MACHINE", DEFAULT_ADMIN_PAGING_NUM);
     if (!isset($_REQUEST['arg']) || $_REQUEST['arg'] != 'log') {
         $data['SCRIPT'] .= "toggleReplica(false);";
     }
     return $data;
 }
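The machine-url normalization used above (append a trailing slash, then canonicalize against the name server), shown in isolation as a sketch; NAME_SERVER is the installation's name server url constant:
 // Sketch only: normalize a submitted machine url the same way as above.
 $url = isset($_REQUEST['url']) ? trim($_REQUEST['url']) : "";
 if ($url != "" && $url[strlen($url) - 1] != "/") {
     $url .= "/";
 }
 $url = UrlParser::canonicalLink($url, NAME_SERVER);
 if (!$url) {
     // an invalid url makes the machine record incomplete
 }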
Example #4
 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. We allow more links from a sitemap
  * than from other kinds of documents. For now we are ignoring weighting
  * info
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             if ($i > MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }
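A usage sketch for the sitemap extractor; SitemapProcessor is an assumed name for the class holding this links() method, and the sitemap body is illustrative:
 // Sketch only: extract canonicalized urls from a small sitemap document.
 $sitemap_xml = '<?xml version="1.0"?>' .
     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' .
     '<url><loc>http://www.example.com/page1.html</loc></url></urlset>';
 $dom = new DOMDocument();
 @$dom->loadXML($sitemap_xml);
 $links = SitemapProcessor::links($dom, "http://www.example.com/sitemap.xml");
 print_r(array_keys($links));   // the canonicalized <loc> urls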
Example #5
 /**
  * Handles admin requests related to the search sources activity
  *
  * The search sources activity allows a user to add/delete search sources
  * for video and news; it also allows a user to control which subsearches
  * appear on the SearchView page
  *
  * @return array $data info about current search sources, and current
  *     sub-searches
  */
 function searchSources()
 {
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $source_model = $parent->model("source");
     $possible_arguments = array("addsource", "deletesource", "addsubsearch", "deletesubsearch", "editsource", "editsubsearch");
     $data = array();
     $data["ELEMENT"] = "searchsources";
     $data['SCRIPT'] = "";
     $data['SOURCE_TYPES'] = array(-1 => tl('crawl_component_media_kind'), "video" => tl('crawl_component_video'), "rss" => tl('crawl_component_rss_feed'), "html" => tl('crawl_component_html_feed'));
     $source_type_flag = false;
     if (isset($_REQUEST['type']) && in_array($_REQUEST['type'], array_keys($data['SOURCE_TYPES']))) {
         $data['SOURCE_TYPE'] = $_REQUEST['type'];
         $source_type_flag = true;
     } else {
         $data['SOURCE_TYPE'] = -1;
     }
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data["SEARCH_LISTS"] = array(-1 => tl('crawl_component_sources_indexes'));
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["i:" . $item["CRAWL_TIME"]] = $item["DESCRIPTION"];
     }
     if (isset($_SESSION['USER_ID'])) {
         $user = $_SESSION['USER_ID'];
     } else {
         $user = $_SERVER['REMOTE_ADDR'];
     }
     $search_lists = $crawl_model->getMixList($user);
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["m:" . $item["TIMESTAMP"]] = $item["NAME"];
     }
     $n = NUM_RESULTS_PER_PAGE;
     $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n);
     if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'], array_keys($data['PER_PAGE']))) {
         $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page'];
     } else {
         $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
     }
     $locales = $parent->model("locale")->getLocaleList();
     $data["LANGUAGES"] = array();
     foreach ($locales as $locale) {
         $data["LANGUAGES"][$locale['LOCALE_TAG']] = $locale['LOCALE_NAME'];
     }
     if (isset($_REQUEST['language']) && in_array($_REQUEST['language'], array_keys($data["LANGUAGES"]))) {
         $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language'];
     } else {
         $data['SOURCE_LOCALE_TAG'] = DEFAULT_LOCALE;
     }
     $data["CURRENT_SOURCE"] = array("name" => "", "type" => $data['SOURCE_TYPE'], "source_url" => "", "aux_info" => "", 'channel_path' => "", 'item_path' => "", 'title_path' => "", 'description_path' => "", 'link_path' => "", "language" => $data['SOURCE_LOCALE_TAG']);
     $data["CURRENT_SUBSEARCH"] = array("locale_string" => "", "folder_name" => "", "index_identifier" => "", "per_page" => $data['PER_PAGE_SELECTED']);
     $data['SOURCE_FORM_TYPE'] = "addsource";
     $data["SEARCH_FORM_TYPE"] = "addsubsearch";
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addsource":
                 if (!$source_type_flag) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_source_type') . "</h1>');";
                     break;
                 }
                 $must_have = array("name", "type", 'source_url');
                 $is_html_feed = false;
                 if (isset($_REQUEST['type']) && $_REQUEST['type'] == 'html') {
                     $is_html_feed = true;
                     $must_have = array_merge($must_have, array('channel_path', 'item_path', 'title_path', 'description_path', 'link_path'));
                 }
                 $to_clean = array_merge($must_have, array('aux_info', 'language'));
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     if ($clean_me == "source_url") {
                         $r[$clean_me] = UrlParser::canonicalLink($r[$clean_me], NAME_SERVER);
                         echo $r[$clean_me] . "\n";
                         if (!$r[$clean_me]) {
                             $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_invalid_url') . "</h1>');";
                             break 2;
                         }
                     }
                     if (in_array($clean_me, $must_have) && $r[$clean_me] == "") {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 if ($is_html_feed) {
                     $r['aux_info'] = $r['channel_path'] . "###" . $r['item_path'] . "###" . $r['title_path'] . "###" . $r['description_path'] . "###" . $r['link_path'];
                 }
                 $source_model->addMediaSource($r['name'], $r['type'], $r['source_url'], $r['aux_info'], $r['language']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_added') . "</h1>');";
                 break;
             case "addsubsearch":
                 $to_clean = array("folder_name", 'index_identifier');
                 $must_have = $to_clean;
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     if (in_array($clean_me, $must_have) && ($r[$clean_me] == "" || $r[$clean_me] == -1)) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 $source_model->addSubsearch($r['folder_name'], $r['index_identifier'], $data['PER_PAGE_SELECTED']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_added') . "</h1>');";
                 break;
             case "deletesource":
                 if (!isset($_REQUEST['ts'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $timestamp = $parent->clean($_REQUEST['ts'], "string");
                 $source_model->deleteMediaSource($timestamp);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_deleted') . "</h1>');";
                 break;
             case "deletesubsearch":
                 if (!isset($_REQUEST['fn'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $folder_name = $parent->clean($_REQUEST['fn'], "string");
                 $source_model->deleteSubsearch($folder_name);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_deleted') . "</h1>');";
                 break;
             case "editsubsearch":
                 $data['SEARCH_FORM_TYPE'] = "editsubsearch";
                 $subsearch = false;
                 $folder_name = isset($_REQUEST['fn']) ? $parent->clean($_REQUEST['fn'], "string") : "";
                 if ($folder_name) {
                     $subsearch = $source_model->getSubsearch($folder_name);
                 }
                 if (!$subsearch) {
                     $data['SOURCE_FORM_TYPE'] = "addsubsearch";
                     break;
                 }
                 $data['fn'] = $folder_name;
                 $update = false;
                 foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         $subsearch[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         $update = true;
                     } else {
                         if (isset($subsearch[$upper_field])) {
                             $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     $source_model->updateSubsearch($subsearch);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_updated') . "</h1>');";
                 }
                 break;
             case "editsource":
                 $data['SOURCE_FORM_TYPE'] = "editsource";
                 $source = false;
                 $timestamp = isset($_REQUEST['ts']) ? $parent->clean($_REQUEST['ts'], "string") : "";
                 if ($timestamp) {
                     $source = $source_model->getMediaSource($timestamp);
                 }
                 if (!$source) {
                     $data['SOURCE_FORM_TYPE'] = "addsource";
                     break;
                 }
                 $data['ts'] = $timestamp;
                 $update = false;
                 $is_html_feed = false;
                 if ($source['TYPE'] == 'html') {
                     $is_html_feed = true;
                     list($source['CHANNEL_PATH'], $source['ITEM_PATH'], $source['TITLE_PATH'], $source['DESCRIPTION_PATH'], $source['LINK_PATH']) = explode("###", $source['AUX_INFO']);
                 }
                 foreach ($data['CURRENT_SOURCE'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         $source[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         $update = true;
                     } else {
                         if (isset($source[$upper_field])) {
                             $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     if ($is_html_feed) {
                         $source['AUX_INFO'] = $source['CHANNEL_PATH'] . "###" . $source['ITEM_PATH'] . "###" . $source['TITLE_PATH'] . "###" . $source['DESCRIPTION_PATH'] . "###" . $source['LINK_PATH'];
                     }
                     unset($source['CHANNEL_PATH']);
                     unset($source['ITEM_PATH']);
                     unset($source['TITLE_PATH']);
                     unset($source['DESCRIPTION_PATH']);
                     unset($source['LINK_PATH']);
                     $source_model->updateMediaSource($source);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_updated') . "</h1>');";
                 }
                 break;
         }
     }
     $data['CAN_LOCALIZE'] = $parent->model("user")->isAllowedUserActivity($_SESSION['USER_ID'], "manageLocales");
     $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("NAME", "", "", "ASC")));
     $parent->pagingLogic($data, $source_model, "SUBSEARCHES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("FOLDER_NAME", "", "", "ASC")), "SUB", "SUBSEARCH");
     foreach ($data["SUBSEARCHES"] as $search) {
         if (!isset($data["SEARCH_LISTS"][trim($search['INDEX_IDENTIFIER'])])) {
             $source_model->deleteSubsearch($search["FOLDER_NAME"]);
         }
     }
     $data['SCRIPT'] .= "source_type = elt('source-type');" . "source_type.onchange = switchSourceType;" . "switchSourceType()";
     return $data;
 }
Example #6
 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * @param string $page text string of a document
  * @param string $url location the document came from; used here to
  *     produce complete links for any sitemap urls found in the
  *     robots.txt file
  *
  * @return array a summary of (title, description, links, and content) of
  *     the information in $page
  */
 function process($page, $url)
 {
     $summary = NULL;
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     $add_rule_state = false;
     $rule_added_flag = false;
     $delay_flag = false;
     $delay = 0;
     foreach ($lines as $pre_line) {
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         $specificness = 0;
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 $summary[self::AGENT_LIST][] = $value;
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     $offset = $new_offset;
                 }
                 break;
             case "sitemap":
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if (!UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 if ($add_rule_state) {
                     $delay_flag = true;
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }
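A usage sketch for the parser above. Example #7 refers to this processor as RobotProcessor, so that name is used here; the constructor call mirrors the one in Example #7, with placeholder values standing in for the fetcher's configuration:
 // Sketch only: run a small robots.txt body through the processor above.
 $max_description_len = 2000;   // placeholder; normally fetcher configuration
 $summarizer_option = NULL;     // placeholder; normally fetcher configuration
 $processor = new RobotProcessor(array(), $max_description_len,
     $summarizer_option);
 $robots_txt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 5\n";
 $summary = $processor->process($robots_txt,
     "http://www.example.com/robots.txt");
 print_r($summary[CrawlConstants::ROBOT_PATHS]);   // allowed/disallowed paths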
Example #7
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non-robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
             /* we print out errors to std output. We still go ahead and
                   process the page. Maybe it is a cool error page, also
                   this makes sure we don't crawl it again
                */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                        disallow crawling if robots.txt had any error other
                        than not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
         /* Deals with short URLs by directing them to the original link;
            for robots.txt we don't want to introduce stuff that can be
            mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
             array_push($site[self::LOCATION], $tmp_loc);
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                 //if HEBREW WINDOWS-1255 use ISO-8859 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were disallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
     // end for
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }
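For orientation, the shape of one $site_pages entry implied by the reads above; keys are the CrawlConstants constants used throughout these examples, the values are illustrative only, and $fetcher stands in for the object this method belongs to:
 // Illustrative only: one downloaded-page record as consumed above.
 $site_pages = array(
     array(
         CrawlConstants::URL => "http://www.example.com/",
         CrawlConstants::HTTP_CODE => 200,
         CrawlConstants::TYPE => "text/html",
         CrawlConstants::ENCODING => "UTF-8",
         CrawlConstants::PAGE => "<html><body>Hello</body></html>",
     ),
 );
 $summaries = $fetcher->processFetchPages($site_pages);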
Example #8
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" . "relationships/hyperlink";
     $i = 0;
     $relationships = $dom->getElementsByTagName("Relationships");
     foreach ($relationships as $relationship) {
         $relations = $relationship->getElementsByTagName("Relationship");
         foreach ($relations as $relation) {
             if (strcmp($relation->getAttribute('Type'), $hyperlink) == 0) {
                 if ($i < MAX_LINKS_TO_EXTRACT) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink($link, $site);
                     if (!UrlParser::checkRecursiveUrl($url) && strlen($url) < MAX_URL_LEN) {
                         if (isset($sites[$url])) {
                             $sites[$url] .= " " . $link;
                         } else {
                             $sites[$url] = $link;
                         }
                         $i++;
                     }
                 }
             }
         }
     }
     return $sites;
 }
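A usage sketch; DocxProcessor is an assumed name for the class holding this links() method, and the relationships document is illustrative:
 // Sketch only: extract hyperlink targets from an OOXML .rels part.
 $rels_xml = '<Relationships xmlns="http://schemas.openxmlformats.org/' .
     'package/2006/relationships"><Relationship Id="rId1" Type=' .
     '"http://schemas.openxmlformats.org/officeDocument/2006/relationships/' .
     'hyperlink" Target="sub/page.html"/></Relationships>';
 $dom = new DOMDocument();
 @$dom->loadXML($rels_xml);
 $links = DocxProcessor::links($dom, "http://www.example.com/doc.docx");
 print_r($links);   // canonicalized url => original Target value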
Example #9
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//" .
         "p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
     $i = 0;
     foreach ($paras as $para) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $hlink = $para->parentNode->parentNode->getElementsByTagName("t")->item(0)->nodeValue;
             $url = UrlParser::canonicalLink($hlink, $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 0) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " " . $hlink;
                 } else {
                     $sites[$url] = $hlink;
                 }
             }
         }
         $i++;
     }
     return $sites;
 }
Example #10
 /**
  * Make relative links canonical with respect to the provided $url
  * for links appearing within the given DOM node.
  *
  * @param object $node dom node to fix links for
  * @param string $url url to use to canonicalize links
  * @return object updated dom node
  */
 function canonicalizeLinks($node, $url)
 {
     if (!isset($node->childNodes->length) || get_class($node) != 'DOMElement') {
         return $node;
     }
     for ($k = 0; $k < $node->childNodes->length; $k++) {
         if (!$node->childNodes->item($k)) {
             break;
         }
         $clone = $node->childNodes->item($k)->cloneNode(true);
         $tag_name = isset($clone->tagName) ? $clone->tagName : "-1";
         if (in_array($tag_name, array("a", "link"))) {
             if ($clone->hasAttribute("href")) {
                 $href = $clone->getAttribute("href");
                 if ($href != "" && $href[0] != "#") {
                     $href = UrlParser::canonicalLink($href, $url, false);
                 }
                 /*
                    Modify non-link tag urls so that they are looked up in
                    the cache before going to the live site
                 */
                 if ($tag_name != "link" && ($href == "" || $href[0] != "#")) {
                     $href = urlencode($href);
                     $href = $href . "&from_cache=true";
                     $crawl_time = $this->getIndexTimestamp();
                     $href = $this->baseLink() . "&a=cache&q&arg" . "={$href}&its={$crawl_time}";
                 }
                 $clone->setAttribute("href", $href);
                 //an anchor might have an img tag within it, so recurse
                 $clone = $this->canonicalizeLinks($clone, $url);
                 $node->replaceChild($clone, $node->childNodes->item($k));
             }
         } else {
             if (in_array($tag_name, array("img", "object", "script"))) {
                 if ($clone->hasAttribute("src")) {
                     $src = $clone->getAttribute("src");
                     $src = UrlParser::canonicalLink($src, $url, false);
                     $clone->setAttribute("src", $src);
                     $node->replaceChild($clone, $node->childNodes->item($k));
                 }
             } else {
                 if ($tag_name != -1) {
                     $clone = $this->canonicalizeLinks($clone, $url);
                     if (is_object($clone)) {
                         $node->replaceChild($clone, $node->childNodes->item($k));
                     }
                 }
             }
         }
     }
     return $node;
 }
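A usage sketch: rewriting the links of a cached page before it is displayed. $controller stands in for the object this method belongs to, which must also supply the getIndexTimestamp() and baseLink() methods called above:
 // Sketch only: canonicalize/rewrite links inside a cached page's <body>.
 $cached_page_html = '<html><body><a href="about.html">About</a>' .
     '<img src="logo.png" alt="logo"></body></html>';
 $dom = new DOMDocument();
 @$dom->loadHTML($cached_page_html);
 $body = $dom->getElementsByTagName("body")->item(0);
 if ($body) {
     $controller->canonicalizeLinks($body,
         "http://www.example.com/some/page.html");
 }
 $output = $dom->saveHTML();   // page with rewritten href/src attributes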
Example #11
 /**
  * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems.
  * For each feed source, this downloads the feed, checks which items are
  * not yet in the database, and adds them. This method does not update
  * the inverted index shard.
  *
  * @param array $feeds list of feeds to download
  * @param int $age how many seconds old records should be ignored
  */
 function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
 {
     $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL", CrawlConstants::PAGE, true, NULL, true);
     $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
     foreach ($feeds as $feed) {
         $is_html = $feed['TYPE'] == 'html' ? true : false;
         crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
         if (!$feed[CrawlConstants::PAGE]) {
             crawlLog("...No data in feed skipping.");
             continue;
         }
         $dom = new DOMDocument();
         if ($is_html) {
             @$dom->loadHTML($feed[CrawlConstants::PAGE]);
         } else {
             @$dom->loadXML($feed[CrawlConstants::PAGE]);
         }
         crawlLog("...done. Extracting info about whole feed.");
         $lang = "";
         if ($feed['TYPE'] != 'html' && (!isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "")) {
             $languages = $dom->getElementsByTagName('language');
             if ($languages && is_object($languages) && is_object($languages->item(0))) {
                 $lang = $languages->item(0)->textContent;
                 $db->execute($sql, array($lang, $feed['TIMESTAMP']));
             }
         } else {
             if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                 $lang = $feed["LANGUAGE"];
             } else {
                 $lang = DEFAULT_LOCALE;
             }
         }
         crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
         if ($is_html) {
             $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
             if (!$sub_dom) {
                 crawlLog("... Scraper couldn't parse channel" . " path so bailing on this feed.");
                 continue;
             } else {
                 crawlLog("...Channel scraped.");
             }
             $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
             $rss_elements = array("title" => $feed['TITLE_PATH'], "description" => $feed['DESCRIPTION_PATH'], "link" => $feed['LINK_PATH']);
         } else {
             $nodes = $dom->getElementsByTagName('item');
             $rss_elements = array("title" => "title", "description" => "description", "link" => "link", "guid" => "guid", "pubDate" => "pubDate");
             if ($nodes->length == 0) {
                 // maybe we're dealing with atom rather than rss
                 $nodes = $dom->getElementsByTagName('entry');
                 $rss_elements = array("title" => "title", "description" => "summary", "link" => "link", "guid" => "id", "pubDate" => "updated");
             }
         }
         crawlLog("...done extracting info. Check for new news " . "items in {$feed['NAME']}.");
         $num_added = 0;
         $num_seen = 0;
         foreach ($nodes as $node) {
             $item = array();
             foreach ($rss_elements as $db_element => $feed_element) {
                 crawlTimeoutLog("..still adding feed items to index.");
                 if ($is_html) {
                     $tag_nodes = $this->getTags($node, $feed_element);
                     if (!isset($tag_nodes[0])) {
                         $tag_node = NULL;
                     } else {
                         $tag_node = $tag_nodes[0];
                     }
                     $element_text = is_object($tag_node) ? $tag_node->textContent : "";
                 } else {
                     $tag_node = $node->getElementsByTagName($feed_element)->item(0);
                     $element_text = is_object($tag_node) ? $tag_node->nodeValue : "";
                 }
                 if ($db_element == "link" && $tag_node && ($element_text == "" || $is_html)) {
                     if ($is_html) {
                         $element_text = $tag_node->documentElement->getAttribute("href");
                     } else {
                         $element_text = $tag_node->getAttribute("href");
                     }
                     $element_text = UrlParser::canonicalLink($element_text, $feed["SOURCE_URL"]);
                 }
                 $item[$db_element] = strip_tags($element_text);
             }
             $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang, $age);
             if ($did_add) {
                 $num_added++;
             }
             $num_seen++;
         }
         crawlLog("...added {$num_added} news items of {$num_seen} " . "on rss page.\n Done Processing {$feed['NAME']}.");
     }
 }
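The shape of one $feeds record implied by the reads above, with illustrative values; html-type feeds additionally carry the CHANNEL_PATH, ITEM_PATH, TITLE_PATH, DESCRIPTION_PATH, and LINK_PATH fields used earlier, and $source_model stands in for the object this method belongs to:
 // Illustrative only: one rss-type feed record as consumed above.
 $feeds = array(
     array(
         "NAME" => "Example News",
         "TYPE" => "rss",
         "SOURCE_URL" => "http://www.example.com/feed.rss",
         "LANGUAGE" => "en-US",
         "TIMESTAMP" => 1400000000,
     ),
 );
 $source_model->updateFeedItemsOneGo($feeds, ONE_WEEK);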
Example #12
 /**
  * Returns a url text pair where the url comes from the link of
  * the given item node and the text comes from the text data for that node.
  * URLs are canonicalized according to $site.
  *
  * @param object $item_node the DOMNode to get a link and text from
  * @param string $link_name name of link tag
  * @param string $text_name name of text tag to associate with link
  * @param string $site   a string containing a url
  * @param bool $atom if the feed is atom or rss
  *
  * @return array a url,text pair
  */
 static function linkAndTexts($item_node, $link_name, $text_name, $site, $atom = false)
 {
     foreach ($item_node->childNodes as $node) {
         if ($node->nodeName == $link_name) {
             if (!$atom) {
                 $url = UrlParser::canonicalLink($node->textContent, $site);
             } else {
                 $url = UrlParser::canonicalLink($node->getAttribute("href"), $site);
             }
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 return false;
             }
         }
         if ($node->nodeName == $text_name) {
             $text = $node->textContent;
             if ($text == "") {
                 $text = "RSS Feed";
                 if ($atom) {
                     $text = "Atom Feed";
                 }
             }
         }
     }
     if (!isset($url) || $url == "") {
         return false;
     }
     $text = mb_ereg_replace("(\\s)+", " ", $text);
     return array($url, $text);
 }
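A usage sketch; RssProcessor is an assumed name for the class holding linkAndTexts(), and the feed snippet is illustrative:
 // Sketch only: pull the url/text pair from the first <item> of an RSS feed.
 $rss = '<rss version="2.0"><channel><item><title>A Story</title>' .
     '<link>/story.html</link></item></channel></rss>';
 $dom = new DOMDocument();
 @$dom->loadXML($rss);
 $item = $dom->getElementsByTagName("item")->item(0);
 $pair = RssProcessor::linkAndTexts($item, "link", "title",
     "http://www.example.com/feed.rss");
 if ($pair !== false) {
     list($url, $text) = $pair;   // canonicalized link and its title text
 }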