Example #1
 /**
  * Checks that a relative link and a base link can be combined into a
  * complete link in a variety of ways
  */
 function canonicalLinkTestCase()
 {
     $test_links = array(
         array(".", "http://www.example.com/", "http://www.example.com/", "root dir0"),
         array("/bob.html", "http://www.example.com/", "http://www.example.com/bob.html", "root dir1"),
         array("bob.html", "http://www.example.com/", "http://www.example.com/bob.html", "root dir2"),
         array("bob", "http://www.example.com/", "http://www.example.com/bob", "root dir3"),
         array("bob", "http://www.example.com", "http://www.example.com/bob", "root dir4"),
         array("http://print.bob.com/bob", "http://www.example.com", "http://print.bob.com/bob", "root dir5"),
         array("/.", "http://www.example.com/", "http://www.example.com/", "root dir6"),
         array("//slashdot.org", "http://www.slashdot.org", "http://slashdot.org/", "slashdot dir"),
         array("bob", "http://www.example.com/a", "http://www.example.com/a/bob", "sub dir1"),
         array("../bob", "http://www.example.com/a", "http://www.example.com/bob", "sub dir2"),
         array("../../bob", "http://www.example.com/a", NULL, "sub dir3"),
         array("./bob", "http://www.example.com/a", "http://www.example.com/a/bob", "sub dir4"),
         array("bob.html?a=1", "http://www.example.com/a", "http://www.example.com/a/bob.html?a=1", "query 1"),
         array("bob?a=1&b=2", "http://www.example.com/a", "http://www.example.com/a/bob?a=1&b=2", "query 2"),
         array("/?a=1&b=2", "http://www.example.com/a", "http://www.example.com/?a=1&b=2", "query 3"),
         array("?a=1&b=2", "http://www.example.com/a", "http://www.example.com/a/?a=1&b=2", "query 4"),
         array("b/b.html?a=1&b=2", "http://www.example.com/a/c", "http://www.example.com/a/c/b/b.html?a=1&b=2", "query 5"),
         array("b/b.html?a=1&b=2?c=4", "http://www.example.com/a/c", "http://www.example.com/a/c/b/b.html?a=1&b=2?c=4", "query 6"),
         array("b#1", "http://www.example.com/", "http://www.example.com/b#1", "fragment 1"),
         array("b?a=1#1", "http://www.example.com/", "http://www.example.com/b?a=1#1", "fragment 2"),
         array("b?a=1#1#2", "http://www.example.com/", "http://www.example.com/b?a=1#1#2", "fragment 3"),
         array("#a", "http://www.example.com/c:d", "http://www.example.com/c:d#a", "fragment 4"));
     foreach ($test_links as $test_link) {
         $result = UrlParser::canonicalLink($test_link[0], $test_link[1], false);
         $this->assertEqual($result, $test_link[2], $test_link[3]);
     }
 }
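Two of the table rows above, exercised directly (a minimal sketch; it assumes the UrlParser class used by the test is already loaded and passes the same third argument the test does):
 // Minimal sketch: expected values come from the "sub dir2" and
 // "slashdot dir" rows of the table above.
 $url = UrlParser::canonicalLink("../bob", "http://www.example.com/a", false);
 // $url should be "http://www.example.com/bob"
 $url = UrlParser::canonicalLink("//slashdot.org", "http://www.slashdot.org", false);
 // $url should be "http://slashdot.org/"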
Example #2
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $base_refs = $xpath->evaluate("/html//base");
     if ($base_refs->item(0)) {
         $tmp_site = $base_refs->item(0)->getAttribute('href');
         if (strlen($tmp_site) > 0) {
             $site = UrlParser::canonicalLink($tmp_site, $site);
         }
     }
     $i = 0;
     $hrefs = $xpath->evaluate("/html/body//a");
     foreach ($hrefs as $href) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $rel = $href->getAttribute("rel");
             if ($rel == "" || !stristr($rel, "nofollow")) {
                 $url = UrlParser::canonicalLink($href->getAttribute('href'), $site);
                 $len = strlen($url);
                 if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                     $text = $href->nodeValue;
                     if (isset($sites[$url])) {
                         $sites[$url] .= " .. " . preg_replace("/\\s+/", " ", strip_tags($text));
                         $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                     } else {
                         $sites[$url] = preg_replace("/\\s+/", " ", strip_tags($text));
                         $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                     }
                     $i++;
                 }
             }
         }
     }
     $frames = $xpath->evaluate("/html/frameset/frame|/html/body//iframe");
     foreach ($frames as $frame) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " .. HTMLframe";
                 } else {
                     $sites[$url] = "HTMLframe";
                 }
                 $i++;
             }
         }
     }
     $imgs = $xpath->evaluate("/html/body//img[@alt]");
     $i = 0;
     foreach ($imgs as $img) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $alt = $img->getAttribute('alt');
             if (strlen($alt) < 1) {
                 continue;
             }
             $url = UrlParser::canonicalLink($img->getAttribute('src'), $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 4) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " .. " . $alt;
                     $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                 } else {
                     $sites[$url] = $alt;
                     $sites[$url] = mb_substr($sites[$url], 0, 2 * MAX_LINKS_WORD_TEXT);
                 }
                 $i++;
             }
         }
     }
     return $sites;
 }
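A usage sketch for the extractor above. The class name HtmlProcessor is an assumption for the class holding this links() method, and constants such as MAX_LINKS_TO_EXTRACT, MAX_URL_LEN, and MAX_LINKS_WORD_TEXT must already be defined by the application:
 // Sketch only: extract canonicalized links from a fetched HTML page.
 $html = '<html><body><a href="/about.html">About us</a></body></html>';
 $dom = new DOMDocument();
 @$dom->loadHTML($html);
 $links = HtmlProcessor::links($dom, "http://www.example.com/");
 foreach ($links as $url => $link_text) {
     echo $url . " => " . $link_text . "\n";
 }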
Example #3
 /**
  * Handles admin requests related to managing the machines which perform
  * crawls
  *
  * With this activity an admin can add/delete machines to manage. For each
  * managed machine, the admin can stop and start fetchers/queue_servers
  * as well as look at their log files
  *
  * @return array $data MACHINES, their MACHINE_NAMES, data for
  *     FETCHER_NUMBERS drop-down
  */
 function manageMachines()
 {
     $parent = $this->parent;
     $machine_model = $parent->model("machine");
     $profile_model = $parent->model("profile");
     $data = array();
     $data["ELEMENT"] = "managemachines";
     $possible_arguments = array("addmachine", "deletemachine", "newsmode", "log", "update");
     $data['SCRIPT'] = "doUpdate();";
     $data["leftorright"] = getLocaleDirection() == 'ltr' ? "right" : "left";
     $data['MACHINE_NAMES'] = array();
     $data['FETCHER_NUMBERS'] = array(0 => 0, 1 => 1, 2 => 2, 3 => 3, 4 => 4, 5 => 5, 6 => 6, 7 => 7, 8 => 8, 16 => 16);
     $tmp = tl('system_component_select_machine');
     if (!isset($_REQUEST["has_queue_server"]) || isset($_REQUEST['is_replica'])) {
         $_REQUEST["has_queue_server"] = false;
     }
     if (isset($_REQUEST['is_replica'])) {
         $_REQUEST['num_fetchers'] = 0;
     } else {
         $_REQUEST['parent'] = "";
     }
     $request_fields = array("name" => "string", "url" => "string", "has_queue_server" => "bool", "num_fetchers" => "int", "parent" => "string");
     $r = array();
     $allset = true;
     foreach ($request_fields as $field => $type) {
         if (isset($_REQUEST[$field])) {
             $r[$field] = $parent->clean($_REQUEST[$field], $type);
             if ($type == "string") {
                 $r[$field] = trim($r[$field]);
                 if ($r[$field] == "" && $field != "parent") {
                     $allset = false;
                 }
             }
             if ($field == "url") {
                 if (isset($r[$field][strlen($r[$field]) - 1]) && $r[$field][strlen($r[$field]) - 1] != "/") {
                     $r[$field] .= "/";
                 }
                 $r[$field] = UrlParser::canonicalLink($r[$field], NAME_SERVER);
                 if (!$r[$field]) {
                     $allset = false;
                 }
             }
         } else {
             $allset = false;
         }
     }
     if (isset($r["num_fetchers"]) && in_array($r["num_fetchers"], $data['FETCHER_NUMBERS'])) {
         $data['FETCHER_NUMBER'] = $r["num_fetchers"];
     } else {
         $data['FETCHER_NUMBER'] = 0;
         if (isset($r["num_fetchers"])) {
             $r["num_fetchers"] = 0;
         }
     }
     $machine_exists = isset($r["name"]) && $machine_model->checkMachineExists("NAME", $r["name"]) || isset($r["url"]) && $machine_model->checkMachineExists("URL", $r["url"]);
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addmachine":
                 if ($allset == true && !$machine_exists) {
                     $machine_model->addMachine($r["name"], $r["url"], $r["has_queue_server"], $r["num_fetchers"], $r["parent"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_added') . "</h1>');";
                     $data['MACHINE_NAMES'][] = $r["name"];
                     $data['DELETABLE_MACHINES'][$r["name"]] = $r["name"];
                     sort($data['MACHINE_NAMES']);
                 } else {
                     if ($allset && $machine_exists) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_exists') . "</h1>');";
                     } else {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_incomplete') . "</h1>');";
                     }
                 }
                 break;
             case "deletemachine":
                 if (!$machine_exists) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_doesnt_exists') . "</h1>');";
                 } else {
                     $machines = $machine_model->getRows(0, 1, $total_rows, array(array("name", "=", $r["name"], "")));
                     $service_in_use = false;
                     foreach ($machines as $machine) {
                         if ($machine['NAME'] == $r["name"]) {
                             if (isset($machine['STATUSES']) && is_array($machine['STATUSES']) && $machine['STATUSES'] != array()) {
                                 $service_in_use = true;
                                 break;
                             } else {
                                 break;
                             }
                         }
                     }
                     if ($service_in_use) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_stop_service_first') . "</h1>');";
                         break;
                     }
                     $machine_model->deleteMachine($r["name"]);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_deleted') . "</h1>');";
                 }
                 break;
             case "newsmode":
                 $profile = $profile_model->getProfile(WORK_DIRECTORY);
                 $news_modes = array("news_off", "news_web", "news_process");
                 if (isset($_REQUEST['news_mode']) && in_array($_REQUEST['news_mode'], $news_modes)) {
                     $profile["NEWS_MODE"] = $_REQUEST['news_mode'];
                     if ($profile["NEWS_MODE"] != "news_process") {
                         CrawlDaemon::stop("news_updater", "", false);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     } else {
                         CrawlDaemon::start("news_updater", 'none', "", -1);
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_mode_updated') . "</h1>');";
                     }
                     $profile_model->updateProfile(WORK_DIRECTORY, array(), $profile);
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_news_update_failed') . "</h1>');";
                 }
                 break;
             case "log":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 }
                 if (isset($_REQUEST["mirror_name"])) {
                     $r["mirror_name"] = $parent->clean($_REQUEST["mirror_name"], "string");
                 }
                 if (isset($_REQUEST["time"])) {
                     $data["time"] = $parent->clean($_REQUEST["time"], "int") + 30;
                 } else {
                     $data["time"] = 30;
                 }
                 if (isset($_REQUEST["NO_REFRESH"])) {
                     $data["NO_REFRESH"] = $parent->clean($_REQUEST["NO_REFRESH"], "bool");
                 } else {
                     $data["NO_REFRESH"] = false;
                 }
                 $data["ELEMENT"] = "machinelog";
                 $filter = "";
                 if (isset($_REQUEST['f'])) {
                     $filter = $parent->clean($_REQUEST['f'], "string");
                 }
                 $data['filter'] = $filter;
                 $data["REFRESH_LOG"] = "&time=" . $data["time"];
                 $data["LOG_TYPE"] = "";
                 if (isset($r['fetcher_num']) && isset($r['name'])) {
                     $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], $r["fetcher_num"], $filter);
                     $data["LOG_TYPE"] = $r['name'] . " fetcher " . $r["fetcher_num"];
                     $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'] . "&fetcher_num=" . $r['fetcher_num'];
                 } else {
                     if (isset($r["mirror_name"])) {
                         $data["LOG_TYPE"] = $r['mirror_name'] . " mirror";
                         $data["LOG_FILE_DATA"] = $machine_model->getLog($r["mirror_name"], NULL, $filter, true);
                     } else {
                         if (isset($r['name'])) {
                             $data["LOG_TYPE"] = $r['name'] . " queue_server";
                             if ($r['name'] == "news") {
                                 $data["LOG_TYPE"] = "Name Server News Updater";
                             }
                             $data["LOG_FILE_DATA"] = $machine_model->getLog($r["name"], NULL, $filter);
                             $data["REFRESH_LOG"] .= "&arg=log&name=" . $r['name'];
                         }
                     }
                 }
                 if ($data["time"] >= ONE_HOUR / 3) {
                     $data["REFRESH_LOG"] = "";
                 }
                 if (!isset($data["LOG_FILE_DATA"]) || $data["LOG_FILE_DATA"] == "") {
                     $data["LOG_FILE_DATA"] = tl('system_component_no_machine_log');
                 }
                 $lines = array_reverse(explode("\n", $data["LOG_FILE_DATA"]));
                 $data["LOG_FILE_DATA"] = implode("\n", $lines);
                 break;
             case "update":
                 if (isset($_REQUEST["fetcher_num"])) {
                     $r["fetcher_num"] = $parent->clean($_REQUEST["fetcher_num"], "int");
                 } else {
                     $r["fetcher_num"] = NULL;
                 }
                 $available_actions = array("start", "stop", "mirror_start", "mirror_stop");
                 if (isset($r["name"]) && isset($_REQUEST["action"]) && in_array($_REQUEST["action"], $available_actions)) {
                     $action = $_REQUEST["action"];
                     $is_mirror = false;
                     if ($action == "mirror_start") {
                         $action = "start";
                         $is_mirror = true;
                     } else {
                         if ($action == "mirror_stop") {
                             $action = "stop";
                             $is_mirror = true;
                         }
                     }
                     $machine_model->update($r["name"], $action, $r["fetcher_num"], $is_mirror);
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_servers_updated') . "</h1>');";
                 } else {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('system_component_machine_no_action') . "</h1>');";
                 }
                 break;
         }
     }
     $parent->pagingLogic($data, $machine_model, "MACHINE", DEFAULT_ADMIN_PAGING_NUM);
     if (!isset($_REQUEST['arg']) || $_REQUEST['arg'] != 'log') {
         $data['SCRIPT'] .= "toggleReplica(false);";
     }
     return $data;
 }
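The machine-url normalization used above (append a trailing slash, then canonicalize against the name server), shown in isolation as a sketch; NAME_SERVER is the installation's name server url constant:
 // Sketch only: normalize a submitted machine url the same way as above.
 $url = isset($_REQUEST['url']) ? trim($_REQUEST['url']) : "";
 if ($url != "" && $url[strlen($url) - 1] != "/") {
     $url .= "/";
 }
 $url = UrlParser::canonicalLink($url, NAME_SERVER);
 if (!$url) {
     // an invalid url makes the machine record incomplete
 }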
Example #4
 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. We allow more links from a sitemap
  * than from other kinds of documents. For now we are ignoring weighting
  * info
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             if ($i > MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }
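A usage sketch for the sitemap extractor; SitemapProcessor is an assumed name for the class holding this links() method, and the sitemap body is illustrative:
 // Sketch only: extract canonicalized urls from a small sitemap document.
 $sitemap_xml = '<?xml version="1.0"?>' .
     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' .
     '<url><loc>http://www.example.com/page1.html</loc></url></urlset>';
 $dom = new DOMDocument();
 @$dom->loadXML($sitemap_xml);
 $links = SitemapProcessor::links($dom, "http://www.example.com/sitemap.xml");
 print_r(array_keys($links));   // the canonicalized <loc> urls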
Example #5
 /**
  * Handles admin requests related to the search sources activity
  *
  * The search sources activity allows a user to add/delete search sources
  * for video and news; it also allows a user to control which subsearches
  * appear on the SearchView page
  *
  * @return array $data info about current search sources, and current
  *     sub-searches
  */
 function searchSources()
 {
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $source_model = $parent->model("source");
     $possible_arguments = array("addsource", "deletesource", "addsubsearch", "deletesubsearch", "editsource", "editsubsearch");
     $data = array();
     $data["ELEMENT"] = "searchsources";
     $data['SCRIPT'] = "";
     $data['SOURCE_TYPES'] = array(-1 => tl('crawl_component_media_kind'), "video" => tl('crawl_component_video'), "rss" => tl('crawl_component_rss_feed'), "html" => tl('crawl_component_html_feed'));
     $source_type_flag = false;
     if (isset($_REQUEST['type']) && in_array($_REQUEST['type'], array_keys($data['SOURCE_TYPES']))) {
         $data['SOURCE_TYPE'] = $_REQUEST['type'];
         $source_type_flag = true;
     } else {
         $data['SOURCE_TYPE'] = -1;
     }
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data["SEARCH_LISTS"] = array(-1 => tl('crawl_component_sources_indexes'));
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["i:" . $item["CRAWL_TIME"]] = $item["DESCRIPTION"];
     }
     if (isset($_SESSION['USER_ID'])) {
         $user = $_SESSION['USER_ID'];
     } else {
         $user = $_SERVER['REMOTE_ADDR'];
     }
     $search_lists = $crawl_model->getMixList($user);
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["m:" . $item["TIMESTAMP"]] = $item["NAME"];
     }
     $n = NUM_RESULTS_PER_PAGE;
     $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n);
     if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'], array_keys($data['PER_PAGE']))) {
         $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page'];
     } else {
         $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
     }
     $locales = $parent->model("locale")->getLocaleList();
     $data["LANGUAGES"] = array();
     foreach ($locales as $locale) {
         $data["LANGUAGES"][$locale['LOCALE_TAG']] = $locale['LOCALE_NAME'];
     }
     if (isset($_REQUEST['language']) && in_array($_REQUEST['language'], array_keys($data["LANGUAGES"]))) {
         $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language'];
     } else {
         $data['SOURCE_LOCALE_TAG'] = DEFAULT_LOCALE;
     }
     $data["CURRENT_SOURCE"] = array("name" => "", "type" => $data['SOURCE_TYPE'], "source_url" => "", "aux_info" => "", 'channel_path' => "", 'item_path' => "", 'title_path' => "", 'description_path' => "", 'link_path' => "", "language" => $data['SOURCE_LOCALE_TAG']);
     $data["CURRENT_SUBSEARCH"] = array("locale_string" => "", "folder_name" => "", "index_identifier" => "", "per_page" => $data['PER_PAGE_SELECTED']);
     $data['SOURCE_FORM_TYPE'] = "addsource";
     $data["SEARCH_FORM_TYPE"] = "addsubsearch";
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addsource":
                 if (!$source_type_flag) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_source_type') . "</h1>');";
                     break;
                 }
                 $must_have = array("name", "type", 'source_url');
                 $is_html_feed = false;
                 if (isset($_REQUEST['type']) && $_REQUEST['type'] == 'html') {
                     $is_html_feed = true;
                     $must_have = array_merge($must_have, array('channel_path', 'item_path', 'title_path', 'description_path', 'link_path'));
                 }
                 $to_clean = array_merge($must_have, array('aux_info', 'language'));
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     if ($clean_me == "source_url") {
                         $r[$clean_me] = UrlParser::canonicalLink($r[$clean_me], NAME_SERVER);
                         echo $r[$clean_me] . "\n";
                         if (!$r[$clean_me]) {
                             $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_invalid_url') . "</h1>');";
                             break 2;
                         }
                     }
                     if (in_array($clean_me, $must_have) && $r[$clean_me] == "") {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 if ($is_html_feed) {
                     $r['aux_info'] = $r['channel_path'] . "###" . $r['item_path'] . "###" . $r['title_path'] . "###" . $r['description_path'] . "###" . $r['link_path'];
                 }
                 $source_model->addMediaSource($r['name'], $r['type'], $r['source_url'], $r['aux_info'], $r['language']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_added') . "</h1>');";
                 break;
             case "addsubsearch":
                 $to_clean = array("folder_name", 'index_identifier');
                 $must_have = $to_clean;
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     if (in_array($clean_me, $must_have) && ($r[$clean_me] == "" || $r[$clean_me] == -1)) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 $source_model->addSubsearch($r['folder_name'], $r['index_identifier'], $data['PER_PAGE_SELECTED']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_added') . "</h1>');";
                 break;
             case "deletesource":
                 if (!isset($_REQUEST['ts'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $timestamp = $parent->clean($_REQUEST['ts'], "string");
                 $source_model->deleteMediaSource($timestamp);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_deleted') . "</h1>');";
                 break;
             case "deletesubsearch":
                 if (!isset($_REQUEST['fn'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $folder_name = $parent->clean($_REQUEST['fn'], "string");
                 $source_model->deleteSubsearch($folder_name);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_deleted') . "</h1>');";
                 break;
             case "editsubsearch":
                 $data['SEARCH_FORM_TYPE'] = "editsubsearch";
                 $subsearch = false;
                 $folder_name = isset($_REQUEST['fn']) ? $parent->clean($_REQUEST['fn'], "string") : "";
                 if ($folder_name) {
                     $subsearch = $source_model->getSubsearch($folder_name);
                 }
                 if (!$subsearch) {
                     $data['SOURCE_FORM_TYPE'] = "addsubsearch";
                     break;
                 }
                 $data['fn'] = $folder_name;
                 $update = false;
                 foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         $subsearch[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         $update = true;
                     } else {
                         if (isset($subsearch[$upper_field])) {
                             $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     $source_model->updateSubsearch($subsearch);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_updated') . "</h1>');";
                 }
                 break;
             case "editsource":
                 $data['SOURCE_FORM_TYPE'] = "editsource";
                 $source = false;
                 $timestamp = isset($_REQUEST['ts']) ? $parent->clean($_REQUEST['ts'], "string") : "";
                 if ($timestamp) {
                     $source = $source_model->getMediaSource($timestamp);
                 }
                 if (!$source) {
                     $data['SOURCE_FORM_TYPE'] = "addsource";
                     break;
                 }
                 $data['ts'] = $timestamp;
                 $update = false;
                 $is_html_feed = false;
                 if ($source['TYPE'] == 'html') {
                     $is_html_feed = true;
                     list($source['CHANNEL_PATH'], $source['ITEM_PATH'], $source['TITLE_PATH'], $source['DESCRIPTION_PATH'], $source['LINK_PATH']) = explode("###", $source['AUX_INFO']);
                 }
                 foreach ($data['CURRENT_SOURCE'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         $source[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         $update = true;
                     } else {
                         if (isset($source[$upper_field])) {
                             $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     if ($is_html_feed) {
                         $source['AUX_INFO'] = $source['CHANNEL_PATH'] . "###" . $source['ITEM_PATH'] . "###" . $source['TITLE_PATH'] . "###" . $source['DESCRIPTION_PATH'] . "###" . $source['LINK_PATH'];
                     }
                     unset($source['CHANNEL_PATH']);
                     unset($source['ITEM_PATH']);
                     unset($source['TITLE_PATH']);
                     unset($source['DESCRIPTION_PATH']);
                     unset($source['LINK_PATH']);
                     $source_model->updateMediaSource($source);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_updated') . "</h1>');";
                 }
                 break;
         }
     }
     $data['CAN_LOCALIZE'] = $parent->model("user")->isAllowedUserActivity($_SESSION['USER_ID'], "manageLocales");
     $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("NAME", "", "", "ASC")));
     $parent->pagingLogic($data, $source_model, "SUBSEARCHES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("FOLDER_NAME", "", "", "ASC")), "SUB", "SUBSEARCH");
     foreach ($data["SUBSEARCHES"] as $search) {
         if (!isset($data["SEARCH_LISTS"][trim($search['INDEX_IDENTIFIER'])])) {
             $source_model->deleteSubsearch($search["FOLDER_NAME"]);
         }
     }
     $data['SCRIPT'] .= "source_type = elt('source-type');" . "source_type.onchange = switchSourceType;" . "switchSourceType()";
     return $data;
 }
Example #6
 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * @param string $page text string of a document
  * @param string $url location the document came from; used here to
  *     produce complete links for any sitemap urls found in the
  *     robots.txt file
  *
  * @return array a summary of (title, description, links, and content) of
  *     the information in $page
  */
 function process($page, $url)
 {
     $summary = NULL;
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     $add_rule_state = false;
     $rule_added_flag = false;
     $delay_flag = false;
     $delay = 0;
     foreach ($lines as $pre_line) {
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         $specificness = 0;
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 $summary[self::AGENT_LIST][] = $value;
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     $offset = $new_offset;
                 }
                 break;
             case "sitemap":
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if (!UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 if ($add_rule_state) {
                     $delay_flag = true;
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }
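A usage sketch for the parser above. Example #7 refers to this processor as RobotProcessor, so that name is used here; the constructor call mirrors the one in Example #7, with placeholder values standing in for the fetcher's configuration:
 // Sketch only: run a small robots.txt body through the processor above.
 $max_description_len = 2000;   // placeholder; normally fetcher configuration
 $summarizer_option = NULL;     // placeholder; normally fetcher configuration
 $processor = new RobotProcessor(array(), $max_description_len,
     $summarizer_option);
 $robots_txt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 5\n";
 $summary = $processor->process($robots_txt,
     "http://www.example.com/robots.txt");
 print_r($summary[CrawlConstants::ROBOT_PATHS]);   // allowed/disallowed paths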
Example #7
 /**
  * Processes an array of downloaded web pages with the appropriate page
  * processor.
  *
  * Summary data is extracted from each non-robots.txt file in the array.
  * Disallowed paths and crawl-delays are extracted from robots.txt files.
  *
  * @param array $site_pages a collection of web pages to process
  * @return array summary data extracted from these pages
  */
 function processFetchPages($site_pages)
 {
     $PAGE_PROCESSORS = $this->page_processors;
     crawlLog("Start process pages... Current Memory:" . memory_get_usage());
     $start_time = microtime();
     $prefix = $this->fetcher_num . "-";
     $stored_site_pages = array();
     $summarized_site_pages = array();
     $num_items = $this->web_archive->count;
     $i = 0;
     foreach ($site_pages as $site) {
         $response_code = $site[self::HTTP_CODE];
         $was_error = false;
         if ($response_code < 200 || $response_code >= 300) {
             crawlLog($site[self::URL] . " response code {$response_code}");
             $host = UrlParser::getHost($site[self::URL]);
             if (!isset($this->hosts_with_errors[$host])) {
                 $this->hosts_with_errors[$host] = 0;
             }
             if ($response_code >= 400 || $response_code < 100) {
                 // < 100 will capture failures to connect which are returned
                 // as strings
                 $was_error = true;
                 $this->hosts_with_errors[$host]++;
             }
             /* we print out errors to std output. We still go ahead and
                   process the page. Maybe it is a cool error page, also
                   this makes sure we don't crawl it again
                */
         }
         // text/robot is my made up mimetype for robots.txt files
         $was_robot_error = false;
         if (isset($site[self::ROBOT_PATHS])) {
             if (!$was_error) {
                 $type = "text/robot";
             } else {
                 $type = $site[self::TYPE];
                 if ($response_code != 404) {
                     /*
                        disallow crawling if robots.txt had any error other
                        than not found
                     */
                     $was_robot_error = true;
                     $site[self::ROBOT_PATHS][] = "/";
                 }
             }
         } else {
             if (isset($site[self::FILE_NAME])) {
                 $extension = UrlParser::getDocumentType($site[self::FILE_NAME]);
                 if ($extension == $this->programming_language_extension['java']) {
                     $type = "text/java";
                 } else {
                     if ($extension == $this->programming_language_extension['py']) {
                         $type = "text/py";
                     } else {
                         $type = $site[self::TYPE];
                     }
                 }
             } else {
                 $type = $site[self::TYPE];
             }
         }
         $handled = false;
         /* Deals with short URLs by directing them to the original link;
            for robots.txt we don't want to introduce stuff that can be
            mis-parsed (we follow redirects in this case anyway) */
         if (isset($site[self::LOCATION]) && count($site[self::LOCATION]) > 0 && strcmp($type, "text/robot") != 0) {
             array_unshift($site[self::LOCATION], $site[self::URL]);
             $tmp_loc = array_pop($site[self::LOCATION]);
             $tmp_loc = UrlParser::canonicalLink($tmp_loc, $site[self::URL]);
             array_push($site[self::LOCATION], $tmp_loc);
             $doc_info = array();
             $doc_info[self::LINKS][$tmp_loc] = "location:" . $site[self::URL];
             $doc_info[self::LOCATION] = true;
             $doc_info[self::DESCRIPTION] = $site[self::URL] . " => " . $tmp_loc;
             $doc_info[self::PAGE] = $doc_info[self::DESCRIPTION];
             $doc_info[self::TITLE] = $site[self::URL];
             $text_data = true;
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             $handled = true;
         } else {
             if (isset($PAGE_PROCESSORS[$type])) {
                 $page_processor = $PAGE_PROCESSORS[$type];
                 if (generalIsA($page_processor, "TextProcessor")) {
                     $text_data = true;
                 } else {
                     $text_data = false;
                 }
             } else {
                 crawlLog("No page processor for mime type: " . $type);
                 crawlLog("Not processing: " . $site[self::URL]);
                 continue;
             }
         }
         if (!$handled) {
             if (isset($this->plugin_processors[$page_processor])) {
                 $processor = new $page_processor($this->plugin_processors[$page_processor], $this->max_description_len, $this->summarizer_option);
             } else {
                 $processor = new $page_processor(array(), $this->max_description_len, $this->summarizer_option);
             }
         }
         if (isset($site[self::PAGE]) && !$handled) {
             if (!isset($site[self::ENCODING])) {
                 $site[self::ENCODING] = "UTF-8";
             }
             //if not UTF-8 convert before doing anything else
             if (isset($site[self::ENCODING]) && $site[self::ENCODING] != "UTF-8" && $site[self::ENCODING] != "" && generalIsA($page_processor, "TextProcessor")) {
                 if (!@mb_check_encoding($site[self::PAGE], $site[self::ENCODING])) {
                     crawlLog("  MB_CHECK_ENCODING FAILED!!");
                 }
                 crawlLog("  Converting from encoding " . $site[self::ENCODING] . "...");
                 //if HEBREW WINDOWS-1255 use ISO-8859 instead
                 if (stristr($site[self::ENCODING], "1255")) {
                     $site[self::ENCODING] = "ISO-8859-8";
                     crawlLog("  using encoding " . $site[self::ENCODING] . "...");
                 }
                 if (stristr($site[self::ENCODING], "1256")) {
                     $site[self::PAGE] = w1256ToUTF8($site[self::PAGE]);
                     crawlLog("  using Yioop hack encoding ...");
                 } else {
                     $site[self::PAGE] = @mb_convert_encoding($site[self::PAGE], "UTF-8", $site[self::ENCODING]);
                 }
             }
             crawlLog("  Using Processor..." . $page_processor);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $tmp_url_store = $site[self::URL];
                 $site[self::URL] = $site[self::FILE_NAME];
             }
             $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $site[self::URL] = $tmp_url_store;
             }
             if (!$doc_info) {
                 crawlLog("  Processing Yielded No Data For: " . $site[self::URL]);
             }
             if ($page_processor != "RobotProcessor" && !isset($doc_info[self::JUST_METAS])) {
                 $this->pruneLinks($doc_info, CrawlConstants::LINKS, $start_time);
             }
         } else {
             if (!$handled) {
                 $doc_info = false;
             }
         }
         $not_loc = true;
         if ($doc_info) {
             $site[self::DOC_INFO] = $doc_info;
             if (isset($doc_info[self::LOCATION])) {
                 $site[self::HASH] = crawlHash(crawlHash($site[self::URL], true) . "LOCATION", true);
                 $not_loc = false;
             }
             $site[self::ROBOT_INSTANCE] = $prefix . ROBOT_INSTANCE;
             if (!is_dir(CRAWL_DIR . "/cache")) {
                 mkdir(CRAWL_DIR . "/cache");
                 $htaccess = "Options None\nphp_flag engine off\n";
                 file_put_contents(CRAWL_DIR . "/cache/.htaccess", $htaccess);
             }
             if ($type == "text/robot" && isset($doc_info[self::PAGE])) {
                 $site[self::PAGE] = $doc_info[self::PAGE];
             }
             if ($text_data) {
                 if (isset($doc_info[self::PAGE])) {
                     $site[self::PAGE] = $doc_info[self::PAGE];
                 } else {
                     $site[self::PAGE] = NULL;
                 }
                 if ($not_loc) {
                     $content = $doc_info[self::DESCRIPTION];
                     $site[self::HASH] = FetchUrl::computePageHash($content);
                 }
             } else {
                 $site[self::HASH] = FetchUrl::computePageHash($site[self::PAGE]);
             }
             if (isset($doc_info[self::WORD_CLOUD])) {
                 $site[self::WORD_CLOUD] = $doc_info[self::WORD_CLOUD];
             } else {
                 $site[self::WORD_CLOUD] = NULL;
             }
             if (isset($doc_info[self::CRAWL_DELAY])) {
                 $site[self::CRAWL_DELAY] = $doc_info[self::CRAWL_DELAY];
             }
             if (isset($doc_info[self::ROBOT_PATHS]) && !$was_error) {
                 $site[self::ROBOT_PATHS] = $doc_info[self::ROBOT_PATHS];
             }
             if (!isset($site[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array();
             }
             if (isset($doc_info[self::ROBOT_METAS])) {
                 $site[self::ROBOT_METAS] = array_merge($site[self::ROBOT_METAS], $doc_info[self::ROBOT_METAS]);
             }
             //here's where we enforce NOFOLLOW
             if (in_array("NOFOLLOW", $site[self::ROBOT_METAS]) || in_array("NONE", $site[self::ROBOT_METAS])) {
                 $site[self::DOC_INFO][self::LINKS] = array();
             }
             if (isset($doc_info[self::AGENT_LIST])) {
                 $site[self::AGENT_LIST] = $doc_info[self::AGENT_LIST];
             }
             $this->copySiteFields($i, $site, $summarized_site_pages, $stored_site_pages);
             $summarized_site_pages[$i][self::URL] = strip_tags($site[self::URL]);
             if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                 $summarized_site_pages[$i][self::TITLE] = $site[self::FILE_NAME];
             } else {
                 $summarized_site_pages[$i][self::TITLE] = strip_tags($site[self::DOC_INFO][self::TITLE]);
                 // stripping html to be on the safe side
             }
             if (!isset($site[self::REPOSITORY_TYPE])) {
                 if ($was_robot_error) {
                     $site[self::DOC_INFO][self::DESCRIPTION] = "There was an HTTP error in trying to download " . "this robots.txt file, so all paths to this site " . "were disallowed by Yioop.\n" . $site[self::DOC_INFO][self::DESCRIPTION];
                 }
                 $summarized_site_pages[$i][self::DESCRIPTION] = strip_tags($site[self::DOC_INFO][self::DESCRIPTION]);
             } else {
                 $summarized_site_pages[$i][self::DESCRIPTION] = $site[self::DOC_INFO][self::DESCRIPTION];
             }
             if (isset($site[self::DOC_INFO][self::JUST_METAS]) || isset($site[self::ROBOT_PATHS])) {
                 $summarized_site_pages[$i][self::JUST_METAS] = true;
             }
             if (isset($site[self::DOC_INFO][self::META_WORDS])) {
                 if (!isset($summarized_site_pages[$i][self::META_WORDS])) {
                     $summarized_site_pages[$i][self::META_WORDS] = $site[self::DOC_INFO][self::META_WORDS];
                 } else {
                     $summarized_site_pages[$i][self::META_WORDS] = array_merge($summarized_site_pages[$i][self::META_WORDS], $site[self::DOC_INFO][self::META_WORDS]);
                 }
             }
             if (isset($site[self::DOC_INFO][self::LANG])) {
                 if ($site[self::DOC_INFO][self::LANG] == 'en' && $site[self::ENCODING] != "UTF-8") {
                     $site[self::DOC_INFO][self::LANG] = guessLangEncoding($site[self::ENCODING]);
                 }
                 $summarized_site_pages[$i][self::LANG] = $site[self::DOC_INFO][self::LANG];
             }
             if (isset($site[self::DOC_INFO][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = $site[self::DOC_INFO][self::LINKS];
             }
             if (isset($site[self::DOC_INFO][self::WORD_CLOUD])) {
                 $summarized_site_pages[$i][self::WORD_CLOUD] = $site[self::DOC_INFO][self::WORD_CLOUD];
             }
             if (isset($site[self::DOC_INFO][self::THUMB])) {
                 $summarized_site_pages[$i][self::THUMB] = $site[self::DOC_INFO][self::THUMB];
             }
             if (isset($site[self::DOC_INFO][self::SUBDOCS])) {
                 $this->processSubdocs($i, $site, $summarized_site_pages, $stored_site_pages);
             }
             if (isset($summarized_site_pages[$i][self::LINKS])) {
                 $summarized_site_pages[$i][self::LINKS] = UrlParser::cleanRedundantLinks($summarized_site_pages[$i][self::LINKS], $summarized_site_pages[$i][self::URL]);
             }
             if (!empty($this->classifiers)) {
                 Classifier::labelPage($summarized_site_pages[$i], $this->classifiers, $this->active_classifiers, $this->active_rankers);
             }
             if ($this->page_rule_parser != NULL) {
                 $this->page_rule_parser->executeRuleTrees($summarized_site_pages[$i]);
             }
             $metas = isset($summarized_site_pages[$i][self::ROBOT_METAS]) ? $summarized_site_pages[$i][self::ROBOT_METAS] : array();
             if (array_intersect($metas, array("NOARCHIVE", "NOINDEX", "JUSTFOLLOW", "NONE")) != array()) {
                 $stored_site_pages[$i] = false;
             }
             $stored_site_pages[$i][self::INDEX] = $i;
             $i++;
         }
     }
     // end for
     $num_pages = count($stored_site_pages);
     $filter_stored = array_filter($stored_site_pages);
     if ($num_pages > 0 && $this->cache_pages) {
         $cache_page_partition = $this->web_archive->addPages(self::OFFSET, $filter_stored);
     } else {
         if ($num_pages > 0) {
             $this->web_archive->addCount(count($filter_stored));
         }
     }
     for ($i = 0; $i < $num_pages; $i++) {
         $summarized_site_pages[$i][self::INDEX] = $num_items + $i;
     }
     foreach ($filter_stored as $stored) {
         $i = $stored[self::INDEX];
         if (isset($stored[self::OFFSET])) {
             $summarized_site_pages[$i][self::OFFSET] = $stored[self::OFFSET];
             $summarized_site_pages[$i][self::CACHE_PAGE_PARTITION] = $cache_page_partition;
         }
     }
     crawlLog("  Process pages time: " . changeInMicrotime($start_time) . " Current Memory: " . memory_get_usage());
     return $summarized_site_pages;
 }
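For orientation, the shape of one $site_pages entry implied by the reads above; keys are the CrawlConstants constants used throughout these examples, the values are illustrative only, and $fetcher stands in for the object this method belongs to:
 // Illustrative only: one downloaded-page record as consumed above.
 $site_pages = array(
     array(
         CrawlConstants::URL => "http://www.example.com/",
         CrawlConstants::HTTP_CODE => 200,
         CrawlConstants::TYPE => "text/html",
         CrawlConstants::ENCODING => "UTF-8",
         CrawlConstants::PAGE => "<html><body>Hello</body></html>",
     ),
 );
 $summaries = $fetcher->processFetchPages($site_pages);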
Example #8
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" . "relationships/hyperlink";
     $i = 0;
     $relationships = $dom->getElementsByTagName("Relationships");
     foreach ($relationships as $relationship) {
         $relations = $relationship->getElementsByTagName("Relationship");
         foreach ($relations as $relation) {
             if (strcmp($relation->getAttribute('Type'), $hyperlink) == 0) {
                 if ($i < MAX_LINKS_TO_EXTRACT) {
                     $link = $relation->getAttribute('Target');
                     $url = UrlParser::canonicalLink($link, $site);
                     if (!UrlParser::checkRecursiveUrl($url) && strlen($url) < MAX_URL_LEN) {
                         if (isset($sites[$url])) {
                             $sites[$url] .= " " . $link;
                         } else {
                             $sites[$url] = $link;
                         }
                         $i++;
                     }
                 }
             }
         }
     }
     return $sites;
 }
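A usage sketch; DocxProcessor is an assumed name for the class holding this links() method, and the relationships document is illustrative:
 // Sketch only: extract hyperlink targets from an OOXML .rels part.
 $rels_xml = '<Relationships xmlns="http://schemas.openxmlformats.org/' .
     'package/2006/relationships"><Relationship Id="rId1" Type=' .
     '"http://schemas.openxmlformats.org/officeDocument/2006/relationships/' .
     'hyperlink" Target="sub/page.html"/></Relationships>';
 $dom = new DOMDocument();
 @$dom->loadXML($rels_xml);
 $links = DocxProcessor::links($dom, "http://www.example.com/doc.docx");
 print_r($links);   // canonicalized url => original Target value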
Example #9
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//" .
         "p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
     $i = 0;
     foreach ($paras as $para) {
         if ($i < MAX_LINKS_TO_EXTRACT) {
             $hlink = $para->parentNode->parentNode->getElementsByTagName("t")->item(0)->nodeValue;
             $url = UrlParser::canonicalLink($hlink, $site);
             $len = strlen($url);
             if (!UrlParser::checkRecursiveUrl($url) && $len < MAX_URL_LEN && $len > 0) {
                 if (isset($sites[$url])) {
                     $sites[$url] .= " " . $hlink;
                 } else {
                     $sites[$url] = $hlink;
                 }
             }
         }
         $i++;
     }
     return $sites;
 }
Example #10
 /**
  * Make relative links canonical with respect to the provided $url
  * for links appearing within the given DOM node.
  *
  * @param object $node dom node to fix links for
  * @param string $url url to use to canonicalize links
  * @return object updated dom node
  */
 function canonicalizeLinks($node, $url)
 {
     if (!isset($node->childNodes->length) || get_class($node) != 'DOMElement') {
         return $node;
     }
     for ($k = 0; $k < $node->childNodes->length; $k++) {
         if (!$node->childNodes->item($k)) {
             break;
         }
         $clone = $node->childNodes->item($k)->cloneNode(true);
         $tag_name = isset($clone->tagName) ? $clone->tagName : "-1";
         if (in_array($tag_name, array("a", "link"))) {
             if ($clone->hasAttribute("href")) {
                 $href = $clone->getAttribute("href");
                 if ($href != "" && $href[0] != "#") {
                     $href = UrlParser::canonicalLink($href, $url, false);
                 }
                 /*
                    Modify non-link tag urls so that they are looked up in
                    the cache before going to the live site
                 */
                 if ($tag_name != "link" && ($href == "" || $href[0] != "#")) {
                     $href = urlencode($href);
                     $href = $href . "&from_cache=true";
                     $crawl_time = $this->getIndexTimestamp();
                     $href = $this->baseLink() . "&a=cache&q&arg" . "={$href}&its={$crawl_time}";
                 }
                 $clone->setAttribute("href", $href);
                 //an anchor might have an img tag within it, so recurse
                 $clone = $this->canonicalizeLinks($clone, $url);
                 $node->replaceChild($clone, $node->childNodes->item($k));
             }
         } else {
             if (in_array($tag_name, array("img", "object", "script"))) {
                 if ($clone->hasAttribute("src")) {
                     $src = $clone->getAttribute("src");
                     $src = UrlParser::canonicalLink($src, $url, false);
                     $clone->setAttribute("src", $src);
                     $node->replaceChild($clone, $node->childNodes->item($k));
                 }
             } else {
                 if ($tag_name != -1) {
                     $clone = $this->canonicalizeLinks($clone, $url);
                     if (is_object($clone)) {
                         $node->replaceChild($clone, $node->childNodes->item($k));
                     }
                 }
             }
         }
     }
     return $node;
 }
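A usage sketch: rewriting the links of a cached page before it is displayed. $controller stands in for the object this method belongs to, which must also supply the getIndexTimestamp() and baseLink() methods called above:
 // Sketch only: canonicalize/rewrite links inside a cached page's <body>.
 $cached_page_html = '<html><body><a href="about.html">About</a>' .
     '<img src="logo.png" alt="logo"></body></html>';
 $dom = new DOMDocument();
 @$dom->loadHTML($cached_page_html);
 $body = $dom->getElementsByTagName("body")->item(0);
 if ($body) {
     $controller->canonicalizeLinks($body,
         "http://www.example.com/some/page.html");
 }
 $output = $dom->saveHTML();   // page with rewritten href/src attributes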
Example #11
 /**
  * Downloads one batch of $feeds_one_go feed items for @see updateFeedItems.
  * For each feed source, this downloads the feed, checks which items are
  * not yet in the database, and adds them. This method does not update
  * the inverted index shard.
  *
  * @param array $feeds list of feeds to download
  * @param int $age how many seconds old records should be ignored
  */
 function updateFeedItemsOneGo($feeds, $age = ONE_WEEK)
 {
     $feeds = FetchUrl::getPages($feeds, false, 0, NULL, "SOURCE_URL", CrawlConstants::PAGE, true, NULL, true);
     $sql = "UPDATE MEDIA_SOURCE SET LANGUAGE=? WHERE TIMESTAMP=?";
     foreach ($feeds as $feed) {
         $is_html = $feed['TYPE'] == 'html' ? true : false;
         crawlLog("Updating {$feed['NAME']}. Making dom object from feed.");
         if (!$feed[CrawlConstants::PAGE]) {
             crawlLog("...No data in feed skipping.");
             continue;
         }
         $dom = new DOMDocument();
         if ($is_html) {
             @$dom->loadHTML($feed[CrawlConstants::PAGE]);
         } else {
             @$dom->loadXML($feed[CrawlConstants::PAGE]);
         }
         crawlLog("...done. Extracting info about whole feed.");
         $lang = "";
         if ($feed['TYPE'] != 'html' && (!isset($feed["LANGUAGE"]) || $feed["LANGUAGE"] == "")) {
             $languages = $dom->getElementsByTagName('language');
             if ($languages && is_object($languages) && is_object($languages->item(0))) {
                 $lang = $languages->item(0)->textContent;
                 $db->execute($sql, array($lang, $feed['TIMESTAMP']));
             }
         } else {
             if (isset($feed["LANGUAGE"]) && $feed["LANGUAGE"] != "") {
                 $lang = $feed["LANGUAGE"];
             } else {
                 $lang = DEFAULT_LOCALE;
             }
         }
         crawlLog("...Language is {$lang}. Getting channel, finding nodes.");
         if ($is_html) {
             $sub_dom = $this->getTags($dom, $feed['CHANNEL_PATH']);
             if (!$sub_dom) {
                 crawlLog("... Scraper couldn't parse channel" . " path so bailing on this feed.");
                 continue;
             } else {
                 crawlLog("...Channel scraped.");
             }
             $nodes = $this->getTags($sub_dom[0], $feed['ITEM_PATH']);
             $rss_elements = array("title" => $feed['TITLE_PATH'], "description" => $feed['DESCRIPTION_PATH'], "link" => $feed['LINK_PATH']);
         } else {
             $nodes = $dom->getElementsByTagName('item');
             $rss_elements = array("title" => "title", "description" => "description", "link" => "link", "guid" => "guid", "pubDate" => "pubDate");
             if ($nodes->length == 0) {
                 // maybe we're dealing with atom rather than rss
                 $nodes = $dom->getElementsByTagName('entry');
                 $rss_elements = array("title" => "title", "description" => "summary", "link" => "link", "guid" => "id", "pubDate" => "updated");
             }
         }
         crawlLog("...done extracting info. Check for new news " . "items in {$feed['NAME']}.");
         $num_added = 0;
         $num_seen = 0;
         foreach ($nodes as $node) {
             $item = array();
             foreach ($rss_elements as $db_element => $feed_element) {
                 crawlTimeoutLog("..still adding feed items to index.");
                 if ($is_html) {
                     $tag_nodes = $this->getTags($node, $feed_element);
                     if (!isset($tag_nodes[0])) {
                         $tag_node = NULL;
                     } else {
                         $tag_node = $tag_nodes[0];
                     }
                     $element_text = is_object($tag_node) ? $tag_node->textContent : "";
                 } else {
                     $tag_node = $node->getElementsByTagName($feed_element)->item(0);
                     $element_text = is_object($tag_node) ? $tag_node->nodeValue : "";
                 }
                 if ($db_element == "link" && $tag_node && ($element_text == "" || $is_html)) {
                     if ($is_html) {
                         $element_text = $tag_node->documentElement->getAttribute("href");
                     } else {
                         $element_text = $tag_node->getAttribute("href");
                     }
                     $element_text = UrlParser::canonicalLink($element_text, $feed["SOURCE_URL"]);
                 }
                 $item[$db_element] = strip_tags($element_text);
             }
             $did_add = $this->addFeedItemIfNew($item, $feed['NAME'], $lang, $age);
             if ($did_add) {
                 $num_added++;
             }
             $num_seen++;
         }
         crawlLog("...added {$num_added} news items of {$num_seen} " . "on rss page.\n Done Processing {$feed['NAME']}.");
     }
 }
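The shape of one $feeds record implied by the reads above, with illustrative values; html-type feeds additionally carry the CHANNEL_PATH, ITEM_PATH, TITLE_PATH, DESCRIPTION_PATH, and LINK_PATH fields used earlier, and $source_model stands in for the object this method belongs to:
 // Illustrative only: one rss-type feed record as consumed above.
 $feeds = array(
     array(
         "NAME" => "Example News",
         "TYPE" => "rss",
         "SOURCE_URL" => "http://www.example.com/feed.rss",
         "LANGUAGE" => "en-US",
         "TIMESTAMP" => 1400000000,
     ),
 );
 $source_model->updateFeedItemsOneGo($feeds, ONE_WEEK);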
Example #12
 /**
  * Returns a url text pair where the url comes from the link of
  * the given item node and the text comes from the text data for that node.
  * URLs are canonicalized according to $site.
  *
  * @param object $item_node the DOMNode to get a link and text from
  * @param string $link_name name of link tag
  * @param string $text_name name of text tag to associate with link
  * @param string $site   a string containing a url
  * @param bool $atom if the feed is atom or rss
  *
  * @return array a url,text pair
  */
 static function linkAndTexts($item_node, $link_name, $text_name, $site, $atom = false)
 {
     foreach ($item_node->childNodes as $node) {
         if ($node->nodeName == $link_name) {
             if (!$atom) {
                 $url = UrlParser::canonicalLink($node->textContent, $site);
             } else {
                 $url = UrlParser::canonicalLink($node->getAttribute("href"), $site);
             }
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 return false;
             }
         }
         if ($node->nodeName == $text_name) {
             $text = $node->textContent;
             if ($text == "") {
                 $text = "RSS Feed";
                 if ($atom) {
                     $text = "Atom Feed";
                 }
             }
         }
     }
     if (!isset($url) || $url == "") {
         return false;
     }
     $text = mb_ereg_replace("(\\s)+", " ", $text);
     return array($url, $text);
 }
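A usage sketch; RssProcessor is an assumed name for the class holding linkAndTexts(), and the feed snippet is illustrative:
 // Sketch only: pull the url/text pair from the first <item> of an RSS feed.
 $rss = '<rss version="2.0"><channel><item><title>A Story</title>' .
     '<link>/story.html</link></item></channel></rss>';
 $dom = new DOMDocument();
 @$dom->loadXML($rss);
 $item = $dom->getElementsByTagName("item")->item(0);
 $pair = RssProcessor::linkAndTexts($item, "link", "title",
     "http://www.example.com/feed.rss");
 if ($pair !== false) {
     list($url, $text) = $pair;   // canonicalized link and its title text
 }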