Пример #1
0
 /**
  * Registers the URL filter rules for this crawler on top of the parent's
  * defaults.
  *
  * Add new URL filter rules inside this method, mainly by calling the
  * following methods:
  * followExternal()
  * allowDomain(), disallowDomain()
  * allow(), disallow(), disallowExt()
  */
 public function defaultFilter()
 {
     // Start from the rules installed by the parent class.
     parent::defaultFilter();
     /// --- custom filter BEGIN ---
     // NOTE(review): presumably keeps the crawl from following links to
     // external sites — confirm against followExternal().
     $this->followExternal(false);
     // NOTE(review): presumably rejects URLs containing ".php?q=" — confirm
     // how disallow() matches its argument.
     $this->disallow('.php?q=');
     /// --- custom filter END ---
 }
Пример #2
0
 /**
  * {@inheritDoc}
  *
  * Builds a summary for a BMP image: an empty title, a description derived
  * from the file name in the url, an empty link list, an HTML page that
  * embeds the original image as a data: URI, and a thumbnail data: URI.
  *
  * @param string $page  the image represented as a character string
  * @param string $url  the url where the image was downloaded from
  * @return array|null summary information including a thumbnail and a
  *     description (where the description is just the url); null when
  *     $page is not a string
  */
 function process($page, $url)
 {
     // Defined up front so a non-string $page returns null instead of
     // reading an undefined variable.
     $summary = null;
     if (is_string($page)) {
         $image = $this->imagecreatefrombmp($page);
         $thumb_string = self::createThumb($image);
         $summary = array();
         $summary[self::TITLE] = "";
         $summary[self::DESCRIPTION] = "Image of " . UrlParser::getDocumentFilename($url);
         $summary[self::LINKS] = array();
         // The description contains part of the downloaded url, so it is
         // escaped before being placed inside the single-quoted alt
         // attribute; the stored DESCRIPTION itself is left unescaped.
         $alt_text = htmlspecialchars($summary[self::DESCRIPTION], ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
         $summary[self::PAGE] = "<html><body><div><img src='data:image/bmp;base64," . base64_encode($page) . "' alt='" . $alt_text . "' /></div></body></html>";
         $summary[self::THUMB] = 'data:image/jpeg;base64,' . base64_encode($thumb_string);
     }
     return $summary;
 }
Пример #3
0
 /**
  * Used to recompute both the index shards and the dictionary
  * of an index archive. The first step involves re-extracting the
  * words into an inverted index from the summaries' web_archives.
  * Then a reindex is done.
  *
  * For every generation from 0 up to and including the ACTIVE generation
  * recorded in generation.txt, any existing shard file is deleted, the
  * summaries partition for that generation is read in batches, and each
  * summary is added to a freshly created IndexShard which is then saved.
  * Finally reindexIndexArchive() is called on the archive.
  *
  * @param string $archive_path file path to an IndexArchiveBundle
  */
 function rebuildIndexArchive($archive_path)
 {
     $archive_type = $this->getArchiveKind($archive_path);
     if ($archive_type != "IndexArchiveBundle") {
         $this->badFormatMessageAndExit($archive_path);
     }
     $info = $archive_type::getArchiveInfo($archive_path);
     // NOTE(review): unserialize() is applied to data read from the archive
     // (here and for generation.txt below); confirm archives are trusted.
     $tmp = unserialize($info["DESCRIPTION"]);
     $video_sources = $tmp[self::VIDEO_SOURCES];
     $generation_info = unserialize(file_get_contents("{$archive_path}/generation.txt"));
     // generations are numbered from 0, so ACTIVE + 1 of them are processed
     $num_generations = $generation_info['ACTIVE'] + 1;
     $archive = new WebArchiveBundle($archive_path . "/summaries");
     // $seen counts summaries processed across all generations (used only
     // for the progress message)
     $seen = 0;
     $generation = 0;
     // NOTE(review): $keypad is the empty string, so the padding step on
     // $score_keys further down appends nothing — confirm this is intended.
     $keypad = "";
     while ($generation < $num_generations) {
         $partition = $archive->getPartition($generation, false);
         $shard_name = $archive_path . "/posting_doc_shards/index{$generation}";
         crawlLog("Processing partition {$generation}");
         // the shard is rebuilt from scratch, so remove any previous file
         if (file_exists($shard_name)) {
             crawlLog("..Unlinking old shard {$generation}");
             @unlink($shard_name);
         }
         $shard = new IndexShard($shard_name, $generation, NUM_DOCS_PER_GENERATION, true);
         // number of objects of this partition requested so far
         $seen_partition = 0;
         while ($seen_partition < $partition->count) {
             // read the partition in batches of at most 8000 objects
             $num_to_get = min($partition->count - $seen_partition, 8000);
             // offset recorded for the first object of the batch is the
             // iterator position before the read
             $offset = $partition->iterator_pos;
             $objects = $partition->nextObjects($num_to_get);
             $cnt = 0;
             foreach ($objects as $object) {
                 $cnt++;
                 // $object[1] holds the summary; $object[0] is used below as
                 // the offset for the following object
                 $site = $object[1];
                 if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
                     // link summaries: the document keys, url and link
                     // text are read from the HTTP_CODE, TITLE and
                     // DESCRIPTION fields respectively
                     $is_link = true;
                     $doc_keys = $site[self::HTTP_CODE];
                     $site_url = $site[self::TITLE];
                     $host = UrlParser::getHost($site_url);
                     $link_parts = explode('|', $site[self::HASH]);
                     // the sixth '|'-separated part of HASH, when present,
                     // is used as the link's origin; otherwise the url is
                     if (isset($link_parts[5])) {
                         $link_origin = $link_parts[5];
                     } else {
                         $link_origin = $site_url;
                     }
                     $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
                     $link_to = "LINK TO:";
                 } else {
                     // document summaries: keys are built from a hash of the
                     // url, the stored HASH, and a hash of the host
                     $is_link = false;
                     $site_url = str_replace('|', "%7C", $site[self::URL]);
                     $host = UrlParser::getHost($site_url);
                     $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
                     $meta_ids = PhraseParser::calculateMetas($site, $video_sources);
                     $link_to = "";
                 }
                 $so_far_cnt = $seen_partition + $cnt;
                 $time_out_message = "..still processing {$so_far_cnt} " . "of {$partition->count} in partition {$generation}." . "\n..Last processed was: " . ($seen + 1) . ". {$link_to}{$site_url}. ";
                 crawlTimeoutLog($time_out_message);
                 $seen++;
                 $word_lists = array();
                 /*
                    self::JUST_METAS check to avoid getting sitemaps in
                    results for popular words
                 */
                 $lang = NULL;
                 if (!isset($site[self::JUST_METAS])) {
                     $host_words = UrlParser::getWordsIfHostUrl($site_url);
                     $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
                     // links are indexed by their description only; documents
                     // by host words, title, last path part and description
                     if ($is_link) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                     if (isset($site[self::LANG])) {
                         $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                     }
                     $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
                     $len = strlen($phrase_string);
                     // a score below 0.012 is tagged safe:true, else safe:false
                     // NOTE(review): $safe is assigned but never read in this
                     // method.
                     if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                         $meta_ids[] = "safe:true";
                         $safe = true;
                     } else {
                         $meta_ids[] = "safe:false";
                         $safe = false;
                     }
                 }
                 // append the packed user rank scores, if any, to the keys
                 if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
                     $score_keys = "";
                     foreach ($site[self::USER_RANKS] as $label => $score) {
                         $score_keys .= packInt($score);
                     }
                     // intended to pad to a multiple of 8 bytes; see the note
                     // on $keypad above
                     if (strlen($score_keys) % 8 != 0) {
                         $score_keys .= $keypad;
                     }
                     $doc_keys .= $score_keys;
                 }
                 $shard->addDocumentWords($doc_keys, $offset, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, false);
                 // presumably the archive offset of the next object — TODO
                 // confirm against nextObjects()
                 $offset = $object[0];
             }
             // advance by the number requested, not the number returned
             $seen_partition += $num_to_get;
         }
         $shard->save(false, true);
         $generation++;
     }
     $this->reindexIndexArchive($archive_path);
 }
Пример #4
0
 /**
  * Verifies that UrlParser::getScheme() returns the expected scheme for a
  * few absolute urls and for a relative url
  */
 function getSchemeTestCase()
 {
     // each case: url to parse, expected scheme, assertion label
     $cases = array(
         array("http://www.example.com/", "http", "Simple HTTP 1"),
         array("https://www.example.com/", "https", "Simple HTTPS 1"),
         array("gopher://www.example.com/", "gopher", "Simple GOPHER 1"),
         array("./", "http", "Simple HTTP 2")
     );
     foreach ($cases as $case) {
         $url = $case[0];
         $expected_scheme = $case[1];
         $label = $case[2];
         $this->assertEqual(UrlParser::getScheme($url), $expected_scheme, $label);
     }
 }
Пример #5
0
    /**
     * Responsible for handling admin request related to the configure activity
     *
     * The configure activity allows a user to set the work directory for
     * storing data local to this SeekQuarry/Yioop instance. It also allows one
     * to set the default language of the installation, dbms info, robot info,
     * test info, as well as which machine acts as the queue server.
     *
     * The request field 'arg' selects the sub activity: "directory" sets up
     * or switches the work directory, "profile" saves submitted profile
     * fields and uploaded resources, "reset" restores appearance defaults,
     * and anything else just loads the existing profile if there is one.
     *
     * @return array $data fields for available language, dbms, etc as well as
     *     results of processing sub activity if any
     */
    function configure()
    {
        $parent = $this->parent;
        $profile_model = $parent->model("profile");
        $group_model = $parent->model("group");
        $data = array();
        $profile = array();
        $data['SYSTEM_CHECK'] = $this->systemCheck();
        $languages = $parent->model("locale")->getLocaleList();
        foreach ($languages as $language) {
            $data['LANGUAGES'][$language['LOCALE_TAG']] = $language['LOCALE_NAME'];
        }
        if (isset($_REQUEST['lang']) && $_REQUEST['lang']) {
            $data['lang'] = $parent->clean($_REQUEST['lang'], "string");
            $profile['DEFAULT_LOCALE'] = $data['lang'];
            setLocaleObject($data['lang']);
        }
        $data["ELEMENT"] = "configure";
        $data['SCRIPT'] = "";
        $data['PROFILE'] = false;
        if (isset($_REQUEST['WORK_DIRECTORY']) || defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER) {
            // When FIX_NAME_SERVER is set and no directory was submitted,
            // behave as if the current work directory had been submitted
            // and drop its profile.php so that it gets regenerated.
            if (defined('WORK_DIRECTORY') && defined('FIX_NAME_SERVER') && FIX_NAME_SERVER && !isset($_REQUEST['WORK_DIRECTORY'])) {
                $_REQUEST['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $_REQUEST['arg'] = "directory";
                @unlink($_REQUEST['WORK_DIRECTORY'] . "/profile.php");
            }
            $dir = $parent->clean($_REQUEST['WORK_DIRECTORY'], "string");
            // Only absolute paths are accepted. The length checks keep an
            // empty or one character value from triggering string offset
            // warnings; such values are simply not absolute.
            if (strstr(PHP_OS, "WIN")) {
                //convert to forward slashes so consistent with rest of code
                $dir = str_replace("\\", "/", $dir);
                $is_absolute = strlen($dir) > 0 && ($dir[0] == "/" || (strlen($dir) > 1 && $dir[1] == ":"));
            } else {
                $is_absolute = strlen($dir) > 0 && $dir[0] == "/";
            }
            $data['PROFILE'] = $is_absolute;
            if ($data['PROFILE'] == false) {
                $data["MESSAGE"] = tl('system_component_configure_use_absolute_path');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            // the work directory may not live inside the code directory
            if (strstr($dir . "/", BASE_DIR . "/")) {
                $data['PROFILE'] = false;
                $data["MESSAGE"] = tl('system_component_configure_configure_diff_base_dir');
                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                $data['WORK_DIRECTORY'] = $dir;
                return $data;
            }
            $data['WORK_DIRECTORY'] = $dir;
        } else {
            // NOTE(review): WORK_DIRECTORY . "../" has no separator before
            // "..", while other code appends "/profile.php" to the same
            // constant — confirm whether "/../" was intended.
            if (defined("WORK_DIRECTORY") && strlen(WORK_DIRECTORY) > 0 && strcmp(realpath(WORK_DIRECTORY), realpath(BASE_DIR)) != 0 && (is_dir(WORK_DIRECTORY) || is_dir(WORK_DIRECTORY . "../"))) {
                $data['WORK_DIRECTORY'] = WORK_DIRECTORY;
                $data['PROFILE'] = true;
            }
        }
        $arg = "";
        if (isset($_REQUEST['arg'])) {
            $arg = $_REQUEST['arg'];
        }
        switch ($arg) {
            case "directory":
                if (!isset($data['WORK_DIRECTORY'])) {
                    break;
                }
                if ($data['PROFILE'] && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    // an existing profile is adopted as is
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                    $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                    $data["MESSAGE"] = tl('system_component_configure_work_dir_set');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');setTimeout(" . "'window.location.href=window.location.href', 3000);";
                } else {
                    if ($data['PROFILE'] && strlen($data['WORK_DIRECTORY']) > 0) {
                        if ($profile_model->makeWorkDirectory($data['WORK_DIRECTORY'])) {
                            // seed a brand new profile with default values
                            $profile['DBMS'] = 'sqlite3';
                            $data['DBMS'] = 'sqlite3';
                            $profile['DB_NAME'] = 'default';
                            $data['DB_NAME'] = 'default';
                            $profile['USER_AGENT_SHORT'] = tl('system_component_name_your_bot');
                            $data['USER_AGENT_SHORT'] = $profile['USER_AGENT_SHORT'];
                            $uri = UrlParser::getPath($_SERVER['REQUEST_URI']);
                            $http = isset($_SERVER['HTTPS']) ? "https://" : "http://";
                            $profile['NAME_SERVER'] = $http . $_SERVER['SERVER_NAME'] . $uri;
                            $data['NAME_SERVER'] = $profile['NAME_SERVER'];
                            $profile['AUTH_KEY'] = crawlHash($data['WORK_DIRECTORY'] . time());
                            $data['AUTH_KEY'] = $profile['AUTH_KEY'];
                            $profile['FIAT_SHAMIR_MODULUS'] = generateFiatShamirModulus();
                            $robot_instance = str_replace(".", "_", $_SERVER['SERVER_NAME']) . "-" . time();
                            $profile['ROBOT_INSTANCE'] = $robot_instance;
                            $data['ROBOT_INSTANCE'] = $profile['ROBOT_INSTANCE'];
                            if ($profile_model->updateProfile($data['WORK_DIRECTORY'], array(), $profile)) {
                                if (defined('WORK_DIRECTORY') && $data['WORK_DIRECTORY'] == WORK_DIRECTORY || $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY'])) {
                                    $data["MESSAGE"] = tl('system_component_configure_work_profile_made');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                                    $data['PROFILE'] = true;
                                } else {
                                    $data['PROFILE'] = false;
                                    $data["MESSAGE"] = tl('system_component_configure_no_set_config');
                                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href= " . "window.location.href', 3000);";
                                }
                            } else {
                                $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                                $data['PROFILE'] = false;
                                $data["MESSAGE"] = tl('system_component_configure_no_create_profile');
                                $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>'); setTimeout('window.location.href=" . "window.location.href', 3000);";
                            }
                        } else {
                            $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                            $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                            $data['PROFILE'] = false;
                        }
                    } else {
                        $profile_model->setWorkDirectoryConfigFile($data['WORK_DIRECTORY']);
                        $data["MESSAGE"] = tl('system_component_configure_work_dir_invalid');
                        $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');" . "setTimeout('window.location.href=" . "window.location.href', 3000);";
                        $data['PROFILE'] = false;
                    }
                }
                break;
            case "profile":
                $parent->updateProfileFields($data, $profile, array('WEB_ACCESS', 'RSS_ACCESS', 'API_ACCESS', 'LANDING_PAGE'));
                // DEBUG_LEVEL is the bitwise OR of the submitted debug flags
                $data['DEBUG_LEVEL'] = 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["ERROR_INFO"]) ? ERROR_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["QUERY_INFO"]) ? QUERY_INFO : 0;
                $data['DEBUG_LEVEL'] |= isset($_REQUEST["TEST_INFO"]) ? TEST_INFO : 0;
                $profile['DEBUG_LEVEL'] = $data['DEBUG_LEVEL'];
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                $folder = APP_DIR . "/resources";
                if (!file_exists(APP_DIR) && !mkdir(APP_DIR) || !file_exists($folder) && !mkdir($folder)) {
                    $data["MESSAGE"] = tl('system_component_no_resource_folder');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                    return $data;
                }
                // validate each uploaded resource: images for every field
                // except SEARCHBAR_PATH, which must be text/xml
                foreach (array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH') as $field) {
                    if (isset($_FILES[$field]['name']) && $_FILES[$field]['name'] != "") {
                        if (!in_array($_FILES[$field]['type'], array('image/png', 'image/gif', 'image/jpeg', 'image/x-icon')) && $field != 'SEARCHBAR_PATH' || $_FILES[$field]['type'] != 'text/xml' && $field == 'SEARCHBAR_PATH') {
                            $data["MESSAGE"] = tl('system_component_invalid_filetype');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        if ($_FILES[$field]['size'] > THUMB_SIZE) {
                            $data["MESSAGE"] = tl('system_component_file_too_big');
                            $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>')";
                            return $data;
                        }
                        $profile[$field] = array();
                        $profile[$field]['name'] = $_FILES[$field]['name'];
                        $profile[$field]['tmp_name'] = $_FILES[$field]['tmp_name'];
                        $data[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $profile[$field]['name'];
                    }
                }
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile)) {
                    $data['MESSAGE'] = tl('system_component_configure_profile_change');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                    // a changed debug level triggers a reload of the page
                    if ($old_profile['DEBUG_LEVEL'] != $profile['DEBUG_LEVEL']) {
                        $data['SCRIPT'] .= "setTimeout('window.location.href=\"" . "?c=admin&amp;a=configure&amp;" . CSRF_TOKEN . "=" . $_REQUEST[CSRF_TOKEN] . "\"', 3*sec);";
                    }
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                    break;
                }
                break;
            case "reset":
                $base_url = NAME_SERVER;
                if (defined("BASE_URL")) {
                    $base_url = BASE_URL;
                }
                $profile = array('LANDING_PAGE' => false, 'BACKGROUND_COLOR' => "#FFF", 'BACKGROUND_IMAGE' => "", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AUXILIARY_CSS' => "");
                $old_profile = $profile_model->getProfile($data['WORK_DIRECTORY']);
                foreach ($old_profile as $key => $value) {
                    $data[$key] = $value;
                }
                // the old background image is blanked for the update and
                // restored afterwards so its file can be deleted below
                $tmp_image = $old_profile['BACKGROUND_IMAGE'];
                $old_profile['BACKGROUND_IMAGE'] = "";
                if ($profile_model->updateProfile($data['WORK_DIRECTORY'], $profile, $old_profile, true)) {
                    $old_profile['BACKGROUND_IMAGE'] = $tmp_image;
                    foreach ($profile as $key => $value) {
                        $data[$key] = $value;
                        if (in_array($key, array('BACKGROUND_IMAGE', 'LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH')) && $old_profile[$key] != "") {
                            $resource_name = APP_DIR . "/resources/" . $old_profile[$key];
                            if (file_exists($resource_name)) {
                                unlink($resource_name);
                            }
                        }
                    }
                    $data['MESSAGE'] = tl('system_component_configure_reset_completed');
                    $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . $data['MESSAGE'] . "</h1>');";
                } else {
                    $data['PROFILE'] = false;
                    $data["MESSAGE"] = tl('system_component_configure_no_change_profile');
                    $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . $data["MESSAGE"] . "</h1>');";
                    break;
                }
                break;
            default:
                if (isset($data['WORK_DIRECTORY']) && file_exists($data['WORK_DIRECTORY'] . "/profile.php")) {
                    $data = array_merge($data, $profile_model->getProfile($data['WORK_DIRECTORY']));
                } else {
                    $data['WORK_DIRECTORY'] = "";
                    $data['PROFILE'] = false;
                }
        }
        $data['advanced'] = "false";
        if ($data['PROFILE']) {
            $locale_tag = getLocaleTag();
            // BASE_URL is not guaranteed to be defined (the reset case
            // above also guards it), so fall back to NAME_SERVER and then
            // to an empty prefix rather than using an undefined constant.
            $favicon_base = "";
            if (defined("BASE_URL")) {
                $favicon_base = BASE_URL;
            } else if (defined("NAME_SERVER")) {
                $favicon_base = NAME_SERVER;
            }
            $not_null_fields = array('LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $favicon_base . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN");
            foreach ($not_null_fields as $field => $default) {
                // empty() also covers fields missing from $data altogether
                if (empty($data[$field])) {
                    $data[$field] = $default;
                }
            }
            if (isset($_REQUEST['ROBOT_DESCRIPTION'])) {
                $robot_description = substr($parent->clean($_REQUEST['ROBOT_DESCRIPTION'], "string"), 0, MAX_GROUP_PAGE_LEN);
                $group_model->setPageName(ROOT_ID, PUBLIC_GROUP_ID, "bot", $robot_description, $locale_tag, "", "", "", "");
            }
            $robot_info = $group_model->getPageInfoByName(PUBLIC_GROUP_ID, "bot", $locale_tag, "edit");
            $data['ROBOT_DESCRIPTION'] = isset($robot_info["PAGE"]) ? $robot_info["PAGE"] : tl('system_component_describe_robot');
            if (isset($_REQUEST['advanced']) && $_REQUEST['advanced'] == 'true') {
                $data['advanced'] = "true";
            }
            $data['SCRIPT'] .= <<<EOD
    setDisplay('advance-configure', {$data['advanced']});
    setDisplay('advance-robot', {$data['advanced']});
    function toggleAdvance() {
        var advanced = elt('a-settings');
        advanced.value = (advanced.value =='true')
            ? 'false' : 'true';
        var value = (advanced.value == 'true') ? true : false;
        setDisplay('advance-configure', value);
        setDisplay('advance-robot', value);
    }
EOD;
        }
        $data['SCRIPT'] .= "\nelt('locale').onchange = " . "function () { elt('configureProfileForm').submit();};\n";
        return $data;
    }
Пример #6
0
 /**
  * Collects links from the supplied dom object, canonicalized against
  * $site (or against the document's <base href> when one is present).
  *
  * Anchors not marked rel="nofollow" and frame/iframe sources share one
  * budget of MAX_LINKS_TO_EXTRACT accepted urls; images carrying non-empty
  * alt text are then collected under a separate budget of the same size.
  * The array maps each url to its accumulated link text.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object
  */
 static function links($dom, $site)
 {
     $found = array();
     $finder = new DOMXPath($dom);
     // a <base href="..."> overrides the url that links are resolved against
     $base_node = $finder->evaluate("/html//base")->item(0);
     if ($base_node) {
         $base_href = $base_node->getAttribute('href');
         if (strlen($base_href) > 0) {
             $site = UrlParser::canonicalLink($base_href, $site);
         }
     }
     $accepted = 0;
     foreach ($finder->evaluate("/html/body//a") as $anchor) {
         if ($accepted >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $rel = $anchor->getAttribute("rel");
         if ($rel != "" && stristr($rel, "nofollow")) {
             continue;
         }
         $url = UrlParser::canonicalLink($anchor->getAttribute('href'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         // collapse whitespace runs in the anchor text to single spaces
         $anchor_text = preg_replace("/\\s+/", " ", strip_tags($anchor->nodeValue));
         // text for a url seen before is appended after a " .. " separator
         $combined = isset($found[$url]) ? $found[$url] . " .. " . $anchor_text : $anchor_text;
         $found[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $accepted++;
     }
     // frames continue to use the count accumulated by the anchors
     foreach ($finder->evaluate("/html/frameset/frame|/html/body//iframe") as $frame) {
         if ($accepted >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $url = UrlParser::canonicalLink($frame->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $found[$url] = isset($found[$url]) ? $found[$url] . " .. HTMLframe" : "HTMLframe";
         $accepted++;
     }
     $images = $finder->evaluate("/html/body//img[@alt]");
     // the counter starts over for images
     $accepted = 0;
     foreach ($images as $image) {
         if ($accepted >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         $alt = $image->getAttribute('alt');
         if (strlen($alt) < 1) {
             continue;
         }
         $url = UrlParser::canonicalLink($image->getAttribute('src'), $site);
         $len = strlen($url);
         if (UrlParser::checkRecursiveUrl($url) || $len >= MAX_URL_LEN || $len <= 4) {
             continue;
         }
         $combined = isset($found[$url]) ? $found[$url] . " .. " . $alt : $alt;
         $found[$url] = mb_substr($combined, 0, 2 * MAX_LINKS_WORD_TEXT);
         $accepted++;
     }
     return $found;
 }
Пример #7
0
    /**
     * Outputs a profile.php file in the given directory containing profile
     * data based on new and old data sources
     *
     * This function creates a profile.php file if it doesn't exist. A given
     * field is output in the profile
     * according to the precedence that a new value is preferred to an old
     * value, which is preferred to the value that comes from a currently
     * defined constant. It might be the case that a new value for a given
     * field doesn't exist, etc.
     *
     * When $new_profile_data contains AUXILIARY_CSS and the profile file was
     * written, that value is also written to APP_DIR/css/auxiliary.css.
     *
     * @param string $directory the work directory to output the profile.php
     *     file
     * @param array $new_profile_data fields and values containing at least
     *     some profile information (only $this->profile_fields
     *     fields of $new_profile_data will be considered).
     * @param array $old_profile_data fields and values that come from
     *     presumably a previously existing profile
     * @param bool $reset whether the new profile data is coming from a reset
     *     to factory settings or not
     * @return bool true if the profile file could be written, false otherwise
     */
    function updateProfile($directory, $new_profile_data, $old_profile_data, $reset = false)
    {
        $n = array();
        $n[] = <<<EOT
<?php
/**
 * SeekQuarry/Yioop --
 * Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 * Copyright (C) 2009-2012  Chris Pollett chris@pollett.org
 *
 * LICENSE:
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * END LICENSE
 *
 * Computer generated file giving the key defines of directory locations
 * as well as database settings used to run the SeekQuarry/Yioop search engine
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage config
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009-2012
 * @filesource
 */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
EOT;
        $base_url = NAME_SERVER;
        if (defined("BASE_URL")) {
            $base_url = BASE_URL;
        }
        //make sure certain fields are not null
        $not_null_fields = array('BACKGROUND_COLOR' => "#FFF", 'FOREGROUND_COLOR' => "#FFF", 'SIDEBAR_COLOR' => "#8A4", 'TOPBAR_COLOR' => "#EEF", 'LOGO' => "resources/yioop.png", 'M_LOGO' => "resources/m-yioop.png", 'FAVICON' => $base_url . "favicon.ico", 'TIMEZONE' => 'America/Los_Angeles', 'SESSION_NAME' => "yioopbiscuit", 'CSRF_TOKEN' => "YIOOP_TOKEN", 'AD_LOCATION' => 'none');
        // fields whose new value may be an uploaded file (name + tmp_name)
        $upload_fields = array('LOGO', 'M_LOGO', 'FAVICON', 'SEARCHBAR_PATH', 'BACKGROUND_IMAGE');
        $file_fields = $this->file_fields;
        $profile = array();
        //now integrate the different profiles
        foreach ($this->profile_fields as $field) {
            // stays true unless a usable new value is found for the field
            $use_fallback = true;
            if (isset($new_profile_data[$field])) {
                if (!$reset && in_array($field, $upload_fields)) {
                    if (isset($new_profile_data[$field]['name']) && isset($new_profile_data[$field]['tmp_name'])) {
                        // The file name comes from the upload request; drop
                        // directory separators and NUL bytes so the file
                        // cannot be moved outside of the resources folder.
                        $upload_name = str_replace(array("/", "\\", "\0"), "", $new_profile_data[$field]['name']);
                        move_uploaded_file($new_profile_data[$field]['tmp_name'], APP_DIR . "/resources/" . $upload_name);
                        $profile[$field] = "./?c=resource&amp;a=get&amp;" . "f=resources&amp;n=" . $upload_name;
                        $use_fallback = false;
                    }
                } else {
                    $profile[$field] = $new_profile_data[$field];
                    $use_fallback = false;
                }
            }
            if ($use_fallback) {
                // old value, then currently defined constant, then ""
                if (isset($old_profile_data[$field])) {
                    $profile[$field] = $old_profile_data[$field];
                } else if (defined($field)) {
                    $profile[$field] = constant($field);
                } else {
                    $profile[$field] = "";
                }
            }
            if (!$profile[$field] && isset($not_null_fields[$field])) {
                $profile[$field] = $not_null_fields[$field];
            }
            if ($field == "NEWS_MODE" && $profile[$field] == "") {
                $profile[$field] = "news_off";
            }
            // WEB_URI is always recomputed from the current request (or
            // from NAME_SERVER when there is no request uri)
            if ($field == "WEB_URI") {
                if (isset($_SERVER['REQUEST_URI'])) {
                    $profile[$field] = UrlParser::getPath($_SERVER['REQUEST_URI']);
                } else {
                    $profile[$field] = UrlParser::getPath(NAME_SERVER);
                }
            }
            // fields listed in $this->file_fields get no define() line
            if (in_array($field, $file_fields)) {
                continue;
            }
            // every field except DEBUG_LEVEL is written as a double-quoted
            // string literal
            // NOTE(review): the value is not escaped for a PHP double-quoted
            // string here — confirm callers sanitize quotes, backslashes
            // and dollar signs before they reach this method.
            if ($field != "DEBUG_LEVEL") {
                $profile[$field] = "\"{$profile[$field]}\"";
            }
            $n[] = "define('{$field}', {$profile[$field]});";
        }
        $out = implode("\n", $n);
        if (file_put_contents($directory . PROFILE_FILE_NAME, $out) !== false) {
            // the previous error handler is active while the permission and
            // css steps run; yioop_error_handler is installed again after
            restore_error_handler();
            @chmod($directory . PROFILE_FILE_NAME, 0777);
            if (isset($new_profile_data['AUXILIARY_CSS'])) {
                if (!file_exists(APP_DIR . "/css")) {
                    @mkdir(APP_DIR . "/css");
                    @chmod(APP_DIR . "/css", 0777);
                }
                $css_file = APP_DIR . "/css/auxiliary.css";
                file_put_contents($css_file, $new_profile_data['AUXILIARY_CSS']);
                @chmod($css_file, 0777);
            }
            set_error_handler("yioop_error_handler");
            return true;
        }
        return false;
    }
Пример #8
0
 /**
  * Handles admin requests related to the search sources activity.
  *
  * The search sources activity allows a user to add/delete/edit media
  * sources (video, rss feed, html feed) and to add/delete/edit the
  * subsearches that appear on the SearchView page.
  *
  * Request fields read: 'type', 'per_page', 'language', 'arg', 'ts', 'fn'
  * as well as the individual source/subsearch form fields. The sub-activity
  * to carry out is chosen by $_REQUEST['arg'], which must be one of
  * addsource, deletesource, addsubsearch, deletesubsearch, editsource,
  * editsubsearch; any other value just produces the listing data.
  *
  * @return array $data info about current search sources, and current
  *     sub-searches
  */
 function searchSources()
 {
     $parent = $this->parent;
     $crawl_model = $parent->model("crawl");
     $source_model = $parent->model("source");
     $possible_arguments = array("addsource", "deletesource", "addsubsearch", "deletesubsearch", "editsource", "editsubsearch");
     $data = array();
     $data["ELEMENT"] = "searchsources";
     $data['SCRIPT'] = "";
     $data['SOURCE_TYPES'] = array(-1 => tl('crawl_component_media_kind'), "video" => tl('crawl_component_video'), "rss" => tl('crawl_component_rss_feed'), "html" => tl('crawl_component_html_feed'));
     // remember whether the request carried a recognized source type
     $source_type_flag = false;
     if (isset($_REQUEST['type']) && in_array($_REQUEST['type'], array_keys($data['SOURCE_TYPES']))) {
         $data['SOURCE_TYPE'] = $_REQUEST['type'];
         $source_type_flag = true;
     } else {
         $data['SOURCE_TYPE'] = -1;
     }
     // build the drop-down of indexes ("i:" keys) and crawl mixes ("m:" keys)
     $machine_urls = $parent->model("machine")->getQueueServerUrls();
     $search_lists = $crawl_model->getCrawlList(false, true, $machine_urls);
     $data["SEARCH_LISTS"] = array(-1 => tl('crawl_component_sources_indexes'));
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["i:" . $item["CRAWL_TIME"]] = $item["DESCRIPTION"];
     }
     if (isset($_SESSION['USER_ID'])) {
         $user = $_SESSION['USER_ID'];
     } else {
         $user = $_SERVER['REMOTE_ADDR'];
     }
     $search_lists = $crawl_model->getMixList($user);
     foreach ($search_lists as $item) {
         $data["SEARCH_LISTS"]["m:" . $item["TIMESTAMP"]] = $item["NAME"];
     }
     // results-per-page choices are multiples of the configured default
     $n = NUM_RESULTS_PER_PAGE;
     $data['PER_PAGE'] = array($n => $n, 2 * $n => 2 * $n, 5 * $n => 5 * $n, 10 * $n => 10 * $n);
     if (isset($_REQUEST['per_page']) && in_array($_REQUEST['per_page'], array_keys($data['PER_PAGE']))) {
         $data['PER_PAGE_SELECTED'] = $_REQUEST['per_page'];
     } else {
         $data['PER_PAGE_SELECTED'] = NUM_RESULTS_PER_PAGE;
     }
     $locales = $parent->model("locale")->getLocaleList();
     $data["LANGUAGES"] = array();
     foreach ($locales as $locale) {
         $data["LANGUAGES"][$locale['LOCALE_TAG']] = $locale['LOCALE_NAME'];
     }
     if (isset($_REQUEST['language']) && in_array($_REQUEST['language'], array_keys($data["LANGUAGES"]))) {
         $data['SOURCE_LOCALE_TAG'] = $_REQUEST['language'];
     } else {
         $data['SOURCE_LOCALE_TAG'] = DEFAULT_LOCALE;
     }
     // blank form defaults; the edit* cases below overwrite these
     $data["CURRENT_SOURCE"] = array("name" => "", "type" => $data['SOURCE_TYPE'], "source_url" => "", "aux_info" => "", 'channel_path' => "", 'item_path' => "", 'title_path' => "", 'description_path' => "", 'link_path' => "", "language" => $data['SOURCE_LOCALE_TAG']);
     $data["CURRENT_SUBSEARCH"] = array("locale_string" => "", "folder_name" => "", "index_identifier" => "", "per_page" => $data['PER_PAGE_SELECTED']);
     $data['SOURCE_FORM_TYPE'] = "addsource";
     $data["SEARCH_FORM_TYPE"] = "addsubsearch";
     if (isset($_REQUEST['arg']) && in_array($_REQUEST['arg'], $possible_arguments)) {
         switch ($_REQUEST['arg']) {
             case "addsource":
                 if (!$source_type_flag) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_source_type') . "</h1>');";
                     break;
                 }
                 $must_have = array("name", "type", 'source_url');
                 $is_html_feed = false;
                 if (isset($_REQUEST['type']) && $_REQUEST['type'] == 'html') {
                     // html feeds additionally need the five path fields
                     $is_html_feed = true;
                     $must_have = array_merge($must_have, array('channel_path', 'item_path', 'title_path', 'description_path', 'link_path'));
                 }
                 $to_clean = array_merge($must_have, array('aux_info', 'language'));
                 $r = array();
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     if ($clean_me == "source_url") {
                         $r[$clean_me] = UrlParser::canonicalLink($r[$clean_me], NAME_SERVER);
                         // (a stray debug echo of the url used to live here;
                         // it leaked text into the response and was removed)
                         if (!$r[$clean_me]) {
                             $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_invalid_url') . "</h1>');";
                             break 2;
                         }
                     }
                     if (in_array($clean_me, $must_have) && $r[$clean_me] == "") {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 if ($is_html_feed) {
                     // the five paths are stored packed into aux_info
                     $r['aux_info'] = $r['channel_path'] . "###" . $r['item_path'] . "###" . $r['title_path'] . "###" . $r['description_path'] . "###" . $r['link_path'];
                 }
                 $source_model->addMediaSource($r['name'], $r['type'], $r['source_url'], $r['aux_info'], $r['language']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_added') . "</h1>');";
                 break;
             case "addsubsearch":
                 $to_clean = array("folder_name", 'index_identifier');
                 $must_have = $to_clean;
                 $r = array();
                 foreach ($to_clean as $clean_me) {
                     $r[$clean_me] = isset($_REQUEST[$clean_me]) ? trim($parent->clean($_REQUEST[$clean_me], "string")) : "";
                     // -1 is the placeholder entry of the drop-downs
                     if (in_array($clean_me, $must_have) && ($r[$clean_me] == "" || $r[$clean_me] == -1)) {
                         $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_missing_fields') . "</h1>');";
                         break 2;
                     }
                 }
                 $source_model->addSubsearch($r['folder_name'], $r['index_identifier'], $data['PER_PAGE_SELECTED']);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_added') . "</h1>');";
                 break;
             case "deletesource":
                 if (!isset($_REQUEST['ts'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $timestamp = $parent->clean($_REQUEST['ts'], "string");
                 $source_model->deleteMediaSource($timestamp);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_deleted') . "</h1>');";
                 break;
             case "deletesubsearch":
                 if (!isset($_REQUEST['fn'])) {
                     $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_no_delete_source') . "</h1>');";
                     break;
                 }
                 $folder_name = $parent->clean($_REQUEST['fn'], "string");
                 $source_model->deleteSubsearch($folder_name);
                 $data['SCRIPT'] .= "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_deleted') . "</h1>');";
                 break;
             case "editsubsearch":
                 $data['SEARCH_FORM_TYPE'] = "editsubsearch";
                 $subsearch = false;
                 $folder_name = isset($_REQUEST['fn']) ? $parent->clean($_REQUEST['fn'], "string") : "";
                 if ($folder_name) {
                     $subsearch = $source_model->getSubsearch($folder_name);
                 }
                 if (!$subsearch) {
                     // nothing to edit: fall back to the add-subsearch form.
                     // (Previously this wrote SOURCE_FORM_TYPE, which broke
                     // the source form and left this form in edit mode.)
                     $data['SEARCH_FORM_TYPE'] = "addsubsearch";
                     break;
                 }
                 $data['fn'] = $folder_name;
                 $update = false;
                 foreach ($data['CURRENT_SUBSEARCH'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         // value supplied in request: stage it for saving
                         $subsearch[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         $update = true;
                     } else {
                         if (isset($subsearch[$upper_field])) {
                             $data['CURRENT_SUBSEARCH'][$field] = $subsearch[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     $source_model->updateSubsearch($subsearch);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_subsearch_updated') . "</h1>');";
                 }
                 break;
             case "editsource":
                 $data['SOURCE_FORM_TYPE'] = "editsource";
                 $source = false;
                 $timestamp = isset($_REQUEST['ts']) ? $parent->clean($_REQUEST['ts'], "string") : "";
                 if ($timestamp) {
                     $source = $source_model->getMediaSource($timestamp);
                 }
                 if (!$source) {
                     // nothing to edit: fall back to the add-source form
                     $data['SOURCE_FORM_TYPE'] = "addsource";
                     break;
                 }
                 $data['ts'] = $timestamp;
                 $update = false;
                 $is_html_feed = false;
                 if ($source['TYPE'] == 'html') {
                     // unpack the five paths stored in AUX_INFO
                     $is_html_feed = true;
                     list($source['CHANNEL_PATH'], $source['ITEM_PATH'], $source['TITLE_PATH'], $source['DESCRIPTION_PATH'], $source['LINK_PATH']) = explode("###", $source['AUX_INFO']);
                 }
                 foreach ($data['CURRENT_SOURCE'] as $field => $value) {
                     $upper_field = strtoupper($field);
                     if (isset($_REQUEST[$field]) && $field != 'name') {
                         // the name of a source is not editable
                         $source[$upper_field] = $parent->clean($_REQUEST[$field], "string");
                         $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         $update = true;
                     } else {
                         if (isset($source[$upper_field])) {
                             $data['CURRENT_SOURCE'][$field] = $source[$upper_field];
                         }
                     }
                 }
                 if ($update) {
                     if ($is_html_feed) {
                         $source['AUX_INFO'] = $source['CHANNEL_PATH'] . "###" . $source['ITEM_PATH'] . "###" . $source['TITLE_PATH'] . "###" . $source['DESCRIPTION_PATH'] . "###" . $source['LINK_PATH'];
                     }
                     // the path fields only live inside AUX_INFO, so strip
                     // them before handing the record to the model
                     unset($source['CHANNEL_PATH']);
                     unset($source['ITEM_PATH']);
                     unset($source['TITLE_PATH']);
                     unset($source['DESCRIPTION_PATH']);
                     unset($source['LINK_PATH']);
                     $source_model->updateMediaSource($source);
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('crawl_component_media_source_updated') . "</h1>');";
                 }
                 break;
         }
     }
     $data['CAN_LOCALIZE'] = $parent->model("user")->isAllowedUserActivity($_SESSION['USER_ID'], "manageLocales");
     $parent->pagingLogic($data, $source_model, "MEDIA_SOURCES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("NAME", "", "", "ASC")));
     $parent->pagingLogic($data, $source_model, "SUBSEARCHES", DEFAULT_ADMIN_PAGING_NUM / 5, array(array("FOLDER_NAME", "", "", "ASC")), "SUB", "SUBSEARCH");
     // drop subsearches whose index or mix is no longer in SEARCH_LISTS
     foreach ($data["SUBSEARCHES"] as $search) {
         if (!isset($data["SEARCH_LISTS"][trim($search['INDEX_IDENTIFIER'])])) {
             $source_model->deleteSubsearch($search["FOLDER_NAME"]);
         }
     }
     $data['SCRIPT'] .= "source_type = elt('source-type');" . "source_type.onchange = switchSourceType;" . "switchSourceType()";
     return $data;
 }
Пример #9
0
 /**
  * Checks whether $url belongs to a site that has an hourly download
  * quota and, if so, whether that quota still has room. When there is
  * room the site's counter is bumped. Urls of sites without a quota are
  * always accepted. Once more than ONE_HOUR has passed since the last
  * reset, all counters are set back to zero (after the check above).
  *
  * @param string $url to check if within quota
  * @return bool true if $url may be downloaded (no quota applies or the
  *     quota is not yet used up); false if the site's quota is exhausted
  */
 function withinQuota($url)
 {
     $matched_site = UrlParser::urlMemberSiteArray($url, $this->quota_sites_keys, "q" . $this->allow_disallow_cache_time, true);
     if (!$matched_site) {
         return true;
     }
     list($limit, $used) = $this->quota_sites[$matched_site];
     $has_room = $used < $limit;
     if ($has_room) {
         $this->quota_sites[$matched_site] = array($limit, $used + 1);
     }
     // hourly reset of every site's counter
     if (time() > $this->quota_clear_time + ONE_HOUR) {
         $this->quota_clear_time = time();
         foreach ($this->quota_sites as $quota_site => $quota_info) {
             $this->quota_sites[$quota_site] = array($quota_info[0], 0);
         }
     }
     return $has_room;
 }
Пример #10
0
 /**
  * Used to handle data from the suggest-a-url to crawl form
  * (suggest_view.php). Basically, it saves any data submitted to
  * a file which can then be imported in manageCrawls
  *
  * The form is protected by one of three captcha schemes selected by
  * the CAPTCHA_MODE constant (TEXT_CAPTCHA, HASH_CAPTCHA, IMAGE_CAPTCHA)
  * and by a build_time token that must match between the session and
  * the request. A url is only appended to the suggested sites when the
  * request carries a truthy 'arg', the url validates and the captcha
  * check succeeds.
  *
  * @return array $data contains fields with the current value for
  *     the url (if set but not submitted) as well as for a captcha
  */
 function suggestUrl()
 {
     $data["REFRESH"] = "suggest";
     $visitor_model = $this->model("visitor");
     $clear = false;
     // discard session state left over from captcha modes not in use
     if (CAPTCHA_MODE != IMAGE_CAPTCHA) {
         unset($_SESSION["captcha_text"]);
     }
     if (CAPTCHA_MODE != TEXT_CAPTCHA) {
         unset($_SESSION['CAPTCHA']);
         unset($_SESSION['CAPTCHA_ANSWERS']);
     }
     if (CAPTCHA_MODE != HASH_CAPTCHA) {
         // $num_captchas is only needed by the TEXT_CAPTCHA branches below
         $num_captchas = self::NUM_CAPTCHA_QUESTIONS;
         unset($_SESSION["request_time"]);
         unset($_SESSION["level"]);
         unset($_SESSION["random_string"]);
     } else {
         $data['INCLUDE_SCRIPTS'] = array("sha1", "hash_captcha");
     }
     // A missing, mismatched or non-positive build_time means this is not
     // a submission of the form we last served: start over with a fresh
     // form and ignore any url/arg sent along.
     if (!isset($_SESSION['BUILD_TIME']) || !isset($_REQUEST['build_time']) || $_SESSION['BUILD_TIME'] != $_REQUEST['build_time'] || $this->clean($_REQUEST['build_time'], "int") <= 0) {
         if (CAPTCHA_MODE == HASH_CAPTCHA) {
             // issue a new hash captcha challenge
             $time = time();
             $_SESSION["request_time"] = $time;
             $_SESSION["level"] = self::HASH_CAPTCHA_LEVEL;
             $_SESSION["random_string"] = md5($time . AUTH_KEY);
         }
         $clear = true;
         if (isset($_REQUEST['url'])) {
             unset($_REQUEST['url']);
         }
         if (isset($_REQUEST['arg'])) {
             unset($_REQUEST['arg']);
         }
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
     } else {
         $data['build_time'] = $_SESSION['BUILD_TIME'];
     }
     $data['url'] = "";
     if (isset($_REQUEST['url'])) {
         $data['url'] = $this->clean($_REQUEST['url'], "string");
     }
     $missing = array();
     // $save is true only when the form was actually submitted
     $save = isset($_REQUEST['arg']) && $_REQUEST['arg'];
     if (CAPTCHA_MODE == TEXT_CAPTCHA) {
         // default every question to the "-1" (unanswered) choice
         for ($i = 0; $i < $num_captchas; $i++) {
             $data["question_{$i}"] = "-1";
             if ($clear && isset($_REQUEST["question_{$i}"])) {
                 unset($_REQUEST["question_{$i}"]);
             }
         }
         if (!isset($_SESSION['CAPTCHA']) || !isset($_SESSION['CAPTCHA_ANSWERS'])) {
             // no questions in the session yet: pick a new set and
             // re-stamp the build time
             list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
             $data['CAPTCHA'] = $captchas;
             $data['build_time'] = time();
             $_SESSION['BUILD_TIME'] = $data['build_time'];
             $_SESSION['CAPTCHA_ANSWERS'] = $answers;
             $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
         } else {
             $data['CAPTCHA'] = $_SESSION['CAPTCHA'];
         }
         // on submission, a question counts as missing if it is unanswered
         // or its answer is not among the choices that were offered
         for ($i = 0; $i < $num_captchas; $i++) {
             $field = "question_{$i}";
             $captchas = isset($_SESSION['CAPTCHA'][$i]) ? $_SESSION['CAPTCHA'][$i] : array();
             if ($save) {
                 if (!isset($_REQUEST[$field]) || $_REQUEST[$field] == "-1" || !in_array($_REQUEST[$field], $captchas)) {
                     $missing[] = $field;
                 } else {
                     $data[$field] = $_REQUEST[$field];
                 }
             }
         }
     }
     $data['MISSING'] = $missing;
     $fail = false;
     if (CAPTCHA_MODE == IMAGE_CAPTCHA && !$save) {
         $this->setupGraphicalCaptchaViewData($data);
     }
     if ($save && isset($_REQUEST['url'])) {
         $url = $this->clean($_REQUEST['url'], "string");
         $url_parts = @parse_url($url);
         // default to http when the user typed a bare host
         if (!isset($url_parts['scheme'])) {
             $url = "http://" . $url;
         }
         $suggest_host = UrlParser::getHost($url);
         $scheme = UrlParser::getScheme($url);
         // NOTE(review): the 12 character minimum presumably counts the
         // scheme prefix returned by getHost -- confirm against UrlParser
         if (strlen($suggest_host) < 12 || !$suggest_host || !in_array($scheme, array("http", "https"))) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_invalid_url') . "</h1>');";
             $fail = true;
         } else {
             if ($missing != array()) {
                 $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_error_fields') . "</h1>');";
                 $fail = true;
             }
         }
         if (CAPTCHA_MODE == IMAGE_CAPTCHA && $fail) {
             $this->setupGraphicalCaptchaViewData($data);
         }
         if ($fail) {
             return $data;
         }
         // url and form fields look fine; now verify the captcha itself
         switch (CAPTCHA_MODE) {
             case HASH_CAPTCHA:
                 if (!$this->validateHashCode()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_hashcode') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 break;
             case TEXT_CAPTCHA:
                 $fail = false;
                 if (!$this->checkCaptchaAnswers()) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_human') . "</h1>');";
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     $data['build_time'] = time();
                     $_SESSION['BUILD_TIME'] = $data['build_time'];
                     $fail = true;
                 }
                 // pass or fail, the next form shown gets a new question set
                 for ($i = 0; $i < $num_captchas; $i++) {
                     $data["question_{$i}"] = "-1";
                 }
                 list($captchas, $answers) = $this->selectQuestionsAnswers($this->captchas_qa, $num_captchas, self::NUM_CAPTCHA_CHOICES);
                 $data['CAPTCHA'] = $captchas;
                 $_SESSION['CAPTCHA_ANSWERS'] = $answers;
                 $_SESSION['CAPTCHA'] = $data['CAPTCHA'];
                 if ($fail) {
                     return $data;
                 }
                 break;
             case IMAGE_CAPTCHA:
                 $user_captcha_text = isset($_REQUEST['user_captcha_text']) ? $this->clean($_REQUEST['user_captcha_text'], "string") : "";
                 // NOTE(review): the comparison is skipped when the session
                 // holds no captcha_text -- confirm that
                 // setupGraphicalCaptchaViewData always stores one
                 if (isset($_SESSION['captcha_text']) && $_SESSION['captcha_text'] != trim($user_captcha_text)) {
                     $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_failed_graphical_human') . "</h1>');";
                     unset($_SESSION['captcha_text']);
                     $this->setupGraphicalCaptchaViewData($data);
                     $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "captcha_time_out");
                     return $data;
                 }
                 $this->setupGraphicalCaptchaViewData($data);
                 break;
         }
         // Handle cases where captcha was okay
         if (!$this->model("crawl")->appendSuggestSites($url)) {
             $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_suggest_full') . "</h1>');";
             return $data;
         }
         $data['SCRIPT'] = "doMessage('<h1 class=\"red\" >" . tl('register_controller_url_submitted') . "</h1>');";
         $visitor_model->updateVisitor($_SERVER['REMOTE_ADDR'], "suggest_day_exceeded", ONE_DAY, ONE_DAY, MAX_SUGGEST_URLS_ONE_DAY);
         // new build time so the same form cannot be submitted twice
         $data['build_time'] = time();
         $_SESSION['BUILD_TIME'] = $data['build_time'];
         $data['url'] = "";
     }
     return $data;
 }
Пример #11
0
 /**
  * Decides from stored robots.txt info whether $url may be crawled.
  *
  * Robot records are looked up by a hash of the url's host; up to 2000
  * of them are kept in a function-local static cache. A matching entry
  * among the allowed paths takes precedence over the disallowed paths.
  *
  * @param string $url to check
  * @return bool whether it was allowed or not
  */
 function checkRobotOkay($url)
 {
     // per-process memo of recently used robots.txt records
     static $robot_cache = array();
     $max_cached_hosts = 2000;
     list($host, $raw_path) = UrlParser::getHostAndPath($url, true, true);
     $path = urldecode($raw_path);
     $host_key = crawlHash($host, true);
     if (!isset($robot_cache[$host_key])) {
         $record_offset = unpackInt($this->robot_table->lookup($host_key));
         $robot_info = $this->robot_archive->getObjects($record_offset, 1);
         $robot_cache[$host_key] = $robot_info;
         if (count($robot_cache) > $max_cached_hosts) {
             // evict the oldest entry
             array_shift($robot_cache);
         }
     } else {
         $robot_info = $robot_cache[$host_key];
     }
     $path_rules = array();
     if (isset($robot_info[0][1])) {
         $path_rules = $robot_info[0][1];
     }
     //stored paths should have been urldecoded in RobotProcessor
     $is_disallowed = false;
     $is_allowed = true;
     if (isset($path_rules[CrawlConstants::DISALLOWED_SITES])) {
         $is_disallowed = UrlParser::isPathMemberRegexPaths($path, $path_rules[CrawlConstants::DISALLOWED_SITES]);
         $is_allowed = !$is_disallowed;
     }
     if (isset($path_rules[CrawlConstants::ALLOWED_SITES])) {
         $is_allowed = UrlParser::isPathMemberRegexPaths($path, $path_rules[CrawlConstants::ALLOWED_SITES]);
     }
     return $is_allowed || !$is_disallowed;
 }
Пример #12
0
 /**
  * This method adds robots metas to or removes entirely a summary
  * produced by a text page processor or its subclasses depending on
  * whether the summary title and description satisfy various rules
  * in $this->filter_rules
  *
  * $this->filter_rules maps a site pattern to a list of rules. The key
  * "default" applies to every url. A key beginning with '-' applies to
  * urls that do NOT match the rest of the key; any other key (with an
  * optional leading '+') applies to urls that do match it. Each rule has
  * "PRECONDITIONS" and "ACTIONS"; if the action list contains "NOPROCESS"
  * the summary is replaced by false, otherwise the actions are added to
  * the summary's robot metas.
  *
  * @param array& $summary the summary data produced by the relevant page
  *     processor's handle method; modified in-place.
  * @param string $url the url where the summary contents came from
  */
 function pageSummaryProcessing(&$summary, $url)
 {
     $filter_rules = $this->filter_rules;
     // tolerate rule sets that have no "default" entry
     $rules = !empty($filter_rules['default']) ? $filter_rules['default'] : array();
     foreach ($filter_rules as $site => $site_rules) {
         if ($site == "default") {
             continue;
         }
         $prefix = isset($site[0]) ? $site[0] : '';
         $sign = $prefix != '-';
         $check_url = ($prefix == '-' || $prefix == '+') ? substr($site, 1) : $site;
         $is_member = (bool) UrlParser::urlMemberSiteArray($url, array($check_url), $url . $check_url);
         // positive patterns apply on a match, negative ones on a non-match
         if ($sign == $is_member) {
             $rules = array_merge($rules, $site_rules);
         }
     }
     foreach ($rules as $rule) {
         $preconditions = $rule["PRECONDITIONS"];
         $actions = $rule["ACTIONS"];
         $filter_flag = $this->checkFilter($preconditions, $summary[self::TITLE], $summary[self::DESCRIPTION]);
         if (!$filter_flag) {
             continue;
         }
         if (in_array("NOPROCESS", $actions)) {
             crawlLog("  Word filter plugin removed page.");
             $summary = false;
             break;
         }
         if (!isset($summary[self::ROBOT_METAS])) {
             $summary[self::ROBOT_METAS] = array();
         }
         // Both arrays are lists, so the array union operator (+) would
         // silently discard any action whose numeric index is already
         // taken; merge and de-duplicate instead.
         $summary[self::ROBOT_METAS] = array_values(array_unique(array_merge($summary[self::ROBOT_METAS], $actions)));
     }
 }
Пример #13
0
 /**
  * Used to determine if an action involves just one yioop instance on
  * the current local machine or not
  *
  * @param array $machine_urls urls of yioop instances to which the action
  *     applies
  * @param string $index_timestamp if timestamp exists checks if the index
  *     has declared itself to be a no network index (a no_network.txt
  *     file in its cache folder).
  * @return bool whether it involves a single local yioop instance (true)
  *     or not (false). An empty $machine_urls yields false unless the
  *     index is a no network index.
  */
 function isSingleLocalhost($machine_urls, $index_timestamp = -1)
 {
     if ($index_timestamp >= 0) {
         $index_archive_name = self::index_data_base_name . $index_timestamp;
         if (file_exists(CRAWL_DIR . "/cache/{$index_archive_name}/no_network.txt")) {
             return true;
         }
     }
     if (count($machine_urls) > 1) {
         return false;
     }
     // reset() returns the first element whatever its key is and false
     // for an empty array, so index 0 is never read when it doesn't exist
     $only_url = reset($machine_urls);
     if ($only_url === false) {
         return false;
     }
     return (bool) UrlParser::isLocalhostUrl($only_url);
 }
Пример #14
0
 /**
  * Returns up to MAX_LINKS_TO_EXTRACT many links from the supplied
  * dom object where links have been canonicalized according to
  * the supplied $site information.
  *
  * Links are taken from the Target attribute of Relationship elements
  * (inside Relationships elements) whose Type is the Office Open XML
  * hyperlink relationship.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object as canonical url => link
  *     text; the texts of repeated urls are joined with a space
  */
 static function links($dom, $site)
 {
     $sites = array();
     $hyperlink = "http://schemas.openxmlformats.org/officeDocument/2006/" . "relationships/hyperlink";
     $i = 0;
     foreach ($dom->getElementsByTagName("Relationships") as $relationship) {
         foreach ($relationship->getElementsByTagName("Relationship") as $relation) {
             if ($i >= MAX_LINKS_TO_EXTRACT) {
                 // limit reached, no point in scanning the rest
                 break 2;
             }
             if (strcmp($relation->getAttribute('Type'), $hyperlink) != 0) {
                 continue;
             }
             $link = $relation->getAttribute('Target');
             $url = UrlParser::canonicalLink($link, $site);
             // skip links that could not be canonicalized so no entry
             // with an empty key is produced
             if (!is_string($url) || $url === "") {
                 continue;
             }
             if (UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
                 continue;
             }
             if (isset($sites[$url])) {
                 $sites[$url] .= " " . $link;
             } else {
                 $sites[$url] = $link;
             }
             $i++;
         }
     }
     return $sites;
 }
Пример #15
0
 /**
  * Returns links from the supplied dom object where links have been
  * canonicalized according to the supplied $site information. At most
  * MAX_LINKS_TO_EXTRACT hyperlink nodes are examined.
  *
  * Every a:hlinkClick found below the slide's text runs is looked at
  * and the text of the first "t" element of the enclosing run is used
  * as the link.
  *
  * @param object $dom a document object with links on it
  * @param string $site a string containing a url
  *
  * @return array links from the $dom object as canonical url => link
  *     text; the texts of repeated urls are joined with a space
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $paras = $xpath->evaluate("/p:sld//p:cSld//p:spTree//p:sp//\n            p:txBody//a:p//a:r//a:rPr//a:hlinkClick");
     if (!$paras instanceof DOMNodeList) {
         // evaluate() returns false when the expression cannot be run
         return $sites;
     }
     $i = 0;
     foreach ($paras as $para) {
         if ($i >= MAX_LINKS_TO_EXTRACT) {
             break;
         }
         // every examined node counts against the limit
         $i++;
         $run = $para->parentNode ? $para->parentNode->parentNode : null;
         $text_node = $run instanceof DOMElement ? $run->getElementsByTagName("t")->item(0) : null;
         if ($text_node === null) {
             // hyperlink without any text in its run
             continue;
         }
         $hlink = $text_node->nodeValue;
         $url = UrlParser::canonicalLink($hlink, $site);
         if (!is_string($url) || $url === "") {
             continue;
         }
         if (UrlParser::checkRecursiveUrl($url) || strlen($url) >= MAX_URL_LEN) {
             continue;
         }
         if (isset($sites[$url])) {
             $sites[$url] .= " " . $hlink;
         } else {
             $sites[$url] = $hlink;
         }
     }
     return $sites;
 }
Пример #16
0
 /**
  * Tries to determine the language of the document by looking at the
  * $url provided. The value returned is whatever
  * UrlParser::getDocumentType() reports for the url; $sample_text is
  * accepted for signature compatibility but is not consulted.
  *
  * @param string $sample_text sample text to try guess the language from
  *     (currently unused)
  * @param string $url url of the document
  *
  * @return string language tag for guessed language, or NULL when no
  *     $url is given
  */
 static function calculateLang($sample_text = NULL, $url = NULL)
 {
     // make sure there is a defined result when no url is supplied
     $lang = NULL;
     if ($url != NULL) {
         $lang = UrlParser::getDocumentType($url);
     }
     return $lang;
 }
Пример #17
0
 /**
  * Returns links from the supplied dom object of a sitemap
  * where links have been canonicalized according to
  * the supplied $site information. We allow more links from a sitemap
  * than from other kinds of documents (at most MAX_LINKS_PER_SITEMAP).
  * For now we are ignoring weighting info
  *
  * Both urlset sitemaps and sitemapindex files are handled. Urls that
  * are empty, recursive, too long or whose document type is "gz" are
  * skipped.
  *
  * @param object $dom   a document object with links on it
  * @param string $site   a string containing a url
  *
  * @return array   links from the $dom object as canonical url =>
  *     "From sitemap of $site"
  */
 static function links($dom, $site)
 {
     $sites = array();
     $xpath = new DOMXPath($dom);
     $xpath->registerNamespace('s', "http://www.sitemaps.org/schemas/sitemap/0.9");
     $paths = array("/s:urlset/s:url/s:loc", "/s:sitemapindex/s:sitemap/s:loc");
     $i = 0;
     foreach ($paths as $path) {
         $nodes = @$xpath->evaluate($path);
         if (!$nodes instanceof DOMNodeList) {
             // evaluate() failed for this path, nothing to iterate
             continue;
         }
         foreach ($nodes as $node) {
             $url = UrlParser::canonicalLink($node->textContent, $site);
             if ($url === NULL || $url === "" || UrlParser::checkRecursiveUrl($url) || UrlParser::getDocumentType($url) == "gz" || strlen($url) >= MAX_URL_LEN) {
                 //at this point we can't handle gzip'd sitemaps
                 continue;
             }
             $sites[$url] = "From sitemap of " . $site;
             $i++;
             // stop exactly at the limit (a strict > let one extra through)
             if ($i >= MAX_LINKS_PER_SITEMAP) {
                 break 2;
             }
         }
     }
     return $sites;
 }
 /**
  * Parses and validates the argument data and stores the resulting
  * settings on this object.
  *
  * Handled in order: the help switch, image/chapter delays, an optional
  * url (expanded into source, slug and chapter ids), rejection of unknown
  * parameter names, action, source, slug, name, output dir, chapters
  * count, chapter ids (single ids, comma separated lists, ranges, or a
  * readable file containing such a list), .cbr creation and the
  * no-cbr-backup switch. The validation failures handled directly in this
  * method print a console message and terminate the script.
  *
  * @param array $data argument name => value pairs; anything that is not
  *     an array is treated as an empty array
  */
 private function parseData($data = array())
 {
     // show help if requested and exit!
     if (isset($data['help'])) {
         require_once MANGA_ROOT_DIR . 'includes/templates/help/index.php';
         exit;
     }
     $data = is_array($data) ? $data : array();
     // image delay
     $this->setImageDelay(Input::array_value($data, 'image-delay', '', 'trim'));
     // chapter delay
     $this->setChapterDelay(Input::array_value($data, 'chapter-delay', '', 'trim'));
     // url: when given, it supplies source, slug and (optionally) chapter
     if (isset($data['url'])) {
         $url = trim($data['url']);
         if ($url == '') {
             consoleLineError("Url parameter cannot be empty!");
             exit;
         }
         $parsedData = UrlParser::parseUrl($url);
         if (!$parsedData) {
             consoleLineError("Provided url is not valid!");
             exit;
         }
         $data['source'] = $parsedData['source'];
         $data['slug'] = $parsedData['slug'];
         $chapter = trim($parsedData['chapter']);
         if ($chapter != '') {
             // a chapter in the url switches to the specific-chapters action
             $data['chapter-ids'] = $chapter;
             $data['action'] = self::ACTION_SPECIFIC_CHAPTERS;
         }
     }
     // check for valid params
     $dataKeys = array_keys($data);
     $diff = array_diff($dataKeys, $this->_allowed_param_names);
     if (count($diff) > 0) {
         consoleLineError("Invalid params: " . join(',', $diff), 2);
         exit;
     }
     $this->_argumentsList = $data;
     // action
     $action = Input::array_value($data, 'action', '', 'trim');
     if ($action == '') {
         $action = self::ACTION_NEW_CHAPTERS;
     }
     if (!$this->isValidAction($action)) {
         $this->displayInvalidActionMessage(TRUE);
     } else {
         $this->_action = $action;
         if ($this->_action == self::ACTION_SPECIFIC_CHAPTERS) {
             $chapterIds = Input::array_value($data, 'chapter-ids', '', 'trim');
             if ($chapterIds == '') {
                 consoleLineError('One or more chapter ids are required when action is "' . self::ACTION_SPECIFIC_CHAPTERS . '"');
                 Console::emptyLines();
                 exit;
             }
         }
     }
     // source
     $source = Input::array_value($data, 'source', MangaSourceList::SOUCE_MANGAPANDA, 'trim');
     if (MangaSourceList::getInstance()->isValidSource($source)) {
         $this->_source = $source;
     } else {
         MangaSourceList::getInstance()->displayInvalidMangaSourceMessage(TRUE);
     }
     // slug
     $slug = Input::array_value($data, 'slug', '', 'trim');
     if ($slug == '') {
         consoleLineError('Manga slug is required!', 2);
         consoleLinePurple('Example: --slug=nisekoi', 2);
         Console::writeMultiline('Slug usualy means the SEO friendly name of the manga. But it can be different for different manga sources.The slug is part of the manga chapters list url.');
         consoleLineInfo('');
         exit;
     }
     $this->_mangaSlug = $slug;
     // name (falls back to the slug)
     $name = Input::array_value($data, 'name', '', 'trim');
     if ($name == '') {
         $name = $this->_mangaSlug;
     }
     $this->_mangaName = $name;
     // Output dir (defaults to ./manga/<source>/<slug>/)
     $output_dir = Input::array_value($data, 'output-dir', '', 'trim');
     if ($output_dir == '') {
         $output_dir = './manga/' . $this->_source . '/' . $this->_mangaSlug . '/';
     }
     if (!is_dir($output_dir)) {
         if (!mkdir($output_dir, 0777, TRUE)) {
             consoleLineError("Unable to create output dir: " . $output_dir, 2);
             consoleLineInfo('');
             exit;
         }
     } elseif (!is_writable($output_dir)) {
         // is_writable() is used instead of probing with tempnam(): tempnam()
         // may fall back to the system temp dir when the target is not
         // writable, which made the old probe succeed for read-only dirs
         // (and the probe's file handle was never closed).
         consoleLineError("Output dir is not writeable! " . $output_dir, 2);
         consoleLineInfo('');
         exit;
     }
     $this->_output_dir = $output_dir;
     # chapters count (negative values are clamped to 0)
     $chaptersCount = Input::array_value_as_int($data, 'chapters-count', 0);
     if ($chaptersCount < 0) {
         $chaptersCount = 0;
     }
     $this->_chapters_count = $chaptersCount;
     # chapter ids
     $chapterIds = Input::array_value($data, 'chapter-ids', '', 'trim');
     if ($chapterIds == '') {
         $this->_chapter_ids = array();
     } else {
         // is it a file?
         if (is_readable($chapterIds)) {
             $chapterIds = trim(file_get_contents($chapterIds));
         }
         $chapterIds = explode(',', $chapterIds);
         $chapterIds = array_map('trim', $chapterIds);
         // check for ranges ("start-end")
         $chapterRangesIds = array();
         foreach ($chapterIds as $k => $v) {
             $cid = $chapterIds[$k];
             if (preg_match('/([0-9.]+)\\s*-\\s*([0-9.]+)/im', $cid, $regs)) {
                 $chapterRangesIds[$k] = array('start' => $regs[1], 'end' => $regs[2]);
             }
         }
         if (count($chapterRangesIds) > 0) {
             // unset the range format entries first, as we are gonna get real
             // chapter ids from that range next
             foreach ($chapterRangesIds as $k => $rangeData) {
                 unset($chapterIds[$k]);
             }
             // get available chapters from ranges, stepping by 1 from start
             foreach ($chapterRangesIds as $k => $rangeData) {
                 $start = $rangeData['start'];
                 $end = $rangeData['end'];
                 for ($i = $start; $i <= $end; $i += 1) {
                     $chapterIds[] = $i;
                 }
             }
         }
         asort($chapterIds);
         $chapterIds = array_unique($chapterIds);
         $this->_chapter_ids = $chapterIds;
     }
     # create cbr
     $createCbr = isset($data['create-cbr']) ? $data['create-cbr'] : TRUE;
     // Detect rar through the exit status of "command -v". The previous
     // check looked for the text 'not found' in the output of "type -p",
     // which prints nothing on stdout when the program is missing, and it
     // tested strpos() for truthiness, so a match at offset 0 was ignored.
     $rarPath = array();
     $rarStatus = 1;
     exec('command -v rar 2>/dev/null', $rarPath, $rarStatus);
     if ($rarStatus !== 0 || count($rarPath) == 0) {
         consoleLineError('rar doesnt seem to be installed in the system!');
         $createCbr = FALSE;
     }
     $this->_create_cbr = $createCbr;
     if (!$this->_create_cbr) {
         consoleLineError('.cbr files will not be created!');
     }
     # no cbr backup (only read for the recreate-cbr action)
     if ($this->_action == self::ACTION_RECREATE_CBR) {
         $this->_no_cbr_backup = isset($data['no-cbr-backup']) && $data['no-cbr-backup'];
     }
 }
Пример #19
0
 /**
  * Tries to determine the language of the document by looking at the
  * $sample_text and $url provided.
  *
  * The url is consulted first through UrlParser::getLang(); only if that
  * gives nothing is the sample text examined. The text heuristic is
  * crude: if the fraction of tokens made only of single-byte characters
  * exceeds EN_RATIO the text is guessed to be English.
  *
  * @param string $sample_text sample text to try guess the language from
  * @param string $url url of web-page as a fallback look at the country
  *     to figure out language
  *
  * @return string language tag for guessed language, or NULL if no guess
  *     could be made
  */
 static function calculateLang($sample_text = NULL, $url = NULL)
 {
     if ($url != NULL) {
         $lang = UrlParser::getLang($url);
         if ($lang != NULL) {
             return $lang;
         }
     }
     if ($sample_text == NULL) {
         return NULL;
     }
     $words = mb_split("[[:space:]]|" . PUNCT, $sample_text);
     // mb_split() returns false on failure; without this guard count()
     // and the division below would operate on a non-array / zero
     if (!is_array($words) || count($words) == 0) {
         return NULL;
     }
     $num_words = count($words);
     $ascii_count = 0;
     foreach ($words as $word) {
         // byte length equals character length only for single-byte text
         if (strlen($word) == mb_strlen($word)) {
             $ascii_count++;
         }
     }
     // crude, but let's guess ASCII == english
     if ($ascii_count / $num_words > EN_RATIO) {
         return 'en';
     }
     return NULL;
 }
Пример #20
0
<?php

// Front controller: load the definitions and common code, turn the
// request's path info into a parsed url and hand it to the main controller.
require_once './define.php';
require_once DIREC . '/common/common.php';
$parse = new UrlParser();
// PATH_INFO is not set when the script is requested without a trailing
// path; pass NULL explicitly rather than reading an undefined array key.
$url = $parse->urlParse(isset($_SERVER['PATH_INFO']) ? $_SERVER['PATH_INFO'] : NULL);
$main = new MainController();
$main->run($url);
Пример #21
0
 /**
  * Implements post processing of recipes. recipes are extracted
  * ingredients are scrubbed and recipes are clustered. The clustered
  * recipes are added back to the index.
  *
  * Steps: (1) page through the results of a "recipe:all" query for the
  * crawl, (2) split each description on "||" into ingredients and reduce
  * each ingredient to its main name, (3) compute a pairwise edge weight
  * between recipes from their ingredient vectors, (4) cluster with
  * kruskalClustering(), (5) write the clustered recipes into a new shard
  * tagged with "ingredient:..." meta ids and add it to the index archive
  * bundle of the crawl.
  *
  * @param string $index_name  index name of the current crawl.
  */
 function postProcessing($index_name)
 {
     global $INDEXING_PLUGINS;
     if (!class_exists("SplHeap")) {
         crawlLog("...Recipe Plugin Requires SPLHeap for clustering!");
         crawlLog("...Aborting plugin");
         return;
     }
     $locale_tag = guessLocale();
     setLocaleObject($locale_tag);
     $search_controller = new SearchController($INDEXING_PLUGINS);
     $query = "recipe:all i:{$index_name}";
     crawlLog("...Running Recipe Plugin!");
     crawlLog("...Finding docs tagged as recipes.");
     $more_docs = true;
     $raw_recipes = array();
     $limit = 0;
     $num = 100;
     while ($more_docs) {
         $results = @$search_controller->queryRequest($query, $num, $limit, 1, $index_name);
         // reset per batch: previously this was undefined on a first empty
         // batch and kept the stale previous count on later empty batches
         $num_results = 0;
         if (isset($results["PAGES"])) {
             $num_results = count($results["PAGES"]);
         }
         if ($num_results > 0) {
             $raw_recipes = array_merge($raw_recipes, $results["PAGES"]);
         }
         crawlLog("Scanning recipes {$limit} through " . ($limit + $num_results) . ".");
         $limit += $num_results;
         if ($num_results == 0 || !isset($results["SAVE_POINT"])) {
             // an empty batch cannot advance $limit, so asking again would
             // repeat the same request forever
             $more_docs = false;
         } else {
             // finished once every save point reports -1
             $end = true;
             foreach ($results["SAVE_POINT"] as $save_point) {
                 if ($save_point != -1) {
                     $end = false;
                 }
             }
             if ($end) {
                 $more_docs = false;
             }
         }
     }
     crawlLog("...Clustering.");
     // only cluster if would make more than one cluster
     if (count($raw_recipes) * CLUSTER_RATIO > 1) {
         // each entry: [0] title, [1] ingredients, [2] url hash, [3] raw doc
         $recipes = array();
         $i = 0;
         foreach ($raw_recipes as $raw_recipe) {
             $description = $raw_recipe[self::DESCRIPTION];
             $ingredients = explode("||", $description);
             if (is_array($ingredients) && count($ingredients) > 1) {
                 $recipes[$i][0] = $raw_recipe[self::TITLE];
                 $recipes[$i][1] = $ingredients;
                 $recipes[$i][2] = crawlHash($raw_recipe[self::URL]);
                 $recipes[$i][3] = $raw_recipe;
                 $i++;
             }
         }
         // reduce every ingredient line to its main ingredient name; drop
         // empty lines, lines ending in ":" and lines with no name found
         foreach ($recipes as $key => $recipe) {
             foreach ($recipe[1] as $index => $ingredient) {
                 if (strlen($ingredient) != 0 && substr($ingredient, strlen($ingredient) - 1) != ":") {
                     $mainIngredient = $this->getIngredientName((string) $ingredient);
                     if (strlen($mainIngredient) != 0) {
                         $recipe[1][$index] = $mainIngredient;
                     } else {
                         unset($recipe[1][$index]);
                     }
                 } else {
                     unset($recipe[1][$index]);
                 }
             }
             $recipes[$key] = $recipe;
         }
         $count = count($recipes);
         $k = 0;
         // initialised up front so they are defined even when fewer than
         // two recipes survived the ingredient filter above
         $weights = array();
         $distinct_ingredients = array();
         $doc_keys = array();
         $recipes_summary = array();
         $basic_ingredients = array('onion', 'oil', 'cheese', 'pepper', 'sauce', 'salt', 'milk', 'butter', 'flour', 'cake', 'garlic', 'cream', 'soda', 'honey', 'powder', 'sauce', 'water', 'vanilla', 'pepper', 'bread', 'sugar', 'vanillaextract', 'celery', 'seasoning', 'syrup', 'skewers', 'egg', 'muffin', 'ginger', 'basil', 'oregano', 'cinammon', 'cumin', 'mayonnaise', 'mayo', 'chillipowder', 'lemon', 'greens', 'yogurt', 'margarine', 'asparagus', 'halfhalf', 'pancakemix', 'coffee', 'cookies', 'lime', 'chillies', 'cilantro', 'rosemary', 'vanillaextract', 'vinegar', 'shallots', 'wine', 'cornmeal', 'nonstickspray');
         for ($i = 0; $i < $count; $i++) {
             $recipe1_main_ingredient = "";
             $recipe1 = $recipes[$i][1];
             $recipe_name = $recipes[$i][0];
             $recipe1_title = strtolower($recipes[$i][0]);
             $distinct_ingredients[$recipe_name] = $recipes[$i][1];
             $doc_keys[$recipe_name] = $recipes[$i][2];
             $recipes_summary[$recipe_name] = $recipes[$i][3];
             for ($j = $i + 1; $j < $count; $j++) {
                 $recipe2_main_ingredient = "";
                 $recipe2 = $recipes[$j][1];
                 $recipe2_title = strtolower($recipes[$j][0]);
                 $weights[$k][0] = $recipes[$i][0];
                 $weights[$k][1] = $recipes[$j][0];
                 // 0/1 vectors over the union of both ingredient lists
                 $merge_array = array_merge($recipe1, $recipe2);
                 $vector_array = array_unique($merge_array);
                 sort($vector_array);
                 $recipe1_vector = array_fill_keys($vector_array, 0);
                 $recipe2_vector = array_fill_keys($vector_array, 0);
                 foreach ($recipe1 as $ingredient) {
                     // a non-basic ingredient that occurs in the title is
                     // taken as the recipe's main ingredient
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe1_title, $ingredient)) {
                             $recipe1_main_ingredient = $ingredient;
                         }
                     }
                     $recipe1_vector[$ingredient] = 1;
                 }
                 foreach ($recipe2 as $ingredient) {
                     if ($ingredient != "" && !in_array($ingredient, $basic_ingredients)) {
                         if (strstr($recipe2_title, $ingredient)) {
                             $recipe2_main_ingredient = $ingredient;
                         }
                     }
                     $recipe2_vector[$ingredient] = 1;
                 }
                 $edge_weight = 0;
                 // starts at 1 and grows by one per ingredient present in
                 // only one of the two recipes
                 $matches = 1;
                 foreach ($vector_array as $vector) {
                     $diff = $recipe1_vector[$vector] - $recipe2_vector[$vector];
                     if (abs($diff) == 1) {
                         $matches += 1;
                     }
                     $edge_weight += pow($diff, 2);
                 }
                 // heavily penalise pairs whose main ingredients differ
                 $main_ingredient_match = 1;
                 if ($recipe1_main_ingredient != $recipe2_main_ingredient) {
                     $main_ingredient_match = 1000;
                 }
                 $edge_weight = sqrt($edge_weight) * $matches * $main_ingredient_match;
                 $weights[$k][2] = $edge_weight;
                 $k++;
             }
         }
         crawlLog("...Making new shard with clustered recipes as docs.");
         $clusters = kruskalClustering($weights, $count, $distinct_ingredients);
         $index_shard = new IndexShard("cluster_shard");
         $word_lists = array();
         $recipe_sites = array();
         foreach ($clusters as $cluster) {
             // the loop stops one short of count($cluster): besides the
             // numerically indexed recipe names the cluster is also read
             // through its "ingredient" key below
             $count = count($cluster);
             for ($i = 0; $i < $count - 1; $i++) {
                 $meta_ids = array();
                 $summary = array();
                 $recipe = $cluster[$i];
                 $summary[self::URL] = $recipes_summary[$recipe][self::URL];
                 $summary[self::TITLE] = $recipes_summary[$recipe][self::TITLE];
                 $summary[self::DESCRIPTION] = $recipes_summary[$recipe][self::DESCRIPTION];
                 $summary[self::TIMESTAMP] = $recipes_summary[$recipe][self::TIMESTAMP];
                 $summary[self::ENCODING] = $recipes_summary[$recipe][self::ENCODING];
                 $summary[self::HASH] = $recipes_summary[$recipe][self::HASH];
                 // doc key: url hash . page hash . "r"-prefixed host hash
                 $doc_keys[$recipe] = crawlHash($summary[self::URL], true);
                 $hash_rhost = "r" . substr(crawlHash(UrlParser::getHost($summary[self::URL]) . "/", true), 1);
                 $doc_keys[$recipe] .= $summary[self::HASH] . $hash_rhost;
                 $summary[self::TYPE] = $recipes_summary[$recipe][self::TYPE];
                 $summary[self::HTTP_CODE] = $recipes_summary[$recipe][self::HTTP_CODE];
                 $recipe_sites[] = $summary;
                 $meta_ids[] = "ingredient:" . trim($cluster["ingredient"]);
                 crawlLog("ingredient:" . $cluster["ingredient"]);
                 if (!$index_shard->addDocumentWords($doc_keys[$recipe], self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, true, false)) {
                     crawlLog("Problem inserting recipe: " . $summary[self::TITLE]);
                 }
             }
         }
         // round-trip the shard through its saved string form
         $shard_string = $index_shard->save(true);
         $index_shard = IndexShard::load("cluster_shard", $shard_string);
         unset($shard_string);
         crawlLog("...Adding recipe shard to index archive bundle");
         $dir = CRAWL_DIR . "/cache/" . self::index_data_base_name . $index_name;
         $index_archive = new IndexArchiveBundle($dir, false);
         if ($index_shard->word_docs_packed) {
             $index_shard->unpackWordDocs();
         }
         $generation = $index_archive->initGenerationToAdd($index_shard);
         crawlLog("... Adding " . count($recipe_sites) . " recipe docs.");
         $index_archive->addPages($generation, self::SUMMARY_OFFSET, $recipe_sites, 0);
         // map each doc key to the summary offset recorded on its site
         // entry; presumably addPages() fills SUMMARY_OFFSET in by
         // reference - TODO confirm
         $summary_offsets = array();
         foreach ($recipe_sites as $site) {
             $hash = crawlHash($site[self::URL], true) . $site[self::HASH] . "r" . substr(crawlHash(UrlParser::getHost($site[self::URL]) . "/", true), 1);
             $summary_offsets[$hash] = $site[self::SUMMARY_OFFSET];
         }
         $index_shard->changeDocumentOffsets($summary_offsets);
         $index_archive->addIndexData($index_shard);
         $index_archive->saveAndAddCurrentShardDictionary();
         $index_archive->dictionary->mergeAllTiers();
         $this->db->setWorldPermissionsRecursive(CRAWL_DIR . '/cache/' . self::index_data_base_name . $index_name);
         crawlLog("...Recipe plugin finished.");
     }
 }
Пример #22
0
// Bootstrap for pretty-url handling: pull in constants and the database
// connection, load the config table, then split the request path.
require_once(AT_INCLUDE_PATH.'lib/constants.inc.php');
require_once(AT_INCLUDE_PATH.'lib/mysql_connect.inc.php');
//require_once(AT_INCLUDE_PATH.'vitals.inc.php');

//mimic config variables, vitals.inc.php 135-140
/* get config variables. if they're not in the db then it uses the installation default value in constants.inc.php */
// NOTE(review): $db is not assigned in this snippet; presumably it is set up
// by mysql_connect.inc.php - confirm. The mysql_* functions used here were
// removed in PHP 7, so this code only runs on older PHP versions.
$sql    = "SELECT * FROM ".TABLE_PREFIX."config";
$result = mysql_query($sql, $db);
// every row overrides/sets one entry of $_config, keyed by the row's name
while ($row = mysql_fetch_assoc($result)) { 
	$_config[$row['name']] = $row['value'];
}

//Get path info
$pathinfo = getPathInfo();

// getPathArray() is read as a two element list: index 0 is kept as the
// pretty-url course id, index 1 is an object queried further below
$url_parser = new UrlParser($pathinfo);
$path_array =  $url_parser->getPathArray();
$_pretty_url_course_id = $path_array[0];
$obj = $path_array[1];

if (!$obj->isEmpty()){
	/* 
	 * Addresses the issue for relative uri 
	 * @refer to constants.inc.php $_rel_link
	 */
	$_rel_url = $obj->redirect();
	$var_query = $obj->parsePrettyQuery();
	save2Get($var_query);	//remake all the _GET and _REQUEST variables so that the vitals can use it

	$_user_location	= '';	//reset user_location so that the vital file in each page would validate
	$pretty_current_page = $obj->getPage();
Пример #23
0
 /**
  * Decides whether the links found in the content of the given url
  * should be analysed and processed. This is the place to customise
  * that decision; at present it simply defers to the parent class.
  *
  * @param string $url
  * @return boolean
  */
 protected function isFollowUrl($url)
 {
     $shouldFollow = parent::isFollowUrl($url);
     return $shouldFollow;
 }
Пример #24
0
 /**
  * Parses the contents of a robots.txt page extracting allowed,
  * disallowed paths, crawl-delay, and sitemaps. We also extract a
  * list of all user agent strings seen.
  *
  * The page is read line by line. Text after a "#" is dropped, the rest
  * is split at the first ":" into a field and a value, and lines without
  * a ":" or with an empty value are skipped. Allow, Disallow and
  * Crawl-delay lines are only recorded while $add_rule_state is true,
  * i.e. while the most recent User-agent line matched USER_AGENT_SHORT.
  *
  * @param string $page text string of a document
  * @param string $url location the document came from, not used by
  *     TextProcessor at this point. Some of its subclasses override
  *     this method and use url to produce complete links for
  *     relative links within a document
  *
  * @return array a summary of (title, description, links, and content) of
  *     the information in $page; additionally ROBOT_PATHS, AGENT_LIST,
  *     LANG and, when an applicable delay was seen, CRAWL_DELAY
  */
 function process($page, $url)
 {
     $summary = NULL;
     $summary[self::TITLE] = "";
     $summary[self::DESCRIPTION] = "";
     $summary[self::LANG] = NULL;
     $summary[self::ROBOT_PATHS] = array();
     $summary[self::AGENT_LIST] = array();
     $summary[self::LINKS] = array();
     $host_url = UrlParser::getHost($url);
     $lines = explode("\n", $page);
     // true while the rules being read apply to this crawler
     $add_rule_state = false;
     // NOTE(review): $rule_added_flag is set below but never read in this
     // method
     $rule_added_flag = false;
     $delay_flag = false;
     $delay = 0;
     foreach ($lines as $pre_line) {
         // strip trailing comment
         $pre_line_parts = explode("#", $pre_line);
         $line = $pre_line_parts[0];
         $line_parts = explode(":", $line);
         if (!isset($line_parts[1])) {
             continue;
         }
         // field is everything before the first ":", the value keeps any
         // further ":" (e.g. in a sitemap url)
         $field = array_shift($line_parts);
         $value = implode(":", $line_parts);
         //notice we lower case field, so switch below is case insensitive
         $field = strtolower(trim($field));
         $value = trim($value);
         // NOTE(review): $specificness is reset to 0 for every line, so the
         // "$current_specificness < $specificness" test below can never be
         // true and an exact agent match is not remembered across lines.
         // This looks unintended (the comment below talks about precedence
         // for exact matches) - confirm before changing.
         $specificness = 0;
         if (strlen($value) == 0) {
             continue;
         }
         switch ($field) {
             case "user-agent":
                 //we allow * in user agent string
                 $summary[self::AGENT_LIST][] = $value;
                 // 1 for an exact (case sensitive) match, otherwise 0
                 $current_specificness = strcmp($value, USER_AGENT_SHORT) == 0 ? 1 : 0;
                 if ($current_specificness < $specificness) {
                     break;
                 }
                 if ($specificness < $current_specificness) {
                     //Give precedence to exact match on agent string
                     // all paths collected so far are discarded here
                     $specificness = $current_specificness;
                     $add_rule_state = true;
                     $summary[self::ROBOT_PATHS] = array();
                     break;
                 }
                 // wildcard match: every non-empty piece between "*" must
                 // occur in USER_AGENT_SHORT (case insensitive), searched
                 // from the position where the previous piece was found
                 $agent_parts = explode("*", $value);
                 $offset = 0;
                 $add_rule_state = true;
                 foreach ($agent_parts as $part) {
                     if ($part == "") {
                         continue;
                     }
                     $new_offset = stripos(USER_AGENT_SHORT, $part, $offset);
                     if ($new_offset === false) {
                         $add_rule_state = false;
                         break;
                     }
                     $offset = $new_offset;
                 }
                 break;
             case "sitemap":
                 // sitemaps are collected regardless of $add_rule_state
                 $tmp_url = UrlParser::canonicalLink($value, $host_url);
                 if (!UrlParser::checkRecursiveUrl($tmp_url) && strlen($tmp_url) < MAX_URL_LEN) {
                     $summary[self::LINKS][] = $tmp_url;
                 }
                 break;
             case "allow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::ALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "disallow":
                 if ($add_rule_state) {
                     $rule_added_flag = true;
                     $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = $this->makeCanonicalRobotPath($value);
                 }
                 break;
             case "crawl-delay":
                 // keep the largest applicable delay seen
                 if ($add_rule_state) {
                     $delay_flag = true;
                     $delay = max($delay, intval($value));
                 }
                 break;
         }
     }
     if ($delay_flag) {
         // a delay above the maximum is handled by disallowing everything
         // ("/") instead of recording the delay
         if ($delay > MAXIMUM_CRAWL_DELAY) {
             $summary[self::ROBOT_PATHS][self::DISALLOWED_SITES][] = "/";
         } else {
             $summary[self::CRAWL_DELAY] = $delay;
         }
     }
     // the page itself is stored tag-stripped inside a <pre> block
     $summary[self::PAGE] = "<html><body><pre>" . strip_tags($page) . "</pre></body></html>";
     return $summary;
 }
Пример #25
0
 /**
  * Builds an inverted index shard (word --> {docs it appears in})
  * for the current batch of SEEN_URLS_BEFORE_UPDATE_SCHEDULER many pages.
  * This inverted index shard is then merged by a queue_server
  * into the inverted index of the current generation of the crawl.
  * The complete inverted index for the whole crawl is built out of these
  * inverted indexes for generations. The point of computing a partial
  * inverted index on the fetcher is to reduce some of the computational
  * burden on the queue server. The resulting mini index computed by
  * buildMiniInvertedIndex() is stored in
  * $this->found_sites[self::INVERTED_INDEX]
  *
  * For every entry of $this->found_sites[self::SEEN_URLS] the document
  * itself is added to the shard of $this->current_server; each of its
  * outgoing links is turned into a "link" summary (appended to
  * $this->found_sites[self::LINK_SEEN_URLS]) and added to the shard of
  * the partition computed from the link's host.
  */
 function buildMiniInvertedIndex()
 {
     $start_time = microtime();
     // NOTE(review): $keypad is empty, so appending it to $score_keys below
     // adds nothing
     $keypad = "";
     crawlLog("  Start building mini inverted index ...  Current Memory:" . memory_get_usage());
     $num_seen = count($this->found_sites[self::SEEN_URLS]);
     $this->num_seen_sites += $num_seen;
     /*
         for the fetcher we are not saving the index shards so
         name doesn't matter.
     */
     if (!isset($this->found_sites[self::INVERTED_INDEX][$this->current_server])) {
         $this->found_sites[self::INVERTED_INDEX][$this->current_server] = new IndexShard("fetcher_shard_{$this->current_server}");
     }
     for ($i = 0; $i < $num_seen; $i++) {
         $interim_time = microtime();
         $site = $this->found_sites[self::SEEN_URLS][$i];
         // skip entries without a hash and entries whose robot metas
         // contain JUSTFOLLOW
         if (!isset($site[self::HASH]) || isset($site[self::ROBOT_METAS]) && in_array("JUSTFOLLOW", $site[self::ROBOT_METAS])) {
             continue;
         }
         // only archive crawls with an iterator supply a document rank
         $doc_rank = false;
         if ($this->crawl_type == self::ARCHIVE_CRAWL && isset($this->archive_iterator)) {
             $doc_rank = $this->archive_iterator->weight($site);
         }
         if (isset($site[self::TYPE]) && $site[self::TYPE] == "link") {
             // link records carry their document key in HTTP_CODE and the
             // link's url in TITLE (the same layout this method writes for
             // link summaries further down)
             $is_link = true;
             $doc_keys = $site[self::HTTP_CODE];
             $site_url = $site[self::TITLE];
             $host = UrlParser::getHost($site_url);
             // HASH holds the "|"-separated link id; field 5 is the url of
             // the page the link was found on
             $link_parts = explode('|', $site[self::HASH]);
             if (isset($link_parts[5])) {
                 $link_origin = $link_parts[5];
             } else {
                 $link_origin = $site_url;
             }
             $meta_ids = PhraseParser::calculateLinkMetas($site_url, $host, $site[self::DESCRIPTION], $link_origin);
         } else {
             $is_link = false;
             // "|" is the field separator of link ids, so it is escaped
             $site_url = str_replace('|', "%7C", $site[self::URL]);
             $host = UrlParser::getHost($site_url);
             // doc key: url hash . page hash . "d"-prefixed host hash
             $doc_keys = crawlHash($site_url, true) . $site[self::HASH] . "d" . substr(crawlHash($host . "/", true), 1);
             $meta_ids = PhraseParser::calculateMetas($site, $this->video_sources);
         }
         $word_lists = array();
         /*
            self::JUST_METAS check to avoid getting sitemaps in results for
            popular words
         */
         $lang = NULL;
         if (!isset($site[self::JUST_METAS])) {
             $host_words = UrlParser::getWordsIfHostUrl($site_url);
             $path_words = UrlParser::getWordsLastPathPartUrl($site_url);
             // text to index: link text for links, description only for
             // programming-language documents, otherwise host words, title,
             // path words and description
             if ($is_link) {
                 $phrase_string = $site[self::DESCRIPTION];
             } else {
                 if (isset($site[self::LANG])) {
                     if (isset($this->programming_language_extension[$site[self::LANG]])) {
                         $phrase_string = $site[self::DESCRIPTION];
                     } else {
                         $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                     }
                 } else {
                     $phrase_string = $host_words . " " . $site[self::TITLE] . " " . $path_words . " " . $site[self::DESCRIPTION];
                 }
             }
             if (isset($site[self::LANG])) {
                 $lang = guessLocaleFromString(mb_substr($site[self::DESCRIPTION], 0, AD_HOC_TITLE_LENGTH), $site[self::LANG]);
             }
             $word_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang);
             $len = strlen($phrase_string);
             // NOTE(review): $safe is assigned here but not read anywhere
             // in this method
             if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) {
                 $meta_ids[] = "safe:true";
                 $safe = true;
             } else {
                 $meta_ids[] = "safe:false";
                 $safe = false;
             }
         }
         if (!$is_link) {
             //store inlinks so they can be searched by
             // NOTE(review): $link_rank is only assigned in this branch; for
             // a link-type record the value left over from an earlier
             // iteration (or none at all) would reach the link loop below -
             // confirm that link records never carry LINKS
             $num_links = count($site[self::LINKS]);
             if ($num_links > 0) {
                 $link_rank = false;
                 if ($doc_rank !== false) {
                     $link_rank = max($doc_rank - 1, 1);
                 }
             } else {
                 $link_rank = false;
             }
         }
         $num_queue_servers = count($this->queue_servers);
         // user ranks, when present, are packed and appended to the doc key
         if (isset($site[self::USER_RANKS]) && count($site[self::USER_RANKS]) > 0) {
             $score_keys = "";
             foreach ($site[self::USER_RANKS] as $label => $score) {
                 $score_keys .= packInt($score);
             }
             if (strlen($score_keys) % 8 != 0) {
                 $score_keys .= $keypad;
             }
             $doc_keys .= $score_keys;
         }
         $this->found_sites[self::INVERTED_INDEX][$this->current_server]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank);
         /*
            $this->no_process_links is set when doing things like
            mix recrawls. In this case links likely already will appear
            in what indexing, so don't index again. $site[self::JUST_META]
            is set when have a sitemap or robots.txt (this case set later).
            In this case link  info is not particularly useful for indexing
            and can greatly slow building inverted index.
         */
         if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) {
             foreach ($site[self::LINKS] as $url => $link_text) {
                 /* this mysterious check means won't index links from
                     robots.txt. Sitemap will still be in TO_CRAWL, but that's
                     done elsewhere
                    */
                 if (strlen($url) == 0 || is_numeric($url)) {
                     continue;
                 }
                 $link_host = UrlParser::getHost($url);
                 if (strlen($link_host) == 0) {
                     continue;
                 }
                 // the link is indexed in the shard of the partition its
                 // host maps to
                 $part_num = calculatePartition($link_host, $num_queue_servers);
                 $summary = array();
                 if (!isset($this->found_sites[self::LINK_SEEN_URLS][$part_num])) {
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num] = array();
                 }
                 // external ("e") when the link's host differs from the
                 // page's host, otherwise internal ("i")
                 $elink_flag = $link_host != $host ? true : false;
                 $link_text = strip_tags($link_text);
                 $ref = $elink_flag ? "eref" : "iref";
                 $url = str_replace('|', "%7C", $url);
                 // link id layout: url|<url>|text|<text>|<eref or iref>|<page url>
                 $link_id = "url|" . $url . "|text|" . urlencode($link_text) . "|{$ref}|" . $site_url;
                 $elink_flag_string = $elink_flag ? "e" : "i";
                 $link_keys = crawlHash($url, true) . crawlHash($link_id, true) . $elink_flag_string . substr(crawlHash($host . "/", true), 1);
                 $summary[self::URL] = $link_id;
                 $summary[self::TITLE] = $url;
                 // stripping html to be on the safe side
                 $summary[self::DESCRIPTION] = $link_text;
                 $summary[self::TIMESTAMP] = $site[self::TIMESTAMP];
                 $summary[self::ENCODING] = $site[self::ENCODING];
                 $summary[self::HASH] = $link_id;
                 $summary[self::TYPE] = "link";
                 $summary[self::HTTP_CODE] = $link_keys;
                 $summary[self::LANG] = $lang;
                 $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary;
                 $link_word_lists = PhraseParser::extractPhrasesInLists($link_text, $lang);
                 $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url);
                 if (!isset($this->found_sites[self::INVERTED_INDEX][$part_num])) {
                     $this->found_sites[self::INVERTED_INDEX][$part_num] = new IndexShard("fetcher_shard_{$part_num}");
                 }
                 $this->found_sites[self::INVERTED_INDEX][$part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, $link_meta_ids, PhraseParser::$materialized_metas, false, $link_rank);
             }
         }
         // log documents that took unusually long to invert
         $iterim_elapse = changeInMicrotime($interim_time);
         if ($iterim_elapse > 5) {
             crawlLog("..Inverting " . $site[self::URL] . "...took > 5s.");
         }
         crawlTimeoutLog("..Still building inverted index. Have processed " . "%s of %s documents.\nLast url processed was %s.", $i, $num_seen, $site[self::URL]);
     }
     if ($this->crawl_type == self::ARCHIVE_CRAWL) {
         $this->recrawl_check_scheduler = true;
     }
     crawlLog("  Build mini inverted index time " . changeInMicrotime($start_time));
 }
Пример #26
0
 /**
  * Used to compute all the meta ids for a given link with $url
  * and $link_text that was on a site with $site_url.
  *
  * @param string $url url of the link
  * @param string $link_host url of the host name of the link
  * @param string $link_text text of the anchor tag link came from
  * @param string $site_url url of the page link was on
  *
  * @return array meta id strings for the link; empty when $link_host
  *     is empty
  */
 static function calculateLinkMetas($url, $link_host, $link_text, $site_url)
 {
     global $IMAGE_TYPES;
     $link_meta_ids = array();
     if (strlen($link_host) == 0) {
         // this used to be a "continue", which is a fatal error outside of
         // a loop; a link without a host simply gets no meta ids
         return $link_meta_ids;
     }
     // link text of the form "location:..." marks a location link; the
     // text itself is kept as a meta id
     if (substr($link_text, 0, 9) == "location:") {
         $link_meta_ids[] = $link_text;
         $link_meta_ids[] = "location:all";
         $link_meta_ids[] = "location:" . crawlHash($site_url);
     }
     $link_type = UrlParser::getDocumentType($url);
     $link_meta_ids[] = "media:all";
     $link_meta_ids[] = "safe:all";
     // The former "safe:false" branch for images tested a variable $safe
     // that is never defined in this scope, so it could never run and has
     // been dropped.
     if (in_array($link_type, $IMAGE_TYPES)) {
         $link_meta_ids[] = "media:image";
     } else {
         $link_meta_ids[] = "media:text";
     }
     $link_meta_ids[] = "link:all";
     return $link_meta_ids;
 }
Пример #27
0
/**
 * Rewrites a link according to the pretty URL / course directory name settings
 * in $_config.
 *
 * The URL is returned untouched when neither $force is set nor a course is
 * active in the session, and the current user is an admin or a guest
 * (login/register/browse pages). Otherwise, when $_config['pretty_url'] or
 * $_config['course_dir_name'] is enabled, the URL is passed through
 * convertToPrettyUrl() together with the course identifier that applies.
 * Note: if both settings are turned off, $force has no effect on the URL.
 *
 * @param	string	the URL to rewrite. It should be a relative link; this is not
 *					checked here.
 * @param	boolean	Available values are AT_PRETTY_URL_IS_HEADER, AT_PRETTY_URL_NOT_HEADER(default)
 *			use AT_PRETTY_URL_IS_HEADER if url_rewrite is used in php header('Location:..'),
 *			an absolute path is needed for this, so AT_BASE_HREF is prepended.
 * @param	boolean	true to force the url_rewrite, false otherwise.  False is the default.
 * @return	string	the rewritten URL, or the original one if no rewriting applies.
 * @author	Harris Wong
 */
function url_rewrite($url, $is_rewriting_header = AT_PRETTY_URL_NOT_HEADER, $force = false)
{
    global $_config;
    $url_parser = new UrlParser();
    $pathinfo = $url_parser->getPathArray();
    /* If this is any kind of admins, don't prettify the url
     * $_SESSION['is_guest'] is used to check against login/register/browse page, the links on this page will 
     * only be prettified when a user has logged in.
     * Had used $_SESSION[valid_user] before but it created this problem: 
     * http://www.atutor.ca/atutor/mantis/view.php?id=3426
     */
    $in_course = isset($_SESSION['course_id']) && $_SESSION['course_id'] > 0;
    if (!$force && !$in_course) {
        if (admin_authenticate(AT_ADMIN_PRIV_ADMIN, AT_PRIV_RETURN)
            || (isset($_SESSION['privileges']) && admin_authenticate($_SESSION['privileges'], AT_PRIV_RETURN))
            || (isset($_SESSION['is_guest']) && $_SESSION['is_guest'] == 1)) {
            return $url;
        }
    }
    $pretty_url_on = $_config['pretty_url'] > 0;
    $course_dir_on = $_config['course_dir_name'] > 0;
    if ($pretty_url_on || $course_dir_on) {
        // Always start from a defined value: previously the "course directory
        // name on, pretty url off" path left $course_id undefined when none of
        // the sources below matched.
        $course_id = 0;
        if ($course_dir_on) {
            if (preg_match('/bounce.php\\?course=([\\d]+)$/', $url, $matches) == 1) {
                // bounce has the highest priority, even if session is set, work on
                // bounce first.
                $course_id = $url_parser->getCourseDirName($matches[1]);
            } elseif (isset($_REQUEST['course'])) {
                //jump menu
                $course_id = $url_parser->getCourseDirName($_REQUEST['course']);
            } elseif (isset($_REQUEST['p_course'])) {
                // is set when guests access public course. @see bounce.php
                $course_id = $url_parser->getCourseDirName($_REQUEST['p_course']);
            } elseif ($in_course) {
                $course_id = $url_parser->getCourseDirName($_SESSION['course_id']);
            }
        } elseif (isset($_SESSION['course_id'])) {
            // pretty url without course directory names: use the raw course id
            $course_id = $_SESSION['course_id'];
        }
        $url = $pathinfo[1]->convertToPrettyUrl($course_id, $url);
    }
    //instead of putting AT_BASE_HREF in all the headers location, we will put it here.
    //Abs paths are required for pretty url because otherwise the url location will be appended.
    //ie.	ATutor_161/blogs/CoURSe_rOAd/blogs/view.php/ot/1/oid/1/ instead of
    //		ATutor_161/CoURSe_rOAd/blogs/view.php/ot/1/oid/1/
    if ($is_rewriting_header) {
        return AT_BASE_HREF . $url;
    }
    return $url;
}
Пример #28
0
    /**
     * Takes page summaries for RSS pages and the current query
     * and draws list of news links and a link to the news link subsearch
     * page if applicable.
     *
     * All markup is written straight to the output; nothing is returned.
     * When $subsearch is not 'news' a heading linking to the news subsearch
     * is drawn first and each item is drawn in a compact form; otherwise each
     * item is drawn as a full result with its url and description.
     *
     * NOTE(review): values taken from $feed_pages are output as-is; this
     * assumes they were already made safe for HTML upstream -- confirm.
     *
     * @param array $feed_pages page data from news feeds
     * @param string $base_query the  query_string prefix
     * @param string $query the current search query
     * @param string $subsearch name of subsearch page this image group on
     * @param boolean $open_in_tabs whether new links should be opened in
     *    tabs
     */
    function render($feed_pages, $base_query, $query, $subsearch, $open_in_tabs = false)
    {
        $not_news = ($subsearch != 'news');
        if ($not_news) {
            ?>
            <h2><a href="<?php e("{$base_query}&amp;q={$query}&amp;s=news"); ?>"
                ><?php e(tl('feeds_helper_view_feed_results', $query)); ?></a></h2>
        <?php
        }
        ?>
            <div class="feed-list">
        <?php
        $time = time();
        // attribute added to every item link when results open in new tabs
        $target_attr = $open_in_tabs ? ' target="_blank" ' : '';
        foreach ($feed_pages as $page) {
            $encode_source = urlencode(urlencode($page[self::SOURCE_NAME]));
            $source_query = $base_query . "&amp;q=media:news:" . $encode_source . "&amp;s=news";
            if (isset($page[self::URL])) {
                if (strncmp($page[self::URL], "url|", 4) == 0) {
                    // link record of the form "url|<target>|...": show the target
                    $url_parts = explode("|", $page[self::URL]);
                    $url = $url_parts[1];
                    $title = UrlParser::simplifyUrl($url, 60);
                    $subtitle = "title='" . $page[self::URL] . "'";
                } else {
                    $url = $page[self::URL];
                    $title = isset($page[self::TITLE]) ? $page[self::TITLE] : "";
                    if (strlen(trim($title)) == 0) {
                        $title = UrlParser::simplifyUrl($url, 60);
                    }
                    $subtitle = "";
                }
            } else {
                $url = "";
                $title = isset($page[self::TITLE]) ? $page[self::TITLE] : "";
                $subtitle = "";
            }
            $pub_date = $this->getPubdateString($time, $page[self::SUMMARY_OFFSET][0][4]);
            // closes the source anchor opened in the markup and appends the date
            $source_and_date = $page[self::SOURCE_NAME] . "</a>" . "<span class='gray'> - {$pub_date}</span>";
            // The links below use the normalised $url / $title computed above
            // rather than the raw page fields, which may be missing or be a
            // "url|..." record that is not a usable href.
            if ($not_news) {
                ?>
                <div class="blockquote">
                <a href="<?php e($url); ?>" rel="nofollow" <?php e($target_attr); ?>><?php e($title); ?></a>
                <a class="gray-link" rel='nofollow' href="<?php e($source_query); ?>" ><?php
                e($source_and_date);
                ?>
                </div>
        <?php
            } else {
                ?>
                <div class="results">
                <h2><a href="<?php e($url); ?>" rel="nofollow" <?php e($target_attr); ?>><?php e($title); ?></a>.
                <a class="gray-link" rel='nofollow' href="<?php e($source_query); ?>" ><?php
                e($source_and_date);
                ?></h2>
                <p class="echo-link" <?php e($subtitle); ?>><?php e(UrlParser::simplifyUrl($url, 100) . " "); ?></p>
                <?php
                $description = isset($page[self::DESCRIPTION]) ? $page[self::DESCRIPTION] : "";
                e("<p>{$description}</p>");
                ?>
                </div>
        <?php
            }
        }
        ?>
        </div>
        <?php
    }
Пример #29
0
 /**
  * Guess mime type based on extension of the file
  *
  * The extension is obtained with UrlParser::getDocumentType() and looked up
  * in a fixed table; extensions not in the table are reported as
  * "text/plain".
  *
  * @param string $file_name name of the file
  * @return string $mime_type for the given file name
  */
 static function guessMimeTypeFromFileName($file_name)
 {
     $mime_type_map = array(
         "bmp" => 'image/bmp',
         "doc" => 'application/msword',
         "epub" => 'application/epub+zip',
         "gif" => 'image/gif',
         "asp" => 'text/asp',
         "aspx" => 'text/asp',
         'cgi' => 'text/html',
         "cfm" => 'text/html',
         "cfml" => 'text/html',
         "do" => 'text/html',
         "htm" => 'text/html',
         "html" => 'text/html',
         "jsp" => 'text/html',
         "php" => 'text/html',
         "pl" => 'text/html',
         "java" => 'text/java',
         "py" => 'text/py',
         "shtml" => 'text/html',
         "jpg" => 'image/jpeg',
         "jpeg" => 'image/jpeg',
         "pdf" => 'application/pdf',
         "png" => 'image/png',
         "ppt" => 'application/vnd.ms-powerpoint',
         "pptx" => 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
         "rss" => 'application/rss+xml',
         "rtf" => 'text/rtf',
         "svg" => 'image/svg+xml',
         "csv" => 'text/csv',
         "tab" => 'text/tab-separated-values',
         // was 'text/x-java-source', a copy/paste slip: .tsv files are
         // tab separated values just like .tab
         "tsv" => 'text/tab-separated-values',
         "txt" => 'text/plain',
         "xlsx" => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         // NOTE(review): 'text/gitxml' is not a registered mime type;
         // presumably a project-internal type -- confirm before changing
         "xml" => 'text/gitxml',
         "js" => 'text/plain',
         "c" => 'text/plain',
         "cc" => 'text/plain',
         "cs" => 'text/plain',
     );
     $extension = UrlParser::getDocumentType($file_name);
     return isset($mime_type_map[$extension]) ? $mime_type_map[$extension] : "text/plain";
 }
Пример #30
0
 /**
  * Gets the cached version of a web page from the machine on which it was
  * fetched.
  *
  * Complete cached versions of web pages typically only live on a fetcher
  * machine. The queue server machine typically only maintains summaries.
  * This method makes a REST request of a fetcher machine for a cached page
  * and get the results back.
  *
  * @param string $machine the ip address of domain name of the machine the
  *     cached page lives on
  * @param string $machine_uri the path from document root on $machine where
  *     the yioop scripts live
  * @param int $partition the partition in the WebArchiveBundle the page is
  *      in
  * @param int $offset the offset in bytes into the WebArchive partition in
  *     the WebArchiveBundle at which the cached page lives.
  * @param string $crawl_time the timestamp of the crawl the cache page is
  *     from
  * @param int $instance_num which fetcher instance for the particular
  *     fetcher crawled the page (if more than one), false otherwise
  * @return array page data of the cached page, with the url that was
  *     requested stored under the 'REQUEST' key. If the response could not
  *     be decoded into an array, only the 'REQUEST' key is present
  */
 function getCacheFile($machine, $machine_uri, $partition, $offset, $crawl_time, $instance_num = false)
 {
     $time = time();
     // token sent with the request, derived from the request time and AUTH_KEY
     $session = md5($time . AUTH_KEY);
     if ($machine == '::1') {
         //IPv6 :(
         // an IPv6 literal has to be bracketed inside a url;
         //used if the fetching and queue serving were on the same machine
         $machine = "[::1]";
     }
     // we assume all machines use the same scheme & port of the name server
     $port = UrlParser::getPort(NAME_SERVER);
     $scheme = UrlParser::getScheme(NAME_SERVER);
     $request = "{$scheme}://{$machine}:{$port}{$machine_uri}?c=archive&a=cache&" . "time={$time}&session={$session}&partition={$partition}&offset={$offset}" . "&crawl_time={$crawl_time}";
     if ($instance_num !== false) {
         $request .= "&instance_num={$instance_num}";
     }
     $tmp = FetchUrl::getPage($request);
     // NOTE(review): this unserialize()s data received over the network;
     // that is only safe if the answering machine is trusted.
     $page = @unserialize(base64_decode($tmp));
     // A failed or unexpected response yields false (or some scalar). Writing
     // a key into that relied on implicit false-to-array conversion, which is
     // deprecated since PHP 8.1, so normalise to an array explicitly.
     if (!is_array($page)) {
         $page = array();
     }
     $page['REQUEST'] = $request;
     return $page;
 }