/**
 * Registers a content-type rule: documents whose content-type matches one of
 * the registered rules are the ones that get searched for links.
 *
 * The pattern is validated through PHPCrawlerUtils::checkRegexPattern() and,
 * when accepted, stored (trimmed) in $this->linksearch_content_types.
 *
 * @param string $regex Regular expression defining the rule
 * @return bool The result of the pattern check; truthy means the rule was added
 */
public function addLinkSearchContentType($regex)
{
    $is_valid = PHPCrawlerUtils::checkRegexPattern($regex);

    // Rejected patterns are not stored; hand the check result straight back.
    if (!$is_valid) {
        return $is_valid;
    }

    $this->linksearch_content_types[] = trim($regex);

    return $is_valid;
}
/**
 * Adds a regular expression together with a priority-level to the list of rules that decide what links should be preferred.
 *
 * Links/URLs that match an expression with a high priority-level will be followed before links with a lower level.
 * All links that don't match with any of the given rules will get the level 0 (lowest level) automatically.
 *
 * The level can be any non-negative integer, given either as an int or as a string of digits.
 * Surrounding whitespace is ignored; an empty or non-numeric level is rejected.
 *
 * <b>Example:</b>
 *
 * Telling the crawler to follow links that contain the string "forum" before links that contain ".gif" before all other found links.
 * <code>
 * $crawler->addLinkPriority("/forum/", 10);
 * $crawler->addLinkPriority("/\.gif/", 5);
 * </code>
 *
 * @param string $regex Regular expression defining the rule
 * @param int $level The priority-level
 *
 * @return bool TRUE if a valid preg-pattern and a valid level were given and the rule was successfully added, otherwise FALSE.
 * @section 10 Other settings
 */
public function addLinkPriority($regex, $level)
{
    $pattern_ok = PHPCrawlerUtils::checkRegexPattern($regex);
    if (!$pattern_ok) {
        return false;
    }

    // Arrays, objects and null cannot represent a priority level.
    if (!is_scalar($level)) {
        return false;
    }

    // Normalise before validating so that the stored value is exactly the
    // value that was checked.
    $level = trim((string) $level);

    // At least one digit is required ("+", not "*"), otherwise an empty level
    // would be accepted. \A and \z anchor to the real string boundaries.
    if (preg_match('/\A[0-9]+\z/', $level) !== 1) {
        return false;
    }

    // The level is kept as a digit string, as before.
    $this->link_priority_array[] = array(
        "match" => trim($regex),
        "level" => $level,
    );

    return true;
}
/**
 * Registers basic-authentication credentials (username and password) for
 * URLs matching the given regular expression.
 *
 * The expression is validated through PHPCrawlerUtils::checkRegexPattern();
 * on success the credentials are appended to $this->basic_authentications.
 *
 * @param string $url_regex Regular expression defining the URL(s) the authentication should be sent to.
 * @param string $username  The username
 * @param string $password  The password
 *
 * @return bool TRUE if the expression passed the check and the entry was stored, otherwise FALSE.
 */
public function addBasicAuthentication($url_regex, $username, $password)
{
    // Nothing is stored for an expression that fails the pattern check.
    if (!PHPCrawlerUtils::checkRegexPattern($url_regex)) {
        return false;
    }

    $this->basic_authentications[] = array(
        "url_regex" => $url_regex,
        "username"  => $username,
        "password"  => $password,
    );

    return true;
}
/**
 * Adds a rule to the list of rules that decide which URLs found on a page
 * should be ignored by the crawler.
 *
 * The pattern is validated through PHPCrawlerUtils::checkRegexPattern() and,
 * when accepted, stored (trimmed) in $this->url_filter_rules.
 *
 * @param string $regex Regular expression defining the rule
 * @return bool The result of the pattern check; truthy means the rule was added
 */
public function addURLFilterRule($regex)
{
    $pattern_accepted = PHPCrawlerUtils::checkRegexPattern($regex);

    if ($pattern_accepted) {
        $rule = trim($regex);
        $this->url_filter_rules[] = $rule;
    }

    return $pattern_accepted;
}