/**
 * Registers an additional content-type rule that decides in which kinds of
 * documents the crawler looks for links.
 *
 * By default the crawler ONLY searches for links in documents of type "text/html".
 * Call this method once for every further content-type that should be searched.
 *
 * Example:
 * <code>
 * $crawler->addLinkSearchContentType("#text/css# i");
 * $crawler->addLinkSearchContentType("#text/xml# i");
 * </code>
 * With these rules the crawler searches for links in HTML-, CSS- and XML-documents.
 *
 * <b>Please note:</b> It is NOT recommended to let the crawler check for links in
 * EVERY document-type! This could slow down the crawling-process dramatically
 * (e.g. if the crawler receives large binary-files like images and tries to find
 * links in them).
 *
 * @param string $regex Regular-expression defining the rule
 * @return bool TRUE if the rule was successfully added (this is the result of
 *              PHPCrawlerUtils::checkExpressionPattern(), handed back unchanged)
 */
function addLinkSearchContentType($regex)
{
  $this->initCrawler();

  // Validate first: an expression that fails the pattern check is never stored.
  $pattern_ok = PHPCrawlerUtils::checkExpressionPattern($regex);

  if ($pattern_ok)
  {
    // The rule is stored with surrounding whitespace stripped.
    $this->pageRequest->linksearch_content_types[] = trim($regex);
  }

  return $pattern_ok;
}
/**
 * Registers a set of basic-authentication credentials together with the
 * regular expression they belong to.
 *
 * The expression is validated with PHPCrawlerUtils::checkExpressionPattern()
 * before anything is stored. On success one record with the keys "match",
 * "username" and "password" is appended to the page-request's list of
 * basic authentications.
 *
 * NOTE(review): presumably the request code later tests URLs against "match"
 * to decide when to send these credentials - confirm against the page-request
 * implementation.
 *
 * @param string $expression Regular-expression the credentials are registered for
 * @param string $username   Username to store for that expression
 * @param string $password   Password to store for that expression
 * @return bool TRUE if the credentials were stored, FALSE if the expression
 *              did not pass the pattern check
 */
function addBasicAuthentication($expression, $username, $password)
{
  $this->initCrawler();

  // Reject invalid expressions before touching the stored list.
  if (!PHPCrawlerUtils::checkExpressionPattern($expression))
  {
    return false;
  }

  // Append the complete record in one step. Using [] instead of a
  // count()-derived index guarantees a fresh key, so an existing entry can
  // never be overwritten when the list is not densely keyed from 0, and it
  // does not call count() on a list that has not been initialised yet.
  $this->pageRequest->basic_authentications[] = array(
    "match"    => $expression,
    "username" => $username,
    "password" => $password
  );

  return true;
}