PHP PHPCrawl\Utils PHPCrawlerUtils::getMetaTagAttributes Beispiele

Programmiersprache: PHP

Namespace / Paketname: PHPCrawl\Utils

Klasse / Typ: PHPCrawlerUtils

Methode / Funktion: getMetaTagAttributes

Beispiele auf hotexamples.com: 1

PHP PHPCrawl\Utils PHPCrawlerUtils::getMetaTagAttributes - 1 Beispiele gefunden. Dies sind die am besten bewerteten PHP Beispiele für die PHPCrawl\Utils\PHPCrawlerUtils::getMetaTagAttributes, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

splitURL(9)

checkRegexPattern(4)

normalizeURL(2)

getHeaderValue(2)

getSystemTempDir(1)

sort2dArray(1)

serializeToFile(1)

rmDir(1)

isValidUrlString(1)

getURIContent(1)

buildURLFromLink(1)

getRootUrl(1)

buildURLFromParts(1)

getMetaTagAttributes(1)

getHTTPStatusCode(1)

getCookiesFromHeader(1)

getBaseUrlFromMetaTag(1)

deserializeFromFile(1)

checkStringAgainstRegexArray(1)

getRedirectURLFromHeader(1)

Beispiel #1

Datei anzeigen

Datei: PHPCrawlerLinkFinder.php Projekt: dawid-z/phpcrawl

 /**
  * Searches for links in the given HTML-chunk and adds found links the the internal link-cache.
  */
 public function findLinksInHTMLChunk(&$html_source)
 {
     PHPCrawlerBenchmark::start("searching_for_links_in_page");
     // Check for meta-base-URL and meta-tags in top of HTML-source
     if ($this->top_lines_processed == false) {
         $meta_base_url = PHPCrawlerUtils::getBaseUrlFromMetaTag($html_source);
         if ($meta_base_url != null) {
             $base_url = PHPCrawlerUtils::buildURLFromLink($meta_base_url, $this->baseUrlParts);
             $this->baseUrlParts = PHPCrawlerUrlPartsDescriptor::fromURL($base_url);
         }
         // Get all meta-tags
         $this->meta_attributes = PHPCrawlerUtils::getMetaTagAttributes($html_source);
         // Set flag that top-lines of source were processed
         $this->top_lines_processed = true;
     }
     // Prepare HTML-chunk
     $this->prepareHTMLChunk($html_source);
     // Build the RegEx-part for html-tags to search links in
     $tag_regex_part = "";
     $cnt = count($this->extract_tags);
     for ($x = 0; $x < $cnt; $x++) {
         $tag_regex_part .= "|" . $this->extract_tags[$x];
     }
     $tag_regex_part = substr($tag_regex_part, 1);
     // 1. <a href="...">LINKTEXT</a> (well formed link with </a> at the end and quotes around the link)
     // Get the link AND the linktext from these tags
     // This has to be done FIRST !!
     preg_match_all("#<\\s*a\\s[^<>]*(?<=\\s)(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*>" . "((?:(?!<\\s*\\/a\\s*>).){0,500})" . "<\\s*\\/a\\s*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = $matches[2][$x];
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Second regex (everything that could be a link inside of <>-tags)
     preg_match_all("#<[^<>]*\\s(?:" . $tag_regex_part . ")\\s*=\\s*" . "(?|\"([^\"]+)\"|'([^']+)'|([^\\s><'\"]+))[^<>]*># is", $html_source, $matches);
     $cnt = count($matches[0]);
     for ($x = 0; $x < $cnt; $x++) {
         $link_raw = trim($matches[1][$x]);
         $linktext = "";
         $linkcode = trim($matches[0][$x]);
         if (!empty($link_raw)) {
             $this->addLinkToCache($link_raw, $linkcode, $linktext);
         }
     }
     // Now, if agressive_mode is set to true, we look for some
     // other things
     $pregs = array();
     if ($this->aggressive_search == true) {
         // Links like "...:url("animage.gif")..."
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*\\(\\s*([\"|']{0,1})([^\"'\\) ]{1,500})['\"\\)]/ is";
         // Everything like "...href="bla.html"..." with qoutes
         $pregs[] = "/[\\s\\.:;\"'](?:" . $tag_regex_part . ")\\s*=\\s*([\"|'])(.{0,500}?)\\1/ is";
         // Everything like "...href=bla.html..." without qoutes
         $pregs[] = "/[\\s\\.:;](?:" . $tag_regex_part . ")\\s*(=)\\s*([^\\s\">']{1,500})/ is";
         for ($x = 0; $x < count($pregs); $x++) {
             unset($matches);
             preg_match_all($pregs[$x], $html_source, $matches);
             $cnt = count($matches[0]);
             for ($y = 0; $y < $cnt; $y++) {
                 $link_raw = trim($matches[2][$y]);
                 $linkcode = trim($matches[0][$y]);
                 $linktext = "";
                 $this->addLinkToCache($link_raw, $linkcode, $linktext);
             }
         }
     }
     $this->found_links_map = array();
     PHPCrawlerBenchmark::stop("searching_for_links_in_page");
 }