/**
 * Finds the preview image advertised with rel="image_src".
 *
 * The href returned by $document->getLinkHref('image_src') is preferred.
 * If it is empty, the entries of $this->imgElements are scanned for the first
 * one whose 'rel' equals 'image_src' and that actually carries a 'src'.
 *
 * @param HtmlDocument $document Document being parsed.
 * @return string|null Image URL, or null when no suitable image is declared.
 */
protected function getImage(HtmlDocument $document)
{
    $linkHref = $document->getLinkHref('image_src');
    // Cast before measuring: strlen(null) is deprecated since PHP 8.1
    // (getLinkHref() may yield null when the link is absent - not visible here).
    if (strlen((string)$linkHref) > 0) {
        return $linkHref;
    }

    foreach ($this->imgElements as $imgElement) {
        if (!isset($imgElement['rel']) || $imgElement['rel'] != 'image_src') {
            continue;
        }

        // An element marked image_src but lacking src is unusable: skip it
        // instead of reading an undefined index, and keep looking.
        if (isset($imgElement['src'])) {
            return $imgElement['src'];
        }
    }

    return null;
}
/**
 * Runs the parser chain selected for the document's URI against $document.
 *
 * Each class name returned by static::getParserChain() is instantiated when
 * the class exists, and its handle() method is invoked only if the instance
 * is a \Bitrix\Main\UrlPreview\Parser. After every chain entry (whether or
 * not a parser ran) $document->checkMetadata() is consulted; processing stops
 * as soon as it returns a truthy value.
 *
 * @param HtmlDocument $document Document passed to every parser in the chain.
 * @return void
 */
public static function extractMetadata(HtmlDocument $document)
{
    $chain = static::getParserChain($document->getUri());

    foreach ($chain as $candidateClass) {
        /** @var \Bitrix\Main\UrlPreview\Parser|null $parser */
        $parser = class_exists($candidateClass) ? new $candidateClass() : null;

        if ($parser !== null && is_a($parser, '\\Bitrix\\Main\\UrlPreview\\Parser')) {
            $parser->handle($document);
        }

        // Nothing follows the loop, so returning here is the same as breaking out.
        if ($document->checkMetadata()) {
            return;
        }
    }
}
/**
 * Searches the document's HTML for an oEmbed discovery <link> element.
 *
 * Every <link ...alternate...> tag is inspected; the first one that contains
 * either OEMBED_TYPE_JSON or OEMBED_TYPE_XML and has a quoted href attribute
 * wins. On success $this->metadataType is set to 'json' or 'xml' and
 * $this->metadataUrl to the raw href value exactly as written in the markup.
 *
 * @param HtmlDocument $document Document whose HTML is scanned.
 * @return bool True when a discovery link was found, false otherwise.
 */
protected function detectOembedLink(HtmlDocument $document)
{
    // [^>] keeps every match inside a single tag. With "." the lazy quantifier
    // could start at an earlier <link> on the same line and swallow it, so the
    // href of that unrelated tag was picked up. [^>] also matches newlines,
    // which lets tags that are wrapped over several lines be found.
    preg_match_all('/<link[^>]+?alternate[^>]+?>/', $document->getHtml(), $linkElements);

    foreach ($linkElements[0] as $linkElement) {
        $typeJson = strpos($linkElement, $this::OEMBED_TYPE_JSON) !== false;
        $typeXml = strpos($linkElement, $this::OEMBED_TYPE_XML) !== false;
        if (!$typeJson && !$typeXml) {
            continue;
        }

        // The closing quote must be the same character as the opening one (\1),
        // so an apostrophe inside a double-quoted URL no longer truncates it.
        if (preg_match('/href=([\'"])(.+?)\1/', $linkElement, $attributes)) {
            $this->metadataType = $typeJson ? 'json' : 'xml';
            $this->metadataUrl = $attributes[2];

            return true;
        }
    }

    return false;
}
/**
 * Builds $this->dom from the document's HTML.
 *
 * The HTML is prefixed with an XML encoding declaration built from
 * $document->getEncoding() before being handed to DOMDocument::loadHTML().
 *
 * Side effect: libxml internal error handling is switched on and is left on
 * after the call.
 *
 * @param HtmlDocument $document Document providing the HTML and its encoding.
 * @return bool False when the DOM extension is unavailable or loading failed,
 *              otherwise the result of DOMDocument::loadHTML().
 */
protected function initializeDom(HtmlDocument $document)
{
    if (!class_exists('DOMDocument')) {
        return false;
    }

    $this->dom = new \DOMDocument();

    // Prevents parsing errors bubbling
    libxml_use_internal_errors(true);

    $result = $this->dom->loadHTML(
        '<?xml encoding="' . $document->getEncoding() . '">' . $document->getHtml()
    );

    // With internal error handling on, libxml keeps every parse error in a
    // buffer. Nothing here reads them, so drop them to stop the buffer from
    // growing with each parsed document.
    libxml_clear_errors();

    return $result;
}
/**
 * Downloads the given URL and extracts preview metadata from the response.
 *
 * A GET request is issued with both client timeouts set to 5. Anything other
 * than a successful query with status 200 yields false. Responses whose
 * lower-cased content type is not exactly 'text/html' are delegated to
 * static::getFileMetadata(). HTML responses are run through
 * ParserChain::extractMetadata(); the resulting metadata is returned only if
 * it is an array accepted by static::validateRemoteMetadata(). When image
 * saving is enabled, the 'IMAGE' entry is replaced by 'IMAGE_ID' holding the
 * value returned by static::saveImage().
 *
 * @param Uri $uri Absolute URL to get metadata for.
 * @return array|false Metadata array, or false on request/validation failure.
 */
protected static function getRemoteUrlMetadata(Uri $uri)
{
    $client = new HttpClient();
    $client->setTimeout(5);
    $client->setStreamTimeout(5);

    // Short-circuit keeps the original order: the status is read only after
    // the query itself has succeeded.
    $requestSucceeded = $client->query('GET', $uri->getUri()) && $client->getStatus() === 200;
    if (!$requestSucceeded) {
        return false;
    }

    $contentType = strtolower($client->getHeaders()->getContentType());
    if ($contentType !== 'text/html') {
        return static::getFileMetadata($client->getEffectiveUrl(), $client->getHeaders());
    }

    $page = new HtmlDocument($client->getResult(), $uri);
    $page->setEncoding($client->getCharset());
    ParserChain::extractMetadata($page);

    $metadata = $page->getMetadata();
    if (!is_array($metadata) || !static::validateRemoteMetadata($metadata)) {
        return false;
    }

    if (isset($metadata['IMAGE']) && static::getOptionSaveImages()) {
        $metadata['IMAGE_ID'] = static::saveImage($metadata['IMAGE']);
        unset($metadata['IMAGE']);
    }

    return $metadata;
}
/**
 * Fills in missing document fields from OpenGraph meta tags.
 *
 * Title, description, image and the SITE_NAME extra field are each set only
 * when the document does not already have a value and the corresponding
 * og:* meta content is non-empty. For the image, og:image:secure_url is
 * preferred over og:image. The FAVICON extra field is filled from the
 * document's "icon" link when not already present.
 *
 * @param HtmlDocument $document HTML document to be parsed.
 * @return void
 */
public function handle(HtmlDocument $document)
{
    $titleMissing = strlen($document->getTitle()) == 0;
    if ($titleMissing) {
        $title = $document->getMetaContent('og:title');
        if (strlen($title) > 0) {
            $document->setTitle($title);
        }
    }

    $descriptionMissing = strlen($document->getDescription()) == 0;
    if ($descriptionMissing) {
        $description = $document->getMetaContent('og:description');
        if (strlen($description) > 0) {
            $document->setDescription($description);
        }
    }

    $imageMissing = strlen($document->getImage()) == 0;
    if ($imageMissing) {
        // Fall back to the plain og:image only when no secure URL is given.
        $image = $document->getMetaContent('og:image:secure_url');
        if (!$image) {
            $image = $document->getMetaContent('og:image');
        }
        if (strlen($image) > 0) {
            $document->setImage($image);
        }
    }

    $siteNameKnown = (bool)$document->getExtraField('SITE_NAME');
    if (!$siteNameKnown) {
        $siteName = $document->getMetaContent('og:site_name');
        if (strlen($siteName) > 0) {
            $document->setExtraField('SITE_NAME', $siteName);
        }
    }

    // The favicon is not an OpenGraph property; it is picked up here so the
    // full parser chain does not have to run just to obtain it.
    $faviconKnown = (bool)$document->getExtraField('FAVICON');
    if (!$faviconKnown) {
        $favicon = $document->getLinkHref('icon');
        if ($favicon) {
            $document->setExtraField('FAVICON', $favicon);
        }
    }
}