/**
 * Register the image referenced by one attribute of an <img> element.
 *
 * Nothing happens when the element lacks the attribute. Otherwise the
 * attribute value is parsed as a Url and stored under the bag's 'images'
 * key (as an array with 'url', 'alt' and 'href') when either:
 *  - the url has no domain (relative url), or
 *  - $domain is not null and the url domain matches "ebayimg"
 *    (case-insensitive).
 *
 * @param \DOMElement $img       Element to read the attribute from
 * @param string      $attribute Name of the attribute holding the image url
 * @param \DOMElement $html      Not used by this method
 * @param Bag         $bag       Receives the image entry
 * @param null|string $domain    Only tested against null: null disables the
 *                               collection of absolute urls; the value itself
 *                               is never compared
 */
private static function addByAttribute($img, $attribute, \DOMElement $html, Bag $bag, $domain = null)
{
    if (!$img->hasAttribute($attribute)) {
        return;
    }

    $imageUrl = new Url($img->getAttribute($attribute));

    // Relative urls skip this block and are always accepted
    if ($imageUrl->getDomain()) {
        // Absolute urls need a domain to be provided and an ebayimg host
        if ($domain === null || !preg_match('~(ebayimg)~i', $imageUrl->getDomain())) {
            return;
        }
    }

    $bag->add('images', [
        'url' => $imageUrl->getUrl(),
        'alt' => self::_getAltTag($img),
        'href' => self::extractA($img, $bag->get('request_url')),
    ]);
}
/**
 * Extract <img> elements.
 *
 * Every collected image is added to the bag's 'images' key as an array with
 * the keys 'url', 'alt' (the alt attribute or an empty string) and 'href'
 * (the result of self::extractA() for the image and the bag's 'request_url').
 *
 * Images whose src has no domain (relative urls) are always collected.
 * Images with an absolute src are collected only when $domain is provided,
 * the src domain is identical to it, and the nearest <a> ancestor neither
 * points to a different domain nor has the "nofollow" rel token.
 * When $domain is null, images with an absolute src are never collected.
 *
 * @param \DOMElement $html
 * @param Bag         $bag
 * @param null|string $domain
 */
protected static function extractImages(\DOMElement $html, Bag $bag, $domain = null)
{
    foreach ($html->getElementsByTagName('img') as $img) {
        if (!$img->hasAttribute('src')) {
            continue;
        }

        $src = new Url($img->getAttribute('src'));

        //Is src absolute? (relative urls are accepted without further checks)
        if ($src->getDomain()) {
            //Avoid external images
            if ($domain === null || $src->getDomain() !== $domain) {
                continue;
            }

            //Avoid images in external or nofollow links: only the nearest <a> ancestor counts
            $parent = $img->parentNode;

            while ($parent && isset($parent->tagName)) {
                if ($parent->tagName === 'a') {
                    if ($parent->hasAttribute('href')) {
                        $href = new Url($parent->getAttribute('href'));

                        // Compare the domain of the link, not the one of the image:
                        // the image domain is already known to be equal to $domain here
                        if ($href->getDomain() && $href->getDomain() !== $domain) {
                            continue 2;
                        }
                    }

                    // rel is a space-separated list of tokens (e.g. "nofollow noopener")
                    if ($parent->hasAttribute('rel')
                        && preg_match('~(^|\s)nofollow(\s|$)~i', (string) $parent->getAttribute('rel'))
                    ) {
                        continue 2;
                    }

                    break;
                }

                $parent = $parent->parentNode;
            }
        }

        $bag->add('images', [
            'url' => $src->getUrl(),
            'alt' => $img->hasAttribute('alt') ? $img->getAttribute('alt') : '',
            'href' => self::extractA($img, $bag->get('request_url')),
        ]);
    }
}
/**
 * Extract <img> elements.
 *
 * The url of every collected image is added to the bag's 'images' key.
 *
 * Images whose src has no domain (relative urls) are always collected.
 * Images with an absolute src are collected only when $domain is provided,
 * the src domain is identical to it, and the nearest <a> ancestor neither
 * points to a different domain nor has the "nofollow" rel token.
 * When $domain is null, images with an absolute src are never collected.
 *
 * @param \DOMElement $html
 * @param Bag         $bag
 * @param null|string $domain
 */
protected static function extractImages(\DOMElement $html, Bag $bag, $domain = null)
{
    foreach ($html->getElementsByTagName('img') as $img) {
        if (!$img->hasAttribute('src')) {
            continue;
        }

        $src = new Url($img->getAttribute('src'));

        //Is src absolute? (relative urls are accepted without further checks)
        if ($src->getDomain()) {
            //Avoid external images
            if ($domain === null || $src->getDomain() !== $domain) {
                continue;
            }

            //Avoid images in external or nofollow links: only the nearest <a> ancestor counts
            $parent = $img->parentNode;

            while ($parent && isset($parent->tagName)) {
                if ($parent->tagName === 'a') {
                    if ($parent->hasAttribute('href')) {
                        $href = new Url($parent->getAttribute('href'));

                        // Compare the domain of the link, not the one of the image:
                        // the image domain is already known to be equal to $domain here
                        if ($href->getDomain() && $href->getDomain() !== $domain) {
                            continue 2;
                        }
                    }

                    // rel is a space-separated list of tokens (e.g. "nofollow noopener")
                    if ($parent->hasAttribute('rel')
                        && preg_match('~(^|\s)nofollow(\s|$)~i', (string) $parent->getAttribute('rel'))
                    ) {
                        continue 2;
                    }

                    break;
                }

                $parent = $parent->parentNode;
            }
        }

        $bag->add('images', $src->getUrl());
    }
}
/**
 * Collect the values of the <meta> elements into the bag.
 *
 * Each item returned by Utils::getMetas() is read as [name, value, element].
 * Items with an empty value are ignored. Named metas are stored under their
 * lowercased name, except "msapplication-tileimage", whose value is added to
 * 'icons'. Metas without a name are stored under the value of their itemprop
 * and/or http-equiv attribute, when present.
 *
 * @param \DOMDocument $html
 * @param Bag          $bag
 */
protected static function extractFromMeta(\DOMDocument $html, Bag $bag)
{
    foreach (Utils::getMetas($html) as $meta) {
        list($name, $value, $element) = $meta;

        if (!$value) {
            continue;
        }

        if ($name) {
            $key = strtolower($name);

            if ($key === 'msapplication-tileimage') {
                $bag->add('icons', $value);
            } else {
                $bag->set($key, $value);
            }

            continue;
        }

        // Unnamed meta: fall back to the attributes that can identify it
        foreach (['itemprop', 'http-equiv'] as $attribute) {
            if ($element->hasAttribute($attribute)) {
                $bag->set($element->getAttribute($attribute), $value);
            }
        }
    }
}