libxml errors are suppressed while the content is parsed.
If you want to inspect parsing errors, enable
internal error handling via libxml_use_internal_errors(true)
and then retrieve the errors via libxml_get_errors(). Be
sure to clear the errors with libxml_clear_errors() afterward.
public addHtmlContent ( string $content, string $charset = 'UTF-8' )
| Parameter | Type | Description |
|---|---|---|
| $content | string | The HTML content |
| $charset | string | The charset |
/**
 * Process the DOM: collect every product of the listing and sum the unit prices.
 *
 * For each `ul.productLister > li` node the title, link and unit price are
 * read, the linked description page is fetched, and one entry is stored in
 * the item list. The results are also kept on $this->items / $this->total.
 *
 * @return array Array with the keys 'items' and 'total'; 'total' is the sum of
 *               the unit prices as formatted by number_format($total, 2)
 * @throws Exception
 */
public function process()
{
    // Check if HTML content is already set
    $this->checkIfContentIsEmpty($this->html);

    $items = [];
    $total = 0;

    // A closure declared inside a method is bound to $this automatically.
    // The previous explicit bindTo() call discarded the closure it returned
    // and therefore had no effect, so it has been removed.
    $prepareItems = function (Crawler $nodeCrawler, $i) use (&$items, &$total) {
        // Query the title anchor once and reuse it for both href and text.
        $title = $nodeCrawler->filter('h3 > a');
        $link = $title->attr('href');
        $price = $nodeCrawler->filter('p.pricePerUnit')->text();
        $descriptionPage = $this->fetch($link);

        // prepare items array
        $items[$i]['title'] = trim($title->text());
        $items[$i]['size'] = $this->sizeOf($descriptionPage);
        $items[$i]['unit_price'] = $this->format($price);
        $items[$i]['description'] = $this->getDescriptionFor($descriptionPage);

        $total += $items[$i]['unit_price'];
    };

    $this->domCrawler->addHtmlContent($this->html);
    $this->domCrawler->filter('ul.productLister > li')->each($prepareItems);

    $this->items = $items;
    $this->total = number_format($total, 2);

    return ['items' => $this->items, 'total' => $this->total];
}
/**
 * Build the fixtures used by the tests: a selector provider, a crawler
 * loaded with the HTML returned by getValidHtml(), and two CSS selectors.
 */
public function setUp()
{
    $this->selectorProvider = new SelectorProvider();

    $this->crawler = new Crawler();
    $validHtml = $this->getValidHtml();
    $this->crawler->addHtmlContent($validHtml);

    // Selector used for the list items, and one using a class named "not-existing".
    $this->itemsCssSelector = '.list-group .list-group-item';
    $this->noItemsCssSelector = '.not-existing-class';
}
/**
 * Create the Css selector fixtures (list, non-existing class, empty string)
 * and a crawler loaded with the HTML returned by getValidHtml().
 */
public function setUp()
{
    $this->listSelector = new Css(".list-group .list-group-item");
    $this->emptyResultSelector = new Css(".non-existing");
    $this->emptySelector = new Css("");

    $this->crawler = new Crawler();
    $validHtml = $this->getValidHtml();
    $this->crawler->addHtmlContent($validHtml);
}
/**
 * Nodes added from HTML strings can be filtered, and the test expects a
 * <base href> to be used when a relative link is resolved.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent
 */
public function testAddHtmlContent()
{
    $crawler = new Crawler();

    // First document: a single div with a class attribute.
    $crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');
    $divClass = $crawler->filter('div')->attr('class');
    $this->assertEquals('foo', $divClass, '->addHtmlContent() adds nodes from an HTML string');

    // Second document: a base element plus a relative link.
    $crawler->addHtmlContent('<html><head><base href="http://symfony.com"></head><a href="/contact"></a></html>', 'UTF-8');

    $baseHref = $crawler->filter('base')->attr('href');
    $this->assertEquals('http://symfony.com', $baseHref, '->addHtmlContent() adds nodes from an HTML string');

    $linkUri = $crawler->filter('a')->link()->getUri();
    $this->assertEquals('http://symfony.com/contact', $linkUri, '->addHtmlContent() adds nodes from an HTML string');
}
/**
 * Set up the regular-expression patterns, the selector provider, the crawler
 * (loaded with the HTML returned by getValidHtml()) and the CSS selectors
 * used by the tests.
 */
public function setUp()
{
    // Patterns with a named capture group; the last one is deliberately malformed.
    $this->validCorrectPattern = '/user-(?P<value>\\d+)/';
    $this->validNoMatchPattern = '/NO-MATCH_STRING-(?P<value>\\d+)/';
    $this->validPatternWrongParam = '/NO-MATCH_STRING-(?P<wrong>\\d+)/';
    $this->invalidPattern = '/$%#$>\\d+)))/';

    $this->selectorProvider = new SelectorProvider();

    $this->crawler = new Crawler();
    $validHtml = $this->getValidHtml();
    $this->crawler->addHtmlContent($validHtml);

    // will select first
    $this->itemCssSelector = '.list-group .list-group-item';
    $this->noItemsCssSelector = '.not-existing-class';
}
/**
 * Build a WatchLink for a URL: fetch the page, load it into the (cleared)
 * crawler, copy the extracted title, description and image onto the link,
 * and attach one tag entity per given tag.
 *
 * @param string $url
 * @param array  $tags
 *
 * @return WatchLink
 */
public function extract(string $url, array $tags) : WatchLink
{
    $link = new WatchLink();
    $link->setUrl($url);

    // Start from an empty crawler before loading the fetched page.
    $this->crawler->clear();
    $pageHtml = $this->fetcher->fetch($url);
    $this->crawler->addHtmlContent($pageHtml);

    $link->setName($this->extractTitle());
    $link->setDescription($this->extractDescription());
    $link->setImage($this->extractImage());

    foreach ($tags as $tagName) {
        $tagEntity = $this->tagRepository->findOrCreate($tagName);
        $link->addTag($tagEntity);
    }

    return $link;
}
/**
 * Filter the price present on each countries price page, and return the price.
 *
 * The price is read from a fixed XPath position in the page's table layout.
 *
 * @param string $content HTML of the price page
 *
 * @return string The trimmed cell text, or an empty string when the XPath
 *                matches no node
 */
public function filterPrice($content)
{
    $crawler = new Crawler();
    $crawler->addHtmlContent($content);

    // extract() takes a single list of attributes; the former second
    // argument ('td') was never part of its signature and has been dropped.
    $prices = $crawler
        ->filterXPath("html/body/div[1]/div[3]/div/div/div[3]/div[4]/div/table/tr[1]/td[2]")
        ->extract(array('_text'));

    // No matching cell: return an empty string instead of reading a missing offset.
    if (!isset($prices[0])) {
        return '';
    }

    return trim($prices[0]);
}
/**
 * Scrape a category page and return its products as JSON.
 *
 * Every product node of the category page is visited; its product page is
 * loaded, a Product entity is filled from both pages and added to a
 * collection. Nodes that raise an \InvalidArgumentException are skipped.
 *
 * @param string $category_page_url URL of the category page
 * @param bool   $pretty_print_json Pass true to encode with JSON_PRETTY_PRINT
 *
 * @return string JSON with the keys 'results' and 'total'
 */
public function transform($category_page_url, $pretty_print_json = false)
{
    /** loads the initial category page into a Crawler */
    $category_crawler = new Crawler();
    $category_html = $this->page_manager->getPage($category_page_url);
    $category_crawler->addHtmlContent($category_html, 'ISO-8859-1');

    $category_page = new CategoryPage($category_crawler);
    $product_collection = new ProductCollection();

    /** loops through all the products on the category page */
    /** @todo handle cases where HTML structure throws out the crawler more elegantly */
    $collect_product = function (Crawler $category_page_product_node, $index) use ($product_collection) {
        try {
            $product_node = new CategoryPageProductNode($category_page_product_node);
            $product_url = $product_node->getProductHref();

            /** loads the product page */
            $product_crawler = new Crawler();
            $product_crawler->addHtmlContent($this->page_manager->getPage($product_url), 'ISO-8859-1');
            $product_page = new ProductPage($product_crawler);

            /** gets the content from either the product or category page and saves it in the product entity */
            $product = new Product();
            $product
                ->setTitle($product_node->getTitle())
                ->setDescription($product_page->getDescription())
                ->setUnitPrice($product_node->getUnitPrice())
                ->setSize($this->page_manager->getSizeOfPage($product_url));

            $product_collection->addProduct($product);
        } catch (\InvalidArgumentException $ex) {
            // Skipped on purpose: this node is left out of the collection.
        }
    };
    $category_page->getProducts()->each($collect_product);

    /** Combines the results with the total of all the unit prices */
    $payload = [
        'results' => $product_collection->toArray(),
        'total' => $product_collection->getSumOfUnitPrices() / 100,
    ];

    if ($pretty_print_json) {
        $json_flags = JSON_PRETTY_PRINT;
    } else {
        $json_flags = 0;
    }

    return json_encode($payload, $json_flags);
}
/**
 * Simulate the worldjournal ajax call to fetch content data.
 *
 * Posts a fixed query to the classified endpoint, loads the returned HTML
 * into a crawler and prints the href of every `.catDesc a` link.
 */
public function actionTrypostdata()
{
    $hostname = 'www.wjlife.com';
    $baseUrl = "http://" . $hostname;

    $optionVaules = [
        "relation" => "AND",
        "0" => [
            "relation" => "AND",
            "0" => ["key" => "wj_order_id"],
        ],
    ];

    // all help wanted
    $currentURL = "/cls_category/03-ny-help-wanted/";
    // temp page number
    $pno = 0;

    $queryObject = [
        "keyword" => "",
        "pagesize" => 40,
        "pno" => $pno,
        "optionVaules" => $optionVaules,
        "currentURL" => $baseUrl . $currentURL,
        "currentCatId" => 327,
        "currentStateId" => 152,
    ];

    // language: chinese simplified
    $wjlang = "zh-cn";
    $requestUrl = $baseUrl . "/wp-content/themes/wjlife/includes/classified-core.php?regions=state_ny&variant=" . $wjlang . "&t=" . time();

    $serverParameters = [
        'HTTP_X-Requested-With' => 'XMLHttpRequest',
        'contentType' => 'application/x-www-form-urlencoded;charset=utf-8',
    ];

    $client = new Client();
    $responseCrawler = $client->request("POST", $requestUrl, $queryObject, [], $serverParameters);

    // if you want to echo out with correct encoding, do `echo utf8_decode($rowHtml)`
    $rowHtml = $responseCrawler->html();

    $subCrawler = new Crawler();
    $subCrawler->addHtmlContent($rowHtml);

    $linkArray = $subCrawler->filter(".catDesc a")->each(function ($node, $index) {
        return $node->attr('href');
    });

    print_r($linkArray);
}
/**
 * Create the subject under test from the category-page product node fixture,
 * loaded into a crawler as ISO-8859-1.
 */
public function setUp()
{
    $fixturePath = __DIR__ . '/../Fixtures/category-page-product-node.html';

    $crawler = new Crawler();
    $crawler->addHtmlContent(file_get_contents($fixturePath), 'ISO-8859-1');

    $this->SUT = new SUT($crawler);
}
/**
 * Replace all src of img.inline-image with an embedded image.
 *
 * Every distinct image is embedded once; both the single-quoted and the
 * double-quoted `src=...` spellings found in the body are then replaced by
 * the value returned from embed().
 *
 * @param Swift_Message $message
 */
protected function inlineImages(Swift_Message $message)
{
    $body = $message->getBody();

    $crawler = new Crawler();
    $crawler->addHtmlContent($body);

    $embedded = array();      // resolved path => value returned by embed()
    $replacements = array();  // original src attribute text => replacement

    $inlineImages = $crawler->filterXPath("//img[contains(concat(' ',normalize-space(@class), ' '), ' inline-image ')]");
    foreach ($inlineImages as $element) {
        $src = $element->getAttribute('src');
        $doubleQuoted = 'src="' . $src . '"';
        $singleQuoted = 'src=\'' . $src . '\'';

        // This src has already been handled.
        if (isset($replacements[$doubleQuoted])) {
            continue;
        }

        // if starting with one slash, use local file
        $path = $src;
        if (preg_match('#^/[^/]#', $path)) {
            $path = $this->web_directory . parse_url($src, PHP_URL_PATH);
        }

        if (!isset($embedded[$path])) {
            $embedded[$path] = $message->embed(Swift_Image::fromPath($path));
        }

        $newAttribute = 'src="' . $embedded[$path] . '"';
        $replacements[$singleQuoted] = $newAttribute;
        $replacements[$doubleQuoted] = $newAttribute;
    }

    if (!count($replacements)) {
        return;
    }

    $body = str_replace(array_keys($replacements), array_values($replacements), $body);
    $message->setBody($body);
}
/**
 * Query a laundry place and store its state in the given array.
 *
 * Requests `/LaundryState` from $laundryPlace['url'], rewrites the image
 * sources of the returned page to absolute URLs, and fills the keys 'html',
 * 'busy' and 'available'. On any \Exception all three keys are set to
 * self::NETWORK_ERROR.
 *
 * @param array $laundryPlace Passed by reference; must contain 'url'
 */
public function setLaundryState(&$laundryPlace)
{
    $user = '******';
    $pass = '******';

    // Buffer libxml parse errors while the page is parsed, and remember the
    // previous setting so it can be restored on every path (the flag used to
    // stay enabled when an exception was thrown, and was otherwise forced to
    // false regardless of its previous value).
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    try {
        $client = new Client($laundryPlace['url']);
        $request = $client->get('/LaundryState', [], [
            'auth' => [$user, $pass, 'Digest'],
            'timeout' => 1.5,
            'connect_timeout' => 1.5,
        ]);
        $response = $request->send();
        $body = $response->getBody();

        $crawler = new Crawler();
        $crawler->addContent($body);

        // Prefix every image source with the fixed host.
        foreach ($crawler->filter('img') as $img) {
            $resource = $img->getAttribute('src');
            $img->setAttribute('src', 'http://129.241.126.11/' . trim($resource, '/'));
        }

        // NOTE(review): this looks like placeholder content left from
        // development (a stylesheet link was commented out here) - confirm.
        $crawler->addHtmlContent('<h1>foobar</h1>');
        $laundryPlace['html'] = $crawler->html();

        // Machines are counted by their cell colour in the raw response.
        preg_match_all('/bgColor=Green/', $body, $greenMatches);
        preg_match_all('/bgColor=Red/', $body, $redMatches);
        $laundryPlace['busy'] = count($redMatches[0]);
        $laundryPlace['available'] = count($greenMatches[0]);
    } catch (\Exception $e) {
        $laundryPlace['available'] = self::NETWORK_ERROR;
        $laundryPlace['busy'] = self::NETWORK_ERROR;
        $laundryPlace['html'] = self::NETWORK_ERROR;
    }

    // Drop the buffered parse errors and restore the caller's libxml setting.
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
}
/**
 * A node added from an HTML string can be found with filter().
 *
 * @covers Symfony\Component\DomCrawler\Crawler::addHtmlContent
 */
public function testAddHtmlContent()
{
    $crawler = new Crawler();
    $crawler->addHtmlContent('<html><div class="foo"></html>', 'UTF-8');

    $divClass = $crawler->filter('div')->attr('class');

    $this->assertEquals('foo', $divClass, '->addHtmlContent() adds nodes from an HTML string');
}
/**
 * The subject built from the product-page fixture reports "Apricots" as its
 * description.
 */
public function test_it_extracts_description()
{
    $fixturePath = __DIR__ . '/../Fixtures/product-page.html';

    $crawler = new Crawler();
    $crawler->addHtmlContent(file_get_contents($fixturePath), 'ISO-8859-1');

    $subject = new SUT($crawler);
    $description = $subject->getDescription();

    $this->assertEquals("Apricots", $description);
}
/**
 * Scraps og:title off the page content.
 *
 * @param string $url
 * @return string The content of the first `og:title` meta tag in <head>,
 *                'Unable to parse' when no such tag is found, or the
 *                exception message when the request or parsing throws
 */
public function scrap($url)
{
    $title = 'Unable to parse';

    $this->request->setMethod(HTTP_METH_GET);
    $this->request->setUrl($url);

    try {
        $response = $this->request->send();

        // The crawler is a shared member: without clearing it, nodes from
        // earlier calls stay in the list and getNode(0) could return the
        // title of a previously scraped page.
        $this->crawler->clear();
        $this->crawler->addHtmlContent($response->getBody());

        $subCrawler = $this->crawler->filterXPath('//head/meta[@property="og:title"]');
        $meta = $subCrawler->getNode(0);
        if ($meta) {
            $title = $meta->getAttribute('content');
        }
    } catch (Exception $e) {
        $title = $e->getMessage();
    }

    return $title;
}
/**
 * Download the source URL and hand the parsed page to the media-specific
 * list getter.
 *
 * The getter class name is built from $this->source->media_parent, for
 * example NowLebanonListGetter, which implements now lebanon's way of
 * getting lists.
 *
 * @return mixed Whatever the getter's getList() returns
 */
public function getListFromMedia()
{
    // Best-effort download: warnings stay suppressed as before, but a failed
    // read (false) is turned into an empty document explicitly instead of
    // passing a boolean to the crawler.
    $htmlContent = @file_get_contents($this->source->url);
    if (false === $htmlContent) {
        $htmlContent = '';
    }

    $crawler = new Crawler();
    $crawler->addHtmlContent($htmlContent);

    $getterClassName = 'App\\PostCrawlers\\PostLists\\' . $this->source->media_parent . 'ListGetter';
    $getter = new $getterClassName($this->source->url, $crawler);

    return $getter->getList();
}
/**
 * Download $this->url and hand the parsed page to the media-specific
 * details getter, whose class name is built from $this->source->media_parent.
 *
 * @return mixed Whatever the getter's getDetails() returns
 */
public function getDetailsFromMedia()
{
    // Best-effort download: warnings stay suppressed as before, but a failed
    // read (false) is turned into an empty document explicitly instead of
    // passing a boolean to the crawler.
    $htmlContent = @file_get_contents($this->url);
    if (false === $htmlContent) {
        $htmlContent = '';
    }

    $crawler = new Crawler();
    $crawler->addHtmlContent($htmlContent);

    $getterClassName = 'App\\PostCrawlers\\PostDetails\\' . $this->source->media_parent . 'DetailsGetter';
    $getter = new $getterClassName($this->url, $crawler);

    return $getter->getDetails();
}
/**
 * Assert how many nodes match a selector and, optionally, a content filter.
 *
 * assertSelectEquals("#binder .name", "Chuck", true, $xml);  // any?
 * assertSelectEquals("#binder .name", "Chuck", false, $xml); // none?
 *
 * When $content is a string the matched nodes are narrowed down: '' keeps
 * nodes with empty text, "regexp: <pattern>" keeps nodes whose text matches
 * the pattern, any other string keeps nodes whose text contains it.
 *
 * @param string                $selector Selector passed to Crawler::filter()
 * @param string                $content
 * @param integer|boolean|array $count    Exact number, true/false for any/none,
 *                                        or an array keyed by '>', '>=', '<', '<='
 * @param mixed                 $actual   DOMDocument, or an HTML/XML string
 * @param string                $message
 * @param boolean               $isHtml   Parse a string $actual as HTML (true) or XML (false)
 * @since Method available since Release 1.0.0
 *
 * @throws PHPUnit_Framework_Exception When $count has none of the supported forms
 */
public static function assertSelectEquals($selector, $content, $count, $actual, $message = '', $isHtml = true)
{
    $document = new Crawler();
    if ($actual instanceof DOMDocument) {
        $document->addDocument($actual);
    } elseif ($isHtml) {
        $document->addHtmlContent($actual);
    } else {
        $document->addXmlContent($actual);
    }

    $matched = $document->filter($selector);

    if (is_string($content)) {
        $matched = $matched->reduce(function (Crawler $node, $i) use ($content) {
            $text = $node->text();

            if ($content === '') {
                return $text === '';
            }

            if (preg_match('/^regexp\\s*:\\s*(.*)/i', $content, $matches)) {
                return (bool) preg_match($matches[1], $text);
            }

            return strstr($text, $content) !== false;
        });
    }

    $found = count($matched);

    // Exact count.
    if (is_numeric($count)) {
        self::assertEquals($count, $found, $message);

        return;
    }

    // Any / none.
    if (is_bool($count)) {
        $anyFound = $found > 0;
        if ($count) {
            self::assertTrue($anyFound, $message);
        } else {
            self::assertFalse($anyFound, $message);
        }

        return;
    }

    // Range: at least one comparison key must be present.
    $isRange = is_array($count)
        && (isset($count['>']) || isset($count['<']) || isset($count['>=']) || isset($count['<=']));
    if (!$isRange) {
        throw new PHPUnit_Framework_Exception('Invalid count format');
    }

    if (isset($count['>'])) {
        self::assertTrue($found > $count['>'], $message);
    }
    if (isset($count['>='])) {
        self::assertTrue($found >= $count['>='], $message);
    }
    if (isset($count['<'])) {
        self::assertTrue($found < $count['<'], $message);
    }
    if (isset($count['<='])) {
        self::assertTrue($found <= $count['<='], $message);
    }
}
/**
 * Collect the hidden inputs of an HTML fragment.
 *
 * @param string $data HTML to parse
 *
 * @return array Map of each hidden input's name attribute to its value attribute
 */
public function parseForm($data)
{
    $crawler = new Crawler();
    $crawler->addHtmlContent($data);

    $hiddenFields = [];
    $hiddenInputs = $crawler->filter('input[type="hidden"]');
    foreach ($hiddenInputs as $input) {
        /** @var \DOMElement $input */
        $fieldName = $input->getAttribute('name');
        $hiddenFields[$fieldName] = $input->getAttribute('value');
    }

    return $hiddenFields;
}
/**
 * {@inheritDoc}
 *
 * Reads title, year, rating and description from every `#overview-top`
 * node of the given HTML.
 *
 * @param string $html
 *
 * @return array One array per `#overview-top` node with the keys 'title',
 *               'year', 'rating' and 'desc'. The extracted data used to be
 *               computed and then discarded; it is now returned.
 */
public function crawl($html)
{
    // A new crawler starts out empty, so no clear() is needed before loading.
    $crawler = new Crawler();
    $crawler->addHtmlContent($html);

    return $crawler->filter('#overview-top')->each(function (Crawler $domCrawler) {
        // The first header span is used as the title, the last one as the year.
        $headerSpans = $domCrawler->filter('.header span');
        $title = $headerSpans->first()->text();
        $year = $headerSpans->last()->text();

        return [
            'title' => $title,
            'year' => $year,
            'rating' => $domCrawler->filter('.star-box .giga-star ')->first()->text(),
            'desc' => $domCrawler->filter('p.description')->text(),
        ];
    });
}
/**
 * Execute the console command.
 *
 * Replaces any stored Crawler record for the `url` argument with a new one,
 * downloads the page, and runs a Spider over its parsed DOM.
 *
 * NOTE(review): this method previously started with a bare
 * `Crawler::create([...]); return;`, which made everything below it
 * unreachable. That looks like a debugging leftover and has been removed so
 * the command performs the crawl - confirm this is the intended behavior.
 *
 * @param Client $client HTTP client used to download the page
 *
 * @return mixed
 */
public function handle(Client $client)
{
    $url = $this->argument('url');

    // Remove records left by earlier runs for this URL before creating a new one.
    Crawler::where('url', '=', $url)->delete();
    $crawler = Crawler::create(['url' => $url]);

    $html = $client->get($url)->getBody();

    $dom = new DomCrawler();
    $dom->addHtmlContent($html);

    $spider = new Spider($crawler, $dom);
    $spider->get();
}
/**
 * Create a shop through the browser, optionally creating the account first,
 * and return a configured Shop together with the "my stores" page.
 *
 * With $onlyShop the new-store form is used directly. Otherwise the sign-up
 * form is submitted, the activation e-mail is awaited, and the activation
 * link found in it is visited.
 *
 * @param array $options  Reads the keys 'shop_name', 'country', 'password',
 *                        'email', 'language' and 'waitForSubdomain'
 *                        (the latter defaults to true)
 * @param bool  $onlyShop Create only a shop, without creating an account
 *
 * @return array Array with the keys 'shop' and 'myStoresPage'
 *
 * @throws FailedTestException When waiting for the activation e-mail throws
 */
public function createAccountAndShop(array $options, $onlyShop = false)
{
    $options = array_merge(['waitForSubdomain' => true], $options);

    if ($onlyShop) {
        $this->browser
            ->visit($this->homePage->getNewStoreURL())
            ->fillIn('#create-online-store-shop_name', $options['shop_name'])
            ->click('a.get-me-started');

        $configurationPage = new StoreConfigurationPage($this->homePage);
        $configurationPage
            ->chooseCountry($options['country'])
            ->chooseFirstQualification()
            ->submit()
            ->fillPassword($options['password'])
            ->fillPasswordConfirmation($options['password'])
            ->acceptTandC()
            ->submit();

        $this->browser->click('a.get-me-started');
    } else {
        $this->homePage
            ->visit()
            ->setLanguage($options['language'])
            ->submitShopCreationBannerForm($options['shop_name'], $options['email'])
            ->chooseCountry($options['country'])
            ->chooseFirstQualification()
            ->submit()
            ->fillFirstname('Jøħn')
            ->fillLastname('Sölünëum')
            ->fillPassword($options['password'])
            ->fillPasswordConfirmation($options['password'])
            ->acceptTandC()
            ->submit();

        $emailSpinner = new Spinner('Could not find activation email.', 300);
        $mailbox = new GmailReader(
            $this->homePage->getSecrets()['customer']['email'],
            $this->homePage->getSecrets()['customer']['gmail_password']
        );
        $buttonTitle = static::$expectedActivationEmailButtonTitle[$options['language']];
        $activationLink = null;

        /**
         * @todo : do we want to test the order in which the emails are received?
         */
        // Returns true once an e-mail containing the activation button is found.
        $findActivationLink = function () use ($mailbox, $options, $buttonTitle, &$activationLink) {
            foreach ($mailbox->readEmails($options['email']) as $email) {
                $emailDom = new Crawler('', 'http://www.example.com');
                $emailDom->addHtmlContent($email['body']);

                $button = $emailDom->selectLink($buttonTitle);
                if ($button->count() > 0) {
                    $activationLink = $button->link()->getUri();

                    return true;
                }
            }

            return false;
        };

        try {
            $emailSpinner->assertBecomesTrue($findActivationLink, false);
        } catch (\Exception $e) {
            throw new FailedTestException($e->getMessage());
        }

        $this->browser->visit($activationLink);
    }

    $myStores = new MyStoresPage($this->homePage);
    $frontOfficeURL = $myStores->getFrontOfficeURL($options['shop_name']);
    $backOfficeURL = $myStores->getBackOfficeURL($options['shop_name']);

    if ($options['waitForSubdomain']) {
        $this->waitFor200($frontOfficeURL);
        // wait 5 minutes for the host to be ready
        sleep(300);
    }

    $shop = new Shop(
        [
            'front_office_url' => $frontOfficeURL,
            'back_office_url' => $backOfficeURL,
            'back_office_folder_name' => 'backoffice',
            'prestashop_version' => '1.6.0.10',
        ],
        null
    );
    $shop->setBrowser($this->browser);

    $optionProvider = new OptionProvider();
    $optionProvider->setDefaultValues([
        'BackOfficeLogin' => [
            'admin_email' => $options['email'],
            'admin_password' => $options['password'],
        ],
    ]);
    $shop->setOptionProvider($optionProvider);

    return ['shop' => $shop, 'myStoresPage' => $myStores];
}
/**
 * Parse the page for $this->id into $this->data.
 *
 * Every `table.dataArray tbody tr td` cell becomes a one-entry array keyed
 * by its `headers` attribute. The cells are grouped three at a time and
 * treated as date, label and site.
 *
 * @return array|bool $this->data, or false when no cell was found
 */
public function parseAll()
{
    $html = $this->getHtml($this->id);
    $this->crawler->addHtmlContent($html, 'ISO-8859-1');

    $cells = $this->crawler
        ->filter('table.dataArray tbody tr td')
        ->each(function (Crawler $cell) {
            return [$cell->attr('headers') => trim($cell->text())];
        });

    if (!$cells) {
        return false;
    }

    // Three consecutive cells form one row.
    foreach (array_chunk($cells, 3) as $index => $row) {
        $this->dates[$index] = $row[0];
        $this->labels[$index] = $row[1];
        $this->sites[$index] = $row[2];
    }

    foreach ($this->dates as $index => $unused) {
        $this->data['status'][$index] = [
            'date' => $this->dates[$index]['Date'],
            'label' => $this->labels[$index]['Libelle'],
            'location' => $this->sites[$index]['site'],
        ];
    }

    $this->data['id'] = $this->id;
    $this->data['destination'] = $this->parseDestination();

    return $this->data;
}
/**
 * Visit the href of the link with the given text in the stored e-mail.
 *
 * @When /^I click the ([^"]*) link in the e-?mail$/
 */
public function iClickTheLink($linkText)
{
    if (empty($this->email)) {
        throw new \Exception('No email to click through from.');
    }

    $emailDom = new Crawler();
    $emailDom->addHtmlContent($this->email['htmlContent']['htmlBody']);

    try {
        $matchingLink = $emailDom->selectLink($linkText);
        $target = $matchingLink->attr('href');
    } catch (\InvalidArgumentException $notFound) {
        throw new \Exception("No link with text '{$linkText}' found in email.");
    }

    $this->getSession()->visit($target);
}
/**
 * Load an HTML document and collect its meta tags and followable links.
 *
 * Meta tags are keyed by their lower-cased `name` attribute. Links whose
 * `rel` attribute is "nofollow" (case-insensitive) are left out, and
 * duplicate hrefs are removed. The results are also stored on
 * $this->links and $this->metaTags.
 *
 * @param string $html
 * @return array Array with the keys 'links' and 'meta'
 */
public function load($html)
{
    $this->crawler->clear();
    $this->crawler->addHtmlContent($html);

    $metaTags = [];
    // $metaTags must be captured by reference; captured by value (as before)
    // the closure filled a private copy and the result was always empty.
    $this->crawler->filter('meta')->each(function (Crawler $node) use (&$metaTags) {
        $name = strtolower((string) $node->attr('name'));

        // Meta tags without a name attribute have no usable key; skip them.
        if ('' === $name) {
            return;
        }

        $metaTags[$name] = $node->attr('content');
    });

    $links = [];
    $this->crawler->filter('a')->each(function (Crawler $link) use (&$links) {
        $rel = (string) $link->attr('rel');
        if ('nofollow' === strtolower($rel)) {
            return;
        }

        $links[] = $link->attr('href');
    });

    $this->links = array_unique($links);
    $this->metaTags = $metaTags;

    return ['links' => $this->links, 'meta' => $metaTags];
}
/**
 * Search prothom-alo and return the result links as markup.
 *
 * The `html` field of the decoded response is wrapped in a UTF-8 document,
 * every result link gets the site host prefixed to its href, and each link
 * element is serialized with saveXML().
 *
 * @param mixed $search Passed through to Request::get()
 *
 * @return array Serialized link elements
 */
public static function searchFor($search)
{
    $response = json_decode(Request::get('prothom-alo', $search));

    $document = '<html><head><meta charset="UTF-8"></head><body>' . $response->html . '</body></html>';

    $crawler = new Crawler();
    $crawler->addHtmlContent($document, 'UTF-8');

    $anchors = $crawler->filter('body > div.search_reslut > div.search_item > h2 > a');

    $results = [];
    foreach ($anchors as $anchor) {
        $absoluteHref = 'http://www.prothom-alo.com' . $anchor->getAttribute('href');
        $anchor->setAttribute('href', $absoluteHref);

        $results[] = $anchor->ownerDocument->saveXML($anchor);
    }

    return $results;
}
/**
 * Returns the attributes and inner html of the nodes carrying an rdfa property.
 *
 * @param string             $html     content to crawl
 * @param StructureInterface $content
 * @param string             $property could be a property sequence like (block,1,title,0)
 *
 * @return array|false One attribute map per matching node (with the inner
 *                     html under the key 'html'), or false when nothing matches
 */
public function getPropertyValue($html, StructureInterface $content, $property)
{
    $crawler = new Crawler();
    $crawler->addHtmlContent($html, 'UTF-8');

    $nodes = $crawler;
    $sequence = $this->getSequence($content, $property);

    if (false === $sequence) {
        // FIXME it is a bit complex but there is no :not operator in crawler
        // should be *[property="block"]:not(*[property] *)
        $nodes = $nodes
            ->filter('*[property="' . $property . '"]')
            ->reduce(function (Crawler $candidate) {
                // Count the ancestors that carry a property and are typed as collection.
                $collectionParents = 0;
                $candidate->parents()->each(function ($parent) use (&$collectionParents) {
                    if (null !== $parent->attr('property') && $parent->attr('typeof') === 'collection') {
                        ++$collectionParents;
                    }
                });

                // Keep the candidate only when no such ancestor exists.
                return $collectionParents === 0;
            });
    } else {
        // Walk the sequence: names select by property, numbers pick the n-th
        // node related (rel) to the most recent name.
        $lastName = '';
        foreach ($sequence['sequence'] as $step) {
            if (ctype_digit(strval($step))) {
                $nodes = $nodes->filter('*[rel="' . $lastName . '"]')->eq($step);
            } else {
                $lastName = $step;
                $nodes = $nodes->filter('*[property="' . $step . '"]');
            }
        }
    }

    // if rdfa property not found return false
    if ($nodes->count() < 1) {
        return false;
    }

    // create an array of changes
    return $nodes->each(function (Crawler $match) {
        $element = $match->getNode(0);

        $result = [];
        foreach ($element->attributes as $attributeName => $attribute) {
            $result[$attributeName] = $attribute->nodeValue;
        }
        $result['html'] = $match->html();

        return $result;
    });
}
/**
 * Re-host the images of a flyer.
 *
 * Every <img> whose src does not pass validateURL() is handed to the
 * upload helper and its src is replaced by the returned 'url'. The flyer's
 * html is then replaced by the crawler's html and the flyer is saved.
 *
 * @param Flyer $flyer
 *
 * @return Flyer The same flyer instance
 */
public function replaceImages(Flyer $flyer)
{
    $crawler = new Crawler();
    $crawler->addHtmlContent($flyer->getHtml());

    foreach ($crawler->filter('img') as $image) {
        $source = $image->getAttribute('src');

        // Sources that pass validation are left untouched.
        if ($this->validateURL($source)) {
            continue;
        }

        $uploaded = $this->utilsBusiness->upladImage($source, 'flyers');
        $image->setAttribute('src', $uploaded['url']);
    }

    $flyer->setHtml($crawler->html());
    $this->saveData($flyer);

    return $flyer;
}
/**
 * Build a readable message from a response.
 *
 * For 5xx responses whose body contains a `.text-exception h1` element, the
 * exception message and the first line of the first trace item are
 * extracted. In every other case the raw response content is returned.
 *
 * @param Response $response
 *
 * @return string
 */
private function getMessage(Response $response)
{
    $statusCode = $response->getStatusCode();

    // Server errors only. The previous check was written as
    // `500 >= status`, which matched every status up to 500 instead of 500-599.
    if ($statusCode >= 500 && $statusCode < 600) {
        $crawler = new Crawler();
        $crawler->addHtmlContent($response->getContent());

        $heading = $crawler->filter('.text-exception h1');
        if ($heading->count() > 0) {
            $exceptionMessage = trim($heading->text());

            $trace = '';
            $traceItems = $crawler->filter('#traces-0 li');
            if ($traceItems->count() > 0) {
                // Only the first line of the first trace item is kept.
                list($trace) = explode("\n", trim($traceItems->text()));
            }

            return 'Internal Server Error: ' . $exceptionMessage . ' ' . $trace;
        }
    }

    return $response->getContent();
}
/**
 * Turn a front-matter document into an array of page fields.
 *
 * The parsed body is loaded as HTML; all <h1> elements are removed (the first
 * non-empty one becomes the title), '../..' is stripped from image sources,
 * and the text before the first '<hr>' becomes the intro.
 *
 * @param string $content Raw document handed to the front-matter parser
 *
 * @return array Array with the keys 'title', 'keywords', 'description',
 *               'intro', 'cover', 'content' and 'tags'
 */
protected function process($content)
{
    $parser = new Parser('yaml', 'markdown');
    $frontmatter = $parser->parse($content);

    $crawler = new Crawler();
    $crawler->addHtmlContent($frontmatter->getBody());

    // Remove every h1 from the document, remembering the first non-empty one.
    $title = '';
    $crawler->filterXPath('//h1')->each(function (Crawler $heading) use (&$title) {
        foreach ($heading as $element) {
            if (!$title) {
                $title = $element->nodeValue;
            }
            $element->parentNode->removeChild($element);
        }
    });

    foreach ($crawler->filterXPath('//img') as $image) {
        $source = $image->getAttribute('src');
        $image->setAttribute('src', str_replace('../..', '', $source));
    }

    $markup = $crawler->html();
    list($intro) = explode('<hr>', $markup);
    $body = str_replace('<hr>', '', $markup);

    // Read a front-matter head entry, falling back when it is not set.
    $head = function ($key, $fallback) use ($frontmatter) {
        return isset($frontmatter->head[$key]) ? $frontmatter->head[$key] : $fallback;
    };

    // Default description: the first 150 characters of the intro without tags.
    $defaultDescription = mb_substr(strip_tags(trim($intro)), 0, 150);

    return [
        'title' => $title ?: '',
        'keywords' => $head('keywords', ''),
        'description' => $head('description', $defaultDescription),
        'intro' => trim($intro),
        'cover' => $head('cover', ''),
        'content' => trim($body),
        'tags' => $head('tags', ''),
    ];
}