/**
 * Performs the request through the parent implementation and returns the
 * response content wrapped in a Crawler.
 *
 * @param string $uri         Forwarded unchanged to parent::call()
 * @param string $method      Forwarded unchanged to parent::call()
 * @param array  $parameters  Forwarded unchanged to parent::call()
 * @param bool   $changeStack Forwarded unchanged to parent::call()
 *
 * @return Crawler Crawler loaded with the response content
 */
public function call($uri, $method = 'get', $parameters = array(), $changeStack = true)
{
    $content = parent::call($uri, $method, $parameters, $changeStack)
        ->getResponse()
        ->getContent();

    $crawler = new Crawler();
    $crawler->add($content);

    return $crawler;
}
/**
 * Turns the listing page held in the response body into Result objects,
 * one per list item matched by the XPath expression below.
 *
 * @inheritdoc
 */
protected function parse(Requests_Response $requests)
{
    $page = new Crawler();
    $page->addContent($requests->body);

    $results = array();

    /** @var DOMElement $listItem */
    foreach ($page->filterXPath('//*[@id="content"]/div/div[2]/div[1]/div[1]/ul/li') as $listItem) {
        $item = new Crawler();
        $item->add($listItem);

        $tags = [];
        /** @var DOMElement $tagNode */
        foreach ($item->filter(".horizontal-separated-list li") as $tagNode) {
            $tags[] = $tagNode->textContent;
        }

        $detailsLink = $item->filter(".details a");

        $result = new Result();
        $result->setTitle(trim($detailsLink->text()));
        $result->setTags($tags);

        // The id is read from the third "/"-separated path segment of the
        // link, taking the part that follows the first "--".
        $relUrl = $detailsLink->attr("href");
        $urlParts = parse_url($relUrl);
        $pathSegments = explode("/", $urlParts["path"]);
        $slugParts = explode("--", $pathSegments[2]);
        $id = $slugParts[1];

        $result->setId($this->getName() . "_" . intval($id));
        $result->setUrl("http://www.anibis.ch/" . $relUrl);
        $result->setPrice($item->filter(".price")->text());
        $result->setDescription($item->filter(".details .description")->text());

        $results[] = $result;
    }

    return $results;
}
/**
 * Checks that add() accepts each supported input: a \DOMDocument, a
 * \DOMNodeList, an array of nodes, a single \DOMNode and an HTML string.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::add
 */
public function testAdd()
{
    $crawler = new Crawler();
    $crawler->add($this->createDomDocument());
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from a \DOMDocument');

    $crawler = new Crawler();
    $crawler->add($this->createNodeList());
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from a \DOMNodeList');

    // Initialise the array explicitly so that an empty node list still hands
    // add() an array rather than an undefined variable.
    $list = array();
    foreach ($this->createNodeList() as $node) {
        $list[] = $node;
    }

    $crawler = new Crawler();
    $crawler->add($list);
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from an array of nodes');

    $crawler = new Crawler();
    $crawler->add($this->createNodeList()->item(0));
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from an \DOMNode');

    $crawler = new Crawler();
    $crawler->add('<html><body>Foo</body></html>');
    $this->assertEquals('Foo', $crawler->filter('body')->text(), '->add() adds nodes from a string');
}
/**
 * Turns the result list held in the response body into Result objects,
 * one per ".box-row" element matched by the CSS selector below.
 *
 * @inheritdoc
 */
protected function parse(Requests_Response $requests)
{
    $page = new Crawler();
    $page->addContent($requests->body);

    $rows = $page->filter("#page > main > section > div > div.result-item-list article a > .box-row");
    $results = array();

    /** @var DOMElement $row */
    foreach ($rows as $row) {
        $item = new Crawler();
        $item->add($row);

        $tags = [];
        /** @var DOMElement $attribute */
        foreach ($item->filter(".box-row ul.box-row-item-attribute-list li") as $attribute) {
            $children = $attribute->childNodes;
            // Child nodes 1 and 3 are combined into a "label: value" tag;
            // list items with fewer than four child nodes are ignored.
            if ($children !== null && $children->length >= 4) {
                $tags[] = $children->item(1)->nodeValue . ": " . $children->item(3)->nodeValue;
            }
        }

        $street = $item->filter(".item-title--street");
        $address = $street->text() . " " . $street->siblings()->text();
        $tags[] = "Adresse: " . $address;

        $result = new Result();
        $result->setTags($tags);
        $result->setTitle(trim($item->filter("h2")->text()));

        // NOTE(review): "item-description" has no leading dot, so it selects an
        // element of that name rather than a CSS class - confirm this is intended.
        $description = $item->filter("item-description p");
        if ($description->valid()) {
            $result->setDescription($description->text());
        }

        // The href is read from the row's parent node (the "a" in the selector above).
        $link = $row->parentNode->attributes->getNamedItem("href")->nodeValue;
        $linkSegments = explode("/", $link);
        $result->setId($this->getName() . "_" . $linkSegments[2]);
        $result->setUrl("http://m.homegate.ch/" . $link);

        $results[] = $result;
    }

    return $results;
}
/**
 * Downloads every link returned by getLinks() and, once a download has
 * ended, writes the e-mail addresses found in that page's <a href> values
 * to a file of the same name inside the result directory.
 *
 * The result directory is cacheDir/siteHash/$path and is created when
 * missing. Downloads are written below $this->pathSiteHash, which this
 * method only reads (presumably it is prepared elsewhere - confirm).
 *
 * @param string $path Sub-directory (below cacheDir/siteHash) receiving the mail lists
 * @return string Human-readable message naming the result directory
 */
function parse($path)
{
    $resultDir = $this->config['cacheDir'] . DIRECTORY_SEPARATOR . $this->siteHash . DIRECTORY_SEPARATOR . $path;
    $this->pathMails = $resultDir;
    if (!file_exists($this->pathMails)) {
        // Recursive, so a missing parent directory or a nested $path does not make mkdir() fail.
        mkdir($this->pathMails, 0777, true);
    }

    // Downloads still in flight, keyed by target file name. This must be a
    // real array: it is shared by reference with the closures below, and
    // count() on an uninitialised (null) variable is an error on PHP 8.
    $files = array();

    foreach ($this->getLinks() as $file => $url) {
        $readStream = fopen($url, 'r');
        if (false === $readStream) {
            // Unreachable source: skip it instead of handing `false` to the stream wrappers.
            continue;
        }

        $writeStream = fopen($this->pathSiteHash . DIRECTORY_SEPARATOR . $file, 'w');
        if (false === $writeStream) {
            fclose($readStream);
            continue;
        }

        stream_set_blocking($readStream, 0);
        stream_set_blocking($writeStream, 0);

        $files[$file] = $url;

        $read = new \React\Stream\Stream($readStream, $this->loop);
        $write = new \React\Stream\Stream($writeStream, $this->loop);

        // NOTE(review): this handler re-reads the downloaded file from disk; it
        // assumes the piped write has been flushed by the time "end" fires - confirm.
        $read->on('end', function () use ($file, &$files) {
            $downloaded = $this->pathSiteHash . DIRECTORY_SEPARATOR . $file;

            $crawler = new Crawler();
            $crawler->add(file_get_contents($downloaded));

            $hrefs = $crawler->filter('a')->each(function (Crawler $nodeCrawler) {
                return $nodeCrawler->filter('a')->attr('href');
            });

            $mails = [];
            foreach ($hrefs as $href) {
                $href = (string) $href;
                $address = str_replace('mailto:', '', $href);
                // Accept the href when it is an address as-is or once the
                // "mailto:" prefix is removed; store it without the prefix.
                if (filter_var($href, FILTER_VALIDATE_EMAIL) || filter_var($address, FILTER_VALIDATE_EMAIL)) {
                    $mails[] = $address;
                }
            }

            file_put_contents($this->pathMails . DIRECTORY_SEPARATOR . $file, implode(PHP_EOL, $mails));
            unset($files[$file]);
        });

        $read->pipe($write);
    }

    // Every $this->config['periodTime'] seconds: print a status line, and
    // stop the timer once no downloads remain.
    $this->loop->addPeriodicTimer($this->config['periodTime'], function ($timer) use (&$files) {
        if (0 === count($files)) {
            $timer->cancel();
        }
        echo PHP_EOL . "Passed {$this->config['periodTime']} sec. " . PHP_EOL;
    });

    echo "This script will show the download status every {$this->config['periodTime']} seconds." . PHP_EOL;
    $this->loop->run();

    return 'Dir of result in: ' . $resultDir;
}
/**
 * Fetches a fixed product-search page and returns the name and price of
 * the first ".Product" element on it.
 *
 * Both values are null when the request fails or no product is found, so
 * the method always returns the same array shape instead of falling off
 * the end and returning null.
 *
 * @Route("/sport/{name}")
 * @Template()
 *
 * @param string $name Route parameter; not used by the current implementation
 *
 * @return array Keys "productName" and "price"
 */
public function indexAction($name)
{
    $html = "http://www.chemistwarehouse.com.au/search?searchtext=Banana%20Boat%20SPF%2050+%20Everyday%20100g%20Tube&searchmode=allwords";

    $ch = curl_init($html);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    $productName = null;
    $price = null;

    // curl_exec() returns false on failure; only a real, non-empty body is parsed.
    if (is_string($result) && '' !== $result) {
        $crawler = new Crawler($result);

        foreach ($crawler->filter(".Product") as $product) {
            $productCrawler = new Crawler();
            $productCrawler->add($product);

            $productName = $productCrawler->filter('a')->attr('title');
            $price = $productCrawler->filter('.Price')->text();

            // Only the first product is reported.
            break;
        }
    }

    return ['productName' => $productName, 'price' => $price];
}
/**
 * Collects links from the page at $this->site and keeps those that answer
 * with the exact status line "HTTP/1.1 200 OK".
 *
 * @return array Map of "<counter>.<dataFormat>" => URL
 */
function getLinks()
{
    $crawler = new Crawler();
    $crawler->add(file_get_contents($this->site));

    $hrefRows = $crawler->filter('a')->each(function (Crawler $nodeCrawler) {
        return [$nodeCrawler->filter('a')->attr('href')];
    });

    $validLinks = [];
    $i = 0;

    foreach ($hrefRows as $row) {
        $href = str_replace('/redirect.php?url=', '', $row[0]);

        // NOTE(review): hrefs that already validate as URLs are skipped and do
        // not advance the counter - confirm this condition is not inverted.
        if (filter_var($href, FILTER_VALIDATE_URL)) {
            continue;
        }

        if (@get_headers($href)[0] == 'HTTP/1.1 200 OK') {
            $validLinks[$i . '.' . $this->config['dataFormat']] = $href;
        } elseif (@get_headers($this->site . $href)[0] == 'HTTP/1.1 200 OK') {
            // Second attempt: treat the href as relative to the site.
            $validLinks[$i . '.' . $this->config['dataFormat']] = $this->site . $href;
        }

        $i++;
    }

    return $validLinks;
}
/**
 * Checks that add() accepts each supported input: a \DOMDocument, a
 * \DOMNodeList, an array of nodes, a single \DOMNode and an HTML string.
 * Skipped when the CssSelector component is not available.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::add
 */
public function testAdd()
{
    if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
        $this->markTestSkipped('The "CssSelector" component is not available');
    }

    $crawler = new Crawler();
    $crawler->add($this->createDomDocument());
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from a \\DOMDocument');

    $crawler = new Crawler();
    $crawler->add($this->createNodeList());
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from a \\DOMNodeList');

    // Initialise the array explicitly so that an empty node list still hands
    // add() an array rather than an undefined variable.
    $list = array();
    foreach ($this->createNodeList() as $node) {
        $list[] = $node;
    }

    $crawler = new Crawler();
    $crawler->add($list);
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from an array of nodes');

    $crawler = new Crawler();
    $crawler->add($this->createNodeList()->item(0));
    $this->assertEquals('foo', $crawler->filter('div')->attr('class'), '->add() adds nodes from an \\DOMNode');

    $crawler = new Crawler();
    $crawler->add('<html><body>Foo</body></html>');
    $this->assertEquals('Foo', $crawler->filter('body')->text(), '->add() adds nodes from a string');
}
/**
 * Passing an integer to add() is expected to raise the exception
 * declared in the annotation below.
 *
 * @expectedException \InvalidArgumentException
 */
public function testAddInvalidType()
{
    $subject = new Crawler();
    $subject->add(1);
}
/**
 * Removes every element matching "<widgetItem>:last-child" from the given
 * markup and returns the HTML of the resulting <body>.
 *
 * @param $content string Markup to process
 *
 * @return string HTML of the first <body> element after the removal
 */
private function removeLastItem($content)
{
    $charset = \Yii::$app->charset;

    $document = new \DOMDocument('1.0', $charset);

    $crawler = new Crawler();
    $crawler->addHTMLContent($content, $charset);

    // Copy the crawler's first node into a scratch document under a
    // synthetic "_root" element so it can be edited with plain DOM calls.
    $root = $document->appendChild($document->createElement('_root'));
    $crawler->rewind();
    $root->appendChild($document->importNode($crawler->current(), true));

    $xpath = new \DOMXPath($document);
    $lastItems = $xpath->query(CssSelector::toXPath($this->widgetItem . ':last-child'));
    foreach ($lastItems as $lastItem) {
        $lastItem->parentNode->removeChild($lastItem);
    }

    // Reload the crawler from the edited document.
    $crawler->clear();
    $crawler->add($document);

    return $crawler->filter('body')->eq(0)->html();
}
/**
 * Passing a bare \DOMNode to add() is expected to raise the exception
 * and message declared in the annotations below.
 *
 * @expectedException \InvalidArgumentException
 * @expectedExceptionMessage Nodes set in a Crawler must be DOMElement or DOMDocument instances, "DOMNode" given.
 */
public function testAddInvalidNode()
{
    $subject = new Crawler();
    $subject->add(new \DOMNode());
}
/**
 * Reads the text of every //body/p paragraph and groups the texts into
 * consecutive pairs.
 *
 * Example: texts "Hello World!", "Hello Crawler!", "Hello World2!",
 * "Hello Crawler2!" become
 * [["Hello World!", "Hello Crawler!"], ["Hello World2!", "Hello Crawler2!"]].
 * With an odd number of paragraphs the last group holds a single text.
 *
 * @param string $html Markup to parse
 *
 * @return array List of one- or two-element arrays of paragraph texts
 */
public function extractAction($html)
{
    $crawler = new Crawler();
    $crawler->add($html);

    // Each extracted row is array(text, class); only the text (index 0) is used.
    $rows = $crawler->filterXPath('//body/p')->extract(array('_text', 'class'));

    $texts = array();
    foreach ($rows as $row) {
        $texts[] = $row[0];
    }

    return array_chunk($texts, 2);
}
/**
 * Requests the URL built for the given page and wraps the response's DOM
 * document in a Crawler.
 *
 * @param integer $page Page passed to buildUrl()
 *
 * @return Crawler
 */
private function doRequest($page)
{
    $document = $this->browser
        ->get($this->buildUrl($page))
        ->toDomDocument();

    $crawler = new Crawler();
    $crawler->add($document);

    return $crawler;
}
/**
 * Adds a node to the current list of nodes.
 *
 * Anything that is not a Crawler is delegated to the parent
 * implementation. A Crawler is unpacked and each node it iterates over
 * is added individually.
 *
 * Overwritten from parent to allow Crawler to be added
 *
 * @param null|\DOMNodeList|array|\DOMNode|Crawler $node A node
 *
 * @api
 */
public function add($node)
{
    if (!($node instanceof Crawler)) {
        parent::add($node);

        return;
    }

    foreach ($node as $childnode) {
        $this->addNode($childnode);
    }
}
/**
 * Marks the test as failed, using the text the crawler returns for
 * $selector in the HTTP response body as the failure message.
 *
 * When there is no response body the test fails with a fixed
 * "No response to debug" message instead.
 *
 * @param string $selector CSS selector whose text is reported
 */
protected function debugResponse($selector = "body")
{
    if ($this->getResponseBody()) {
        $crawler = new Crawler();
        $crawler->add($this->getResponseBody());

        $this->fail("Response debug:\n" . $crawler->filter($selector)->text());
    }

    $this->fail("No response to debug");
}
/**
 * Extracts a four-column table (id, title, description, image URL) from
 * the given markup.
 *
 * Despite its name, $url is handed to the Crawler as the content to
 * parse. The columns are read as follows:
 *  - 0: text of every second <th> (indexes 0, 2, 4, ...)
 *  - 1: "value" attribute of every <input> inside a <th>
 *  - 2: text of every second <td> (indexes 0, 2, 4, ...)
 *  - 3: "src" attribute of every <img> inside a <td>/<a>
 *
 * The number of rows equals the number of ids. A cell is null when its
 * column has fewer entries than there are ids.
 *
 * @param string $url Markup to parse
 *
 * @return array List of rows, each array(id, title, description, imageUrl)
 */
public function extractAction($url)
{
    $page = new Crawler();
    $page->add($url);

    // Re-serialise the matched container elements so that the XPath
    // expressions below can be rooted at /html/body/div of a fresh document.
    $html = '';
    foreach ($page->filterXpath('//html/body/div/div/form/div/div') as $domElement) {
        $html .= $domElement->ownerDocument->saveHTML($domElement);
    }

    $crawler = new Crawler();
    $crawler->add($html);

    $headerCells = $crawler->filterXpath('//html/body/div/table/tr/th');

    // Column 0 (id): text of every second header cell.
    $showData1 = array();
    foreach ($headerCells->extract(array('_text', 'class')) as $index => $cell) {
        if ($index % 2 == 0) {
            $showData1[] = $cell[0];
        }
    }

    // Column 2 (description): text of every second data cell.
    $showData3 = array();
    $dataCells = $crawler->filterXpath('//html/body/div/table/tr/td');
    foreach ($dataCells->extract(array('_text', 'class')) as $index => $cell) {
        if ($index % 2 == 0) {
            $showData3[] = $cell[0];
        }
    }

    // Column 3 (image URL): "src" of every image.
    $showData4 = array();
    $images = $crawler->filterXpath('//html/body/div/table/tr/td/a/img');
    foreach ($images->extract(array('src', 'img')) as $image) {
        $showData4[] = $image[0];
    }

    // Column 1 (title): the header cells are re-parsed on their own so the
    // inputs can be addressed as /html/body/th/input.
    $html1 = '';
    foreach ($headerCells as $domElement) {
        $html1 .= $domElement->ownerDocument->saveHTML($domElement);
    }

    $headerCrawler = new Crawler();
    $headerCrawler->add($html1);

    $showData2 = array();
    $inputs = $headerCrawler->filterXpath('//html/body/th/input');
    foreach ($inputs->extract(array('value', 'input')) as $input) {
        $showData2[] = $input[0];
    }

    // Transpose the four columns into rows; the id column sets the row count.
    $showDataA = array();
    $rowCount = count($showData1);
    for ($j = 0; $j < $rowCount; $j++) {
        $showDataA[$j] = array(
            $showData1[$j],
            isset($showData2[$j]) ? $showData2[$j] : null,
            isset($showData3[$j]) ? $showData3[$j] : null,
            isset($showData4[$j]) ? $showData4[$j] : null,
        );
    }

    return $showDataA;
}