/**
 * Search for torrents in the site's feed (`//channel/item` elements).
 *
 * Any failure while building the URL, performing the request or parsing the
 * response is swallowed and reported as an empty result. Feed items that lack
 * one of the expected elements are skipped instead of aborting the search.
 *
 * @param string $query
 * @param int $category
 * @return array Array of torrents. Either empty or filled.
 */
public function search($query, $category)
{
    $requestOptions = ['headers' => ['User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36']];

    try {
        $url = $this->makeUrl($query, $category);
        $response = $this->httpClient->get($url, $requestOptions);
        $crawler = new Crawler((string) $response->getBody());
    } catch (\Exception $e) {
        return [];
    }

    $torrents = [];

    foreach ($crawler->filterXpath('//channel/item') as $item) {
        $itemCrawler = new Crawler($item);

        try {
            $torrent = new Torrent();

            // Set details for torrent.
            $torrent->setSite($this->tag);
            $torrent->setTitle($itemCrawler->filterXpath('//title')->text());
            $torrent->setSeeders((int) $itemCrawler->filterXpath('//torrent:seeds')->text());
            $torrent->setLeechers((int) $itemCrawler->filterXpath('//torrent:peers')->text());
            $torrent->setMagnet($itemCrawler->filterXpath('//torrent:magnetURI')->text());
            $torrent->setSize($this->formatBytes((int) $itemCrawler->filterXPath('//torrent:contentLength')->text()));
            $torrent->setAge($itemCrawler->filterXPath('//pubDate')->text());
            $torrent->setCategory($itemCrawler->filterXPath('//category')->text());
        } catch (\InvalidArgumentException $e) {
            // Crawler::text() throws when the node list is empty, i.e. the item
            // is missing an element. Drop that item and keep the others.
            // NOTE(review): this also hides an InvalidArgumentException raised
            // by a Torrent setter or formatBytes(), if they can throw one.
            continue;
        }

        $torrents[] = $torrent;
    }

    return $torrents;
}
/**
 * Read the text of the first `//title` match, or an empty string when the
 * crawler contains no such node.
 *
 * @param Crawler $crawler
 * @return string
 */
public function getTitleFallback(Crawler $crawler)
{
    $titleNodes = $crawler->filterXpath('//title');

    return $titleNodes->count() ? $titleNodes->text() : '';
}
/**
 * Query the search page and turn every result row into a SearchResult.
 *
 * A ClientException raised by the HTTP client yields an empty array; other
 * exceptions are not handled here.
 *
 * @param string $query
 * @return SearchResult[]
 */
public function search($query)
{
    $searchUrl = 'https://thepiratebay.se/search/' . urlencode($query) . '/0/7/0';

    try {
        $response = $this->httpClient->get($searchUrl);
    } catch (ClientException $e) {
        return [];
    }

    $page = new Crawler((string) $response->getBody());
    $results = [];

    foreach ($page->filter('#searchResult tr') as $position => $row) {
        // The first row is the table header.
        if (0 === $position) {
            continue;
        }

        $found = new SearchResult();
        $rowCrawler = new Crawler($row);

        $found->setName(trim($rowCrawler->filter('.detName')->text()));

        $cells = $rowCrawler->filter('td');
        $found->setSeeders((int) $cells->eq(2)->text());
        $found->setLeechers((int) $cells->eq(3)->text());
        $found->setMagnetUrl($rowCrawler->filterXpath('//tr/td/a')->attr('href'));

        $results[] = $found;
    }

    return $results;
}
/**
 * Fetch $url, hand its body to $afterCrawl, then recurse into every link on
 * the page that passes isCrawlable() and (if given) $beforeCrawl.
 *
 * The md5 of the URL is appended to $this->crawledUrls before the request.
 * Any exception raised while processing the page (including recursive calls
 * made from it) records $url in $this->errorUrls and is not rethrown.
 *
 * @param string        $url
 * @param callable|null $afterCrawl  Called as $afterCrawl($url, $html) after the download.
 * @param callable|null $beforeCrawl Called as $beforeCrawl($normalizedUrl); a falsy
 *                                   result skips that link.
 */
function crawl($url, $afterCrawl = null, $beforeCrawl = null)
{
    $urlHash = md5($url);

    try {
        $this->crawledUrls[] = $urlHash;

        $body = $this->client->request("GET", $url)->getBody()->getContents();
        $this->requestCount++;
        echo $this->requestCount . "\n";

        if ($afterCrawl !== null) {
            $afterCrawl($url, $body);
        }

        // Keep only the link list alive while recursing; the crawler and the
        // page body are released first.
        $links = array_unique((new DomCrawler($body))->filterXpath('//a')->extract(['href']));
        unset($body);

        foreach ($links as $href) {
            $target = $this->normalizeUrl($url, $href);

            $limitReached = $this->maxRequestcount != 0
                && $this->requestCount >= $this->maxRequestcount;
            if ($limitReached) {
                return;
            }

            if (!$this->isCrawlable($url, $target)) {
                continue;
            }

            if ($beforeCrawl !== null && !$beforeCrawl($target)) {
                continue;
            }

            $this->crawl($target, $afterCrawl, $beforeCrawl);
        }
    } catch (\Exception $e) {
        $this->errorUrls[] = $url;
    }
}
/**
 * Load the deadline and the fixtures of one gameweek from the fixtures page.
 *
 * Sets $this->gameweek and $this->deadline_time, and appends one
 * GameweekMatch per `.ismFixture` element to $this->matches.
 *
 * @param int|string $gameweek Gameweek identifier interpolated into the fixtures URL.
 */
public function load($gameweek)
{
    $this->gameweek = $gameweek;

    $crawler = $this->getDom("http://fantasy.premierleague.com/fixtures/{$gameweek}/", ['X-Requested-With' => 'XMLHttpRequest']);

    // The caption is split on ' - ' and its last part is parsed as the deadline.
    // array_pop() takes its argument by reference, so the explode() result must
    // live in a variable first; passing the call directly raises
    // "Only variables should be passed by reference".
    $captionParts = explode(' - ', $crawler->filter('.ismStrongCaption')->text());
    $this->deadline_time = $this->parseDate(array_pop($captionParts));

    foreach ($crawler->filter('.ismFixture') as $g) {
        $gc = new Crawler($g);
        $match = new GameweekMatch();

        // Team ids are taken from the badge image file name ("...badge_<id>...").
        // When the pattern does not match, the cast yields 0.
        $home_team_id = (int) preg_replace('~^.+badge_(\\d+).+$~', '$1', $gc->filterXpath('//td[3]/img')->attr('src'));
        $away_team_id = (int) preg_replace('~^.+badge_(\\d+).+$~', '$1', $gc->filterXpath('//td[5]/img')->attr('src'));

        $match->home_team = new TeamSimple();
        $match->home_team->load($home_team_id);

        $match->away_team = new TeamSimple();
        $match->away_team->load($away_team_id);

        $match->start_time = $this->parseDate($gc->filterXpath('//td[1]')->text());

        $this->matches[] = $match;
    }
}
/**
 * {@inheritdoc}
 *
 * Scrapes the Google Play details page of the given application id and
 * builds an AppInfo from the name, owner, description, cover, tags and
 * screenshots found in the markup.
 */
public function lookup($id)
{
    if (!class_exists('Symfony\\Component\\DomCrawler\\Crawler')) {
        throw new RuntimeException('symfony/dom-crawler is required.');
    }

    $response = $this->exec(sprintf('https://play.google.com/store/apps/details?id=%s', $id));

    if (404 === $this->getInfo(CURLINFO_HTTP_CODE)) {
        throw new NotFoundException($id);
    }

    $this->close();

    // XPath selectors for each piece of data on the details page.
    $nameXpath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' document-title ')]/descendant::div";
    $ownerXpath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' document-subtitle ') and (contains(concat(' ', normalize-space(@class), ' '), ' primary '))]/descendant::span";
    $descriptionXpath = '//*[@id="body-content"]/div/div/div[1]/div[1]/div/div[3]/div[1]/div[1]/div/div[1]';
    $screenshotsXpath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' details-section ') and (contains(concat(' ', normalize-space(@class), ' '), ' screenshots '))]/descendant::*[contains(concat(' ', normalize-space(@class), ' '), ' screenshot-container ')]/descendant::img";
    $tagsXpath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' document-subtitle ') and (contains(concat(' ', normalize-space(@class), ' '), ' category '))]/descendant::span";
    $coverXpath = "descendant-or-self::*[contains(concat(' ', normalize-space(@class), ' '), ' details-info ')]/descendant::*[contains(concat(' ', normalize-space(@class), ' '), ' cover-container ')]/descendant::img";

    $page = new Crawler($response);

    $name = $page->filterXpath($nameXpath)->text();
    $owner = $page->filterXpath($ownerXpath)->text();
    $description = $page->filterXpath($descriptionXpath)->text();

    // The first matched image is deliberately left out; array_filter() then
    // removes that null entry together with any empty src value.
    $screenshots = array_filter(
        $page->filterXpath($screenshotsXpath)->each(function ($image, $position) {
            return 0 === $position ? null : $image->attr('src');
        })
    );

    $tags = array_map('strtolower', (array) $page->filterXpath($tagsXpath)->text());
    $cover = $page->filterXpath($coverXpath)->attr('src');

    return new AppInfo($id, $name, $owner, $description, $cover, $tags, $screenshots, AppInfo::PLATFORM_ANDROID);
}
/**
 * Collect the leagues listed on the page into $this->leagues.
 *
 * League types are derived from the table headings ending in " leagues"
 * (lower-cased, suffix removed). The n-th league table is paired with the
 * n-th collected type.
 *
 * @param Crawler $crawler Crawler positioned on the page that holds the league tables.
 */
public function populateLeagues($crawler)
{
    $types = [];

    $headings = $crawler->filterXPath('//*[@class="ismSecondary"]/*[@class="ismTableHeading"]');
    foreach ($headings as $headingNode) {
        $headingText = (new Crawler($headingNode))->text();

        if (!preg_match('~ leagues$~', $headingText)) {
            continue;
        }

        $types[] = strtolower(preg_replace('~ leagues$~', '', $headingText));
    }

    $tables = $crawler->filterXPath('//*[@class="ismTable ismLeagueTable"]');
    foreach ($tables as $index => $tableNode) {
        $rows = (new Crawler($tableNode))->filterXpath('//tbody/tr');

        foreach ($rows as $rowNode) {
            $league = new UserLeague();
            $league->type = $types[$index];
            $league->populate(new Crawler($rowNode));

            $this->leagues[] = $league;
        }
    }
}
/**
 * Search for torrents by scraping the `#searchResult` table.
 *
 * Any failure while building the URL, performing the request or parsing the
 * response is swallowed and reported as an empty result. Rows that do not
 * carry the full set of columns, or have no link in the magnet column, are
 * skipped.
 *
 * @param string $query
 * @param int $category
 * @return array Array of torrents. Either empty or filled.
 */
public function search($query, $category)
{
    # Set single-cell view for torrents.
    $requestOptions = ['headers' => ['User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'], 'cookies' => ['lw' => 's']];

    try {
        $url = $this->makeUrl($query, $category);
        $response = $this->httpClient->get($url, $requestOptions);
        $crawler = new Crawler((string) $response->getBody());
    } catch (\Exception $e) {
        // TODO: Log error. Some error has occurred.
        return [];
    }

    $items = $crawler->filter('#searchResult tr');
    $torrents = [];
    $firstRow = true;

    foreach ($items as $item) {
        // Ignore the first row.
        if ($firstRow) {
            $firstRow = false;
            continue;
        }

        $cells = (new Crawler($item))->filter('td');

        // Columns 0..6 are read below; a row with fewer cells cannot be a
        // result row and would make eq()->text() throw.
        if ($cells->count() < 7) {
            continue;
        }

        // The original XPaths ('/td[3]/a[0]', '/td[2]', '/td[0]') used
        // zero-based predicates and an absolute path, so they never matched
        // (XPath positions start at 1). The cells are addressed by zero-based
        // index instead, which is what those expressions were written for.
        $links = $cells->eq(3)->filter('a');
        if (0 === $links->count()) {
            continue;
        }

        $torrent = new Torrent();

        // Set details for torrent.
        $torrent->setSite($this->tag);
        $torrent->setTitle(trim($cells->eq(1)->text()));
        $torrent->setSeeders((int) $cells->eq(5)->text());
        $torrent->setLeechers((int) $cells->eq(6)->text());
        $torrent->setMagnet($links->first()->attr('href'));
        $torrent->setSize($cells->eq(4)->text());
        $torrent->setAge($cells->eq(2)->text());
        $torrent->setCategory($cells->eq(0)->text());

        $torrents[] = $torrent;
    }

    return $torrents;
}
/**
 * Collect the href of every anchor in the given markup as Url objects.
 *
 * @param string $html
 *
 * @return \Spatie\Crawler\Url[] The Url objects, in whatever collection type collect()->map() returns.
 */
protected function getAllLinks($html)
{
    $anchors = (new DomCrawler($html))->filterXpath('//a');
    $hrefs = $anchors->extract(['href']);

    return collect($hrefs)->map(function ($href) {
        return Url::create($href);
    });
}
/**
 * Build an Asset for every element matched by $selector that carries a
 * non-empty $urlAttribute.
 *
 * @param string $html         Markup to search.
 * @param string $selector     XPath expression selecting the candidate elements.
 * @param string $urlAttribute Name of the attribute that holds the asset URL.
 * @param mixed  $type         Passed through to the Asset constructor.
 * @param mixed  $parentPage   Passed through to the Asset constructor.
 * @return array List of Asset instances, in document order of the matches.
 */
protected function createAssetsFromDOMElements($html, $selector, $urlAttribute, $type, $parentPage)
{
    $assets = [];

    $crawler = new Crawler($html);
    $elements = $crawler->filterXpath($selector);

    /** @var \DOMNode $element */
    foreach ($elements as $element) {
        // Only element nodes have attributes; anything else cannot carry a URL.
        if (!$element instanceof \DOMElement) {
            continue;
        }

        // getAttribute() returns '' for a missing attribute. Compare against ''
        // explicitly: empty() would also discard the valid value "0".
        $urlValue = $element->getAttribute($urlAttribute);
        if ('' === $urlValue) {
            continue;
        }

        if ($this->config->ignoreWhiteSpaces) {
            $urlValue = trim($urlValue);
        }

        $assets[] = new Asset($urlValue, $parentPage, $element->ownerDocument->saveHTML($element), $type);
    }

    return $assets;
}
/**
 * Store the URI of the first <img> found in $this->html in $this->image_uri.
 *
 * @return $this|null The instance itself, or null when the markup has no image.
 */
public function findImage()
{
    $sources = (new Crawler($this->html))->filterXpath('//img')->extract(['src']);

    if (!$sources) {
        return null;
    }

    $this->image_uri = $this->getUri($sources[0]);

    return $this;
}
/**
 * Read the text of every `//body/p` element and group consecutive
 * paragraphs into pairs.
 *
 * Example: paragraphs "A", "B", "C" produce [["A", "B"], ["C"]].
 *
 * @param string $html Markup to parse.
 * @return array List of rows, each holding up to two paragraph texts.
 */
public function extractAction($html)
{
    $crawler = new Crawler();
    $crawler->add($html);

    // Each extracted row is [text, class]; only the text is used.
    $paragraphs = $crawler->filterXpath('//body/p')->extract(array('_text', 'class'));

    $texts = array();
    foreach ($paragraphs as $paragraph) {
        $texts[] = $paragraph[0];
    }

    return array_chunk($texts, 2);
}
/**
 * Extract [id, title, description, image URL] rows from a page's table markup.
 *
 * The containers at `//html/body/div/div/form/div/div` are re-parsed on their
 * own, then four columns are read from the tables inside them:
 *  - id:          text of every other `th` cell (positions 0, 2, 4, ...)
 *  - title:       `value` attribute of each `input` inside the `th` cells
 *  - description: text of every other `td` cell (positions 0, 2, 4, ...)
 *  - image URL:   `src` of each `td/a/img`
 *
 * One row is produced per id. A column with fewer entries than there are ids
 * contributes null for the missing positions.
 *
 * @param string $url Despite the name, this is the markup handed to Crawler::add().
 * @return array List of rows shaped [id, title, description, imageUrl].
 */
public function extractAction($url)
{
    // Takes the first requested attribute of every extract() row, optionally
    // keeping only the rows at even positions.
    $firstColumn = function (array $rows, $evenRowsOnly) {
        $values = [];
        foreach ($rows as $position => $row) {
            if ($evenRowsOnly && $position % 2 != 0) {
                continue;
            }
            $values[] = $row[0];
        }

        return $values;
    };

    $page = new Crawler();
    $page->add($url);

    // Re-serialise the matched containers so the XPaths below are evaluated
    // against a document made of those fragments only.
    $containerHtml = '';
    foreach ($page->filterXpath('//html/body/div/div/form/div/div') as $domElement) {
        $containerHtml .= $domElement->ownerDocument->saveHTML($domElement);
    }

    $tables = new Crawler();
    $tables->add($containerHtml);

    $headerCells = $tables->filterXpath('//html/body/div/table/tr/th');

    $ids = $firstColumn($headerCells->extract(['_text', 'class']), true);
    $descriptions = $firstColumn(
        $tables->filterXpath('//html/body/div/table/tr/td')->extract(['_text', 'class']),
        true
    );
    $imageUrls = $firstColumn(
        $tables->filterXpath('//html/body/div/table/tr/td/a/img')->extract(['src', 'img']),
        false
    );

    // The title inputs are looked up in a document made of the header cells only.
    $headerHtml = '';
    foreach ($headerCells as $domElement) {
        $headerHtml .= $domElement->ownerDocument->saveHTML($domElement);
    }

    $headers = new Crawler();
    $headers->add($headerHtml);

    $titles = $firstColumn(
        $headers->filterXpath('//html/body/th/input')->extract(['value', 'input']),
        false
    );

    // Assemble one row per id. isset() guards keep a short column from raising
    // undefined-offset notices; the missing cell is null either way.
    $rows = [];
    foreach ($ids as $index => $id) {
        $rows[$index] = [
            $id,
            isset($titles[$index]) ? $titles[$index] : null,
            isset($descriptions[$index]) ? $descriptions[$index] : null,
            isset($imageUrls[$index]) ? $imageUrls[$index] : null,
        ];
    }

    return $rows;
}
/**
 * Extract title, link and unit price of every product matched by
 * self::PRODUCT_XPATH.
 *
 * @param string $html Markup of the page to parse.
 * @return array List of ['title' => string, 'link' => string|null, 'price' => string|null];
 *               'price' is null when the price text contains no number.
 */
private function getFirstPageAttributes($html)
{
    $crawler = new DomCrawler\Crawler($html);

    return $crawler->filterXpath(self::PRODUCT_XPATH)->each(function (DomCrawler\Crawler $node, $i) {
        $descXpath = '//div[contains(concat(" ", normalize-space(@class), " "), " productInfo ")]/h3/a';
        $priceXpath = '//p[contains(concat(" ", normalize-space(@class), " "), " pricePerUnit ")]';

        // First number in the text, with an optional "." or "," decimal part.
        // The previous pattern stopped after a single decimal digit
        // ("3.25" was captured as "3.2") and accepted "|" as a separator.
        $priceRegEx = '/[0-9]+(?:[.,][0-9]+)?/';

        $thisLink = $node->filterXPath($descXpath)->first();
        $thisPriceText = trim($node->filterXPath($priceXpath)->first()->text());

        // Avoid reading an undefined offset when the text holds no number.
        $price = preg_match($priceRegEx, $thisPriceText, $priceMatch) ? $priceMatch[0] : null;

        return array(
            'title' => trim($thisLink->text()),
            'link' => $thisLink->attr('href'),
            'price' => $price,
        );
    });
}