/**
 * Scrapes one page of search results and appends module metadata to the results CSV.
 *
 * Every <cite> element of the search response is treated as a URL. The linked page is
 * fetched and its name plus its 7.x / 8.x version strings are extracted. Pages that
 * expose neither version are skipped. Afterwards the stored limit is advanced by 10.
 */
public function run()
{
    $c = new Color();
    $resultsFile = ROOT . '/lib/results.csv';

    $response = $this->client->request('GET', $this->search . '&start=' . $this->getLimit());
    $crawler = new Crawler();
    $crawler->addContent((string) $response->getBody());

    // Existing CSV content; new rows are appended and the whole file is rewritten below.
    $data = (string) file_get_contents($resultsFile);

    foreach ($crawler->filter('cite') as $url) {
        $row = ['name' => '', 'd7' => '', 'd8' => '', 'url' => ''];
        echo "Found: {$url->nodeValue}" . PHP_EOL;

        // Attempt to make a request to D.O to get details about the module.
        $res = $this->client->request('GET', $url->nodeValue);
        // Every 4xx/5xx status is a failure, including 400 itself.
        if ($res->getStatusCode() >= 400) {
            echo $c("Unable to fetch data")->red() . PHP_EOL;
            continue;
        }

        $body = (string) $res->getBody();
        if (empty($body)) {
            echo $c('Unable to fetch body')->red() . PHP_EOL;
            continue;
        }

        // Reuse the string that was already read instead of passing the stream object again.
        $crawl = new Crawler();
        $crawl->addContent($body);

        // Add the known elements.
        $row['name'] = trim($crawl->filter('#page-subtitle')->text());
        $row['url'] = trim($url->nodeValue);

        // The help block often has information about the status to D8.
        $helpBlock = $crawl->filter('.help');
        if (count($helpBlock)) {
            if (strpos($helpBlock->text(), 'ported to Drupal 8') !== false) {
                $row['d8'] = 'In progress';
            }
        }

        foreach ($crawl->filter('[data-th="Version"]') as $versionNode) {
            $version = $versionNode->nodeValue;
            if (strpos($version, '7.x') !== false) {
                $row['d7'] = trim($version);
            } elseif (strpos($version, '8.x') !== false) {
                // The original read the misspelled, undefined $verison here.
                $row['d8'] = trim($version);
            }
        }

        // Neither a 7.x nor an 8.x version was found - skip this module.
        if (empty($row['d7']) && empty($row['d8'])) {
            echo $c('<bg_yellow>This is a Drupal 6 module</bg_yellow>')->colorize() . PHP_EOL;
            continue;
        }

        // NOTE(review): values are joined without CSV escaping; a comma in a name would shift columns.
        $data .= implode(',', array_values($row)) . "\n";
        echo $c('Successfully added metadata')->green() . PHP_EOL;
    }

    // file_put_contents opens, writes and closes the file; the original never closed its handle.
    file_put_contents($resultsFile, $data);

    // Increment the limit.
    $limit = $this->getLimit() + 10;
    echo $c('Updating the limit from <yellow>' . $this->getLimit() . '</yellow> to <yellow>' . $limit . '</yellow>')->colorize() . PHP_EOL;
    $this->setLimit($limit);
}
/**
 * Returns the Crawler for the current response, creating it on first access.
 *
 * The crawler is constructed with the string form of getUri() as its second
 * argument and is filled with the response body, typed by the response's
 * Content-Type header line. Later calls return the same instance.
 *
 * @return Crawler
 */
public function getCrawler()
{
    if ($this->crawler instanceof Crawler) {
        return $this->crawler;
    }

    $this->crawler = new Crawler('', $this->getUri()->toString());

    $markup = $this->getResponse()->getBody()->__toString();
    $contentType = $this->getResponse()->getHeaderLine('Content-Type');
    $this->crawler->addContent($markup, $contentType);

    return $this->crawler;
}
/**
 * MetaDataHelper constructor.
 *
 * Accepts either a ready-made Crawler, which is used as is, or a markup string,
 * which is loaded into a new Crawler.
 *
 * @param string|Crawler $data
 *
 * @throws \InvalidArgumentException When $data is neither a string nor a Crawler.
 */
public function __construct($data)
{
    if ($data instanceof Crawler) {
        $this->crawler = $data;

        return;
    }

    if (is_string($data)) {
        $this->crawler = new Crawler();
        $this->crawler->addContent($data);

        return;
    }

    // InvalidArgumentException still extends \Exception, so existing catch blocks keep working.
    throw new \InvalidArgumentException('Parameter must be either a string or an instance of \\Symfony\\Component\\DomCrawler\\Crawler.');
}
/**
 * @inheritdoc
 *
 * Builds one Result per ".box-row" element of the result list found in the
 * response body. Each result gets its attribute tags, an address tag, title,
 * optional description, id and URL.
 */
protected function parse(Requests_Response $requests)
{
    $page = new Crawler();
    $page->addContent($requests->body);

    $rows = $page->filter("#page > main > section > div > div.result-item-list article a > .box-row");
    $results = [];

    /** @var DOMElement $row */
    foreach ($rows as $row) {
        $item = new Crawler();
        $item->add($row);

        // Attribute entries: child node 1 is used as the label and child node 3 as the value.
        $tags = [];
        /** @var DOMElement $attribute */
        foreach ($item->filter(".box-row ul.box-row-item-attribute-list li") as $attribute) {
            $children = $attribute->childNodes;
            if ($children === null || $children->length < 4) {
                continue;
            }
            $tags[] = $children->item(1)->nodeValue . ": " . $children->item(3)->nodeValue;
        }

        $street = $item->filter(".item-title--street");
        $tags[] = "Adresse: " . $street->text() . " " . $street->siblings()->text();

        $result = new Result();
        $result->setTags($tags);
        $result->setTitle(trim($item->filter("h2")->text()));

        // NOTE(review): "item-description" has no leading dot, so it selects an element of that
        // name rather than a class - confirm this is intended.
        if ($item->filter("item-description p")->valid()) {
            $result->setDescription($item->filter("item-description p")->text());
        }

        // The href is read from the row's parent node.
        $link = $row->parentNode->attributes->getNamedItem("href")->nodeValue;
        $linkParts = explode("/", $link);
        $result->setId($this->getName() . "_" . $linkParts[2]);
        $result->setUrl("http://m.homegate.ch/" . $link);

        $results[] = $result;
    }

    return $results;
}
/**
 * Downloads a listing page, parses every product linked from it and collects the results.
 *
 * @param string $url The url to scrape.
 * @return \Slice\CliApp\ScrapeResults The results of the scrape task.
 * @throws ScrapeException When the product list or a product link cannot be parsed,
 *                         or when the page contains no products.
 */
public function getProductsForUrl($url)
{
    // Fetch the remote document and load it into a DOM crawler.
    $rawHTML = $this->downloader->download($url);
    $page = new Crawler();
    $page->addContent($rawHTML);

    // Locate all product list entries via XPath; an invalid-argument failure becomes a
    // ScrapeException so the command can handle it uniformly.
    try {
        $products = $page->filterXPath($this->productListXpath);
    } catch (\InvalidArgumentException $e) {
        throw new ScrapeException($this->configValues['error_msg']['product_parse_error']);
    }

    // A page without any product entries is not supported.
    if (count($products) === 0) {
        throw new ScrapeException($this->configValues['error_msg']['no_products']);
    }

    // Resolve each entry's detail page link, parse that page and store the product.
    $products->each(function ($entry) {
        try {
            $detailUrl = $entry->filterXPath($this->pdpLinkXpath)->attr('href');
        } catch (\InvalidArgumentException $e) {
            throw new ScrapeException($this->configValues['error_msg']['product_parse_error']);
        }

        $this->results->addProduct($this->pdpParser->parseUrl($detailUrl));
    });

    return $this->results;
}
/**
 * Fetches the "/LaundryState" page of a laundry place and stores its state in the given array.
 *
 * On success the array receives:
 *  - 'html':      the crawler's HTML after all image sources were rewritten to absolute URLs,
 *  - 'busy':      the number of "bgColor=Red" occurrences in the response body,
 *  - 'available': the number of "bgColor=Green" occurrences in the response body.
 * If any \Exception is thrown, all three keys are set to self::NETWORK_ERROR.
 *
 * @param array $laundryPlace Passed by reference; its 'url' key is used as the client base URL.
 */
public function setLaundryState(&$laundryPlace)
{
    $user = '******';
    $pass = '******';

    // Silence libxml parse warnings while crawling. The previous setting is remembered so it
    // can be restored on the failure path too; the original left it enabled after an exception.
    $previousLibxmlSetting = libxml_use_internal_errors(true);

    try {
        $client = new Client($laundryPlace['url']);
        $request = $client->get('/LaundryState', [], [
            'auth' => [$user, $pass, 'Digest'],
            'timeout' => 1.5,
            'connect_timeout' => 1.5,
        ]);
        $response = $request->send();
        $body = $response->getBody();

        $crawler = new Crawler();
        $crawler->addContent($body);

        // Rewrite image sources to absolute URLs on the laundry host.
        foreach ($crawler->filter('img') as $img) {
            $resource = $img->getAttribute('src');
            $img->setAttribute('src', 'http://129.241.126.11/' . trim($resource, '/'));
        }

        // NOTE(review): this looks like a leftover placeholder (the original carried a commented-out
        // stylesheet <link> next to it) - kept because it affects the crawler content.
        $crawler->addHtmlContent('<h1>foobar</h1>');
        $laundryPlace['html'] = $crawler->html();

        // Count machines by the colour markers found in the raw markup.
        preg_match_all('/bgColor=Green/', $body, $greenMatches);
        preg_match_all('/bgColor=Red/', $body, $redMatches);
        $laundryPlace['busy'] = count($redMatches[0]);
        $laundryPlace['available'] = count($greenMatches[0]);
    } catch (\Exception $e) {
        $laundryPlace['available'] = self::NETWORK_ERROR;
        $laundryPlace['busy'] = self::NETWORK_ERROR;
        $laundryPlace['html'] = self::NETWORK_ERROR;
    }

    libxml_use_internal_errors($previousLibxmlSetting);
}
/**
 * @inheritdoc
 *
 * Builds one Result per list entry matched by the fixed content XPath, filling in
 * title, tags, id, URL, price and description from the entry's markup.
 */
protected function parse(Requests_Response $requests)
{
    $page = new Crawler();
    $page->addContent($requests->body);

    $entries = $page->filterXPath('//*[@id="content"]/div/div[2]/div[1]/div[1]/ul/li');
    $results = [];

    /** @var DOMElement $entry */
    foreach ($entries as $entry) {
        $item = new Crawler();
        $item->add($entry);

        $tags = [];
        /** @var DOMElement $tagNode */
        foreach ($item->filter(".horizontal-separated-list li") as $tagNode) {
            $tags[] = $tagNode->textContent;
        }

        $result = new Result();
        $result->setTitle(trim($item->filter(".details a")->text()));
        $result->setTags($tags);

        // The id is the part after "--" in the third "/"-separated segment of the link path.
        $relUrl = $item->filter(".details a")->attr("href");
        $path = parse_url($relUrl)["path"];
        $segment = explode("/", $path)[2];
        $id = explode("--", $segment)[1];

        $result->setId($this->getName() . "_" . intval($id));
        $result->setUrl("http://www.anibis.ch/" . $relUrl);
        $result->setPrice($item->filter(".price")->text());
        $result->setDescription($item->filter(".details .description")->text());

        $results[] = $result;
    }

    return $results;
}
/**
 * Requests the given URI and prepares the crawler, matcher and meta data helper for it.
 *
 * Redirects are followed (at most 10); the current URI is updated on every redirect.
 *
 * NOTE(review): $method, $parameters, $cookies and $files are accepted but never used -
 * the request is always sent through the client's get(). Confirm whether that is intended.
 *
 * @param string $method
 * @param string $uri
 * @param array  $parameters
 * @param array  $cookies
 * @param array  $files
 *
 * @return $this
 */
private function makeRequest($method, $uri, $parameters = [], $cookies = [], $files = [])
{
    $this->initialUri = $uri;
    $this->currentUri = $uri;

    $redirectSettings = [
        'max' => 10,
        'strict' => true,
        'referer' => true,
        // Keep track of where the redirect chain currently points.
        'on_redirect' => function (RequestInterface $request, ResponseInterface $response, UriInterface $uri) {
            $this->currentUri = (string) $uri;
        },
        'track_redirects' => true,
    ];

    $this->matcher = new PlainTextMatcher($this);
    $this->client = new Client();

    // The value returned by get() is stored under both properties.
    $this->request = $this->response = $this->client->get($uri, ['allow_redirects' => $redirectSettings]);
    $this->body = $this->response->getBody()->getContents();

    $this->crawler = new Crawler();
    $this->crawler->addContent($this->body);
    $this->metaDataHelper = new MetaDataHelper($this->crawler);

    return $this;
}
/**
 * Execute the console command.
 *
 * Loads a single RssFeed record, extracts one text value per configured XPath pointer
 * from its HTML, then saves the organisation and attaches a saved location to it.
 *
 * @return mixed
 */
public function handle()
{
    foreach (RssFeed::limit(1)->get() as $feed) {
        // Start from an empty set for every feed; the original relied on an undefined
        // variable that would also carry values over between feeds.
        $data = [];

        $this->crawler->addContent($feed->html);

        foreach ($this->pointers as $key => $value) {
            try {
                $data[$key] = $this->crawler->filterXPath($value)->text();
            } catch (\Exception $e) {
                // A pointer that cannot be extracted is stored as null.
                $data[$key] = null;
            }
        }

        $data['url'] = $feed->url;

        $organisation = $this->saveOrganisation($data);
        $location = $this->saveLocation();
        $organisation->location()->save($location);

        // NOTE(review): the vacancy is looked up or instantiated but never used or saved here -
        // this looks unfinished.
        $vacancy = Vacancy::firstOrNew(['ref' => $data['ref']]);
    }
}
/**
 * Stores the URL, the UTF-8 converted response body and a Crawler loaded with that body.
 *
 * @param ResponseInterface $res Response whose body is converted via Encoding::convertToUtf8().
 * @param mixed             $url Stored unchanged in $this->url.
 */
public function __construct(ResponseInterface $res, $url)
{
    $this->url = $url;

    $utf8Body = Encoding::convertToUtf8($res->getBody());
    $this->rawDocument = $utf8Body;

    $document = new Crawler();
    $document->addContent($utf8Body);
    $this->document = $document;
}
/**
 * Upload and save word content into article
 *
 * First every file is registered as a Document and execute() is run. Then one Article
 * per file is persisted. The article title comes from the "dc:title" metadata, else from
 * the first bold text of the first paragraph, else from the file name.
 *
 * @param Array $files Entries with "fileName" and "path" keys.
 * @param String $type Content type set on every created article.
 * @return boolean Always true.
 */
public function convert($files, $type)
{
    foreach ($files as $file) {
        $document = new Document($file["fileName"], $file["path"]);
        $this->addDocument($document);
        $this->execute();
    }

    foreach ($files as $file) {
        $fileName = $file["fileName"];
        $document = $this->getDocument($fileName);
        $metadata = $document->getMetadata();
        $author = $metadata->get("Author");
        $title = $metadata->get("dc:title");

        // Loose comparison on purpose: a missing title may come back as null or "".
        if ($title == "") {
            // Drop content left over from a previous document; otherwise the selector below
            // keeps matching the first document that was ever loaded into the shared crawler.
            $this->crawler->clear();
            $this->crawler->addContent($document->getContent());

            // text() throws on an empty node list, which made the file-name fallback unreachable.
            $titleNodes = $this->crawler->filter('body p:first-child b');
            $title = count($titleNodes) > 0 ? $titleNodes->text() : "";

            if ($title == "") {
                $title = $fileName;
            }
        }

        //Create new Article and add images as Files.
        $article = new Article();
        $authors = new ArrayCollection();
        $authors->add($author);
        //$article->setAuthors($authors);
        $article->setContentType($type);
        $article->setTitle($title);
        $article->setContent($document->getContent());

        $references = new ArrayCollection();
        $dm = $this->doctrineMongodb->getManager();

        foreach ($document->images as $image) {
            $uploadedFile = new UploadedFile("upload/" . $document->getName() . "/" . $image, 'original', 'mime/original', 123, UPLOAD_ERR_OK, true);
            // Separate name so the outer loop's $file is not overwritten.
            $imageFile = new File();
            $imageFile->setFile($uploadedFile);
            $references->add($imageFile);
        }
        //$article->setReferences($references);

        $dm->persist($article);
        $dm->flush();
    }

    return true;
}
/**
 * The iframe extractor is expected to produce three entries for the embedded videos fixture.
 */
public function testExtract()
{
    $fixture = file_get_contents(__DIR__ . '/../../Resources/embedded_videos.html');

    $crawler = new Crawler();
    $crawler->addContent(html_entity_decode($fixture));

    $extractor = new IframeExtractor(TubeLink::create());

    $this->assertCount(3, $extractor->extract($crawler));
}
/**
 * Extracts the trend names from an encoded response body and stores them in $this->trends.
 *
 * The body is JSON-decoded into an array; the markup under its 'module_html' key is
 * searched for ".trend-item" elements, whose "data-trend-name" attributes are collected.
 *
 * @param string $fetched_body_encoded JSON document containing a 'module_html' entry.
 */
private function parseTrendsFromResponse($fetched_body_encoded)
{
    $decoded = json_decode($fetched_body_encoded, true);
    $moduleHtml = $decoded['module_html'];

    $crawler = new Crawler();
    $crawler->addContent($moduleHtml);

    $readTrendName = function (Crawler $trendItem) {
        return $trendItem->attr('data-trend-name');
    };

    $this->trends = $crawler->filter('.trend-item')->each($readTrendName);
}
/**
 * Parses every item block found in the given markup.
 *
 * When $this->items is set it is used as a filter selector to pick the blocks;
 * otherwise the crawler's own nodes are used. Each block is handed to parseItem().
 *
 * @param string $body
 * @return Item[]
 */
public function parseItems($body)
{
    $crawler = new Crawler();
    $crawler->addContent($body);

    if ($this->items) {
        $blocks = $crawler->filter($this->items);
    } else {
        $blocks = $crawler;
    }

    // each() collects the callback's return values in node order.
    return $blocks->each(function (Crawler $block) {
        return $this->parseItem($block);
    });
}
/**
 * Builds the lunch menu: first the text of the table cell, then one item per bold list entry.
 *
 * @param string $html
 * @return \Lunchbot\Menu\LunchMenuItem[]
 */
public function parseHtml(string $html) : array
{
    $crawler = new Crawler();
    $crawler->addContent($html);

    // Replace non-breaking spaces with regular ones before trimming.
    $cellText = $crawler->filter('.entry-content table td')->text();
    $soupText = trim(preg_replace('~\\x{00a0}~siu', ' ', $cellText));

    $menu = [new LunchMenuItem($soupText)];

    $dishNodes = $crawler->filter('.entry-content ul:first-of-type li strong');
    foreach ($dishNodes as $dishNode) {
        $menu[] = new LunchMenuItem(trim($dishNode->nodeValue));
    }

    return $menu;
}
/**
 * A response containing the "no (valid) membership" alert must make the validator reject the card.
 *
 * @test
 */
public function itReceivesAInvalidCard()
{
    $card = new Card('invalidcard');

    // Canned portal response for an unknown relation number.
    $invalidResponse = ' <div class="alert alert-danger" role="alert"> De relatie <strong>invalidcard</strong> heeft geen (geldig) lidmaatschap. </div>';
    $responseCrawler = new Crawler();
    $responseCrawler->addContent($invalidResponse);

    $this->client
        ->request('POST', 'https://mijn.nabv.nl/portal/controle', ['relation_number' => $card->toString()])
        ->willReturn($responseCrawler);

    $isValid = $this->cardValidator->isValidCard(new Card('invalidcard'));

    $this->assertFalse($isValid);
}
/**
 * Prints the trimmed text of every node matching the configured XPath expression.
 *
 * Nodes whose trimmed text is empty are skipped. The given result is returned unchanged.
 *
 * @param Result $result
 *
 * @return Result
 */
public function execute(Result $result)
{
    $document = new DOMCrawler();
    $document->addContent($result->getData());

    $printText = function (DOMCrawler $node) {
        $text = trim($node->text());
        if (empty($text)) {
            return;
        }

        echo $text . PHP_EOL;
    };

    $document->filterXPath($this->xpath)->each($printText);

    return $result;
}
/**
 * Runs all extractors over the entity-decoded markup, then passes the merged result through all filters.
 *
 * @param $string
 * @return array
 */
public function extract($string)
{
    $crawler = new Crawler();
    $crawler->addContent(html_entity_decode($string));

    // Collect what every extractor finds.
    $found = [];
    foreach ($this->extractorList as $extractor) {
        $found = array_merge($found, $extractor->extract($crawler));
    }

    // Each filter receives the output of the previous one.
    foreach ($this->filterList as $filter) {
        $found = $filter->filter($found);
    }

    return $found;
}
/**
 * Fetches the given URI and returns the href of every tent.io profile <link> element in the response.
 *
 * @param string $uri
 * @return array
 * @throws \RuntimeException When the HTTP request fails.
 */
protected function discoverProfileUrisFromHtml($uri)
{
    try {
        $response = $this->httpClient->get($uri)->send();
    } catch (\Exception $e) {
        throw new \RuntimeException("Entity could not be found", 0, $e);
    }

    $document = new Crawler();
    $document->addContent($response->getBody());

    $profileLinks = $document->filter('link[rel="https://tent.io/rels/profile"]');

    $hrefs = [];
    foreach ($profileLinks as $profileLink) {
        $hrefs[] = $profileLink->getAttribute('href');
    }

    return $hrefs;
}
/**
 * Loads the specification from a XML file
 *
 * Every <function> node of the file is converted with getMethod() and added to the specification.
 *
 * @param string $file Path to the file to load
 * @throws \RuntimeException When the file does not exist.
 */
public function load($file)
{
    if (!file_exists($file)) {
        throw new \RuntimeException(sprintf('The file "%s" does not exists', $file));
    }

    // HACK: DOMNode seems to bug when a node is named "param", so opening and closing
    // tags are renamed before parsing. The two replacements are applied one after the other.
    $content = str_replace(
        ['<param', '</param'],
        ['<parameter', '</parameter'],
        file_get_contents($file)
    );

    $document = new Crawler();
    $document->addContent($content, 'xml');

    foreach ($document->filterXPath('//function') as $functionNode) {
        $this->specification->addMethod($this->getMethod($functionNode));
    }
}
/**
 * Checks that exactly one mail was collected for the recent client and that it is the
 * activation notification: expected subject and recipient, two children, the activation
 * paragraph in the second child and an activation link in the first child.
 *
 * @Then I should have an activation email
 */
public function iShouldHaveAnActivationEmail()
{
    $client = $this->recentClient;

    /** @var \Symfony\Bundle\SwiftMailerBundle\DataCollector\MessageDataCollector $mailCollector */
    $mailCollector = $client->getProfile()->getCollector('swiftmailer');
    Assertion::eq(1, $mailCollector->getMessageCount());

    /** @var \Swift_Message $message */
    $message = $mailCollector->getMessages()[0];
    Assertion::eq($message->getSubject(), 'Sententiaregum Notifications');
    Assertion::eq(key($message->getTo()), '*****@*****.**');

    // Verify the number of children before indexing into them, so a malformed mail fails
    // with an assertion instead of an undefined-offset error.
    $children = $message->getChildren();
    Assertion::count($children, 2);

    $crawler = new Crawler();
    $crawler->addContent($children[1]->getBody());
    Assertion::eq(1, $crawler->filter('#n-ac-l-p')->count());

    Assertion::notEq(0, preg_match('/!\\/activate\\/(.*)/', $children[0]->getBody()));
}
/**
 * Checks that exactly one mail was collected and that it is the German notification:
 * expected subject and recipient, two children, the activation paragraph in the second
 * child and an activation link in the first child.
 *
 * @Then /^I should've gotten an email$/
 */
public function checkEmail()
{
    /** @var \Symfony\Bundle\SwiftmailerBundle\DataCollector\MessageDataCollector $profile */
    $profile = $this->apiContext->getProfile()->getCollector('swiftmailer');
    Assertion::eq(1, $profile->getMessageCount());

    /** @var \Swift_Message $message */
    $message = $profile->getMessages()[0];

    // user registers as "de", so email should be in german
    Assertion::eq($message->getSubject(), 'Benachrichtigungen von Sententiaregum');
    Assertion::eq(key($message->getTo()), '*****@*****.**');

    // Verify the number of children before indexing into them, so a malformed mail fails
    // with an assertion instead of an undefined-offset error.
    $children = $message->getChildren();
    Assertion::count($children, 2);

    $crawler = new Crawler();
    $crawler->addContent($children[1]->getBody());
    Assertion::eq(1, $crawler->filter('#n-ac-l-p')->count());

    Assertion::notEq(0, preg_match('/\\/activate\\/(.*)/', $children[0]->getBody()));
}
/**
 * Persists the publications whose reference is not stored yet and links each one to a KnownLink.
 *
 * For every new publication its link URL is requested and the last entry of the response's
 * "Location" header is taken as the final URL. An existing KnownLink with that URL is
 * reused. Otherwise a new one is created, with a category guessed from the response.
 * Publications whose request fails are skipped. Everything is flushed once at the end.
 *
 * @param array $publications
 */
public function save(array $publications)
{
    if (0 === count($publications)) {
        return;
    }

    // Fetched once and reused for every lookup below.
    $knownLinkRepository = $this->_em->getRepository('ECENetagoraBundle:KnownLink');
    $references = $this->getExistingReferences($publications);

    foreach ($publications as $publication) {
        if (in_array($publication->getReference(), $references)) {
            continue;
        }

        // Scrap the publication and get the latest uri
        try {
            $response = $this->browser->get($publication->getLinkUrl());
        } catch (\Exception $e) {
            // The Http request failed...
            continue;
        }

        $urls = explode("\n", $response->getHeader('Location'));
        $url = array_pop($urls);

        // We found a corresponding KnownLink that we can affect to the publication
        $knownLink = $knownLinkRepository->findOneByUrl($url);
        if ($knownLink) {
            $publication->setKnownLink($knownLink);
            $this->_em->persist($publication);
            continue;
        }

        // Otherwise, we need to create a new KnownLink and guess the category
        $crawler = new Crawler();
        $crawler->addContent($response->getContent());

        $guesser = new CategoryGuesser($url, $response, $crawler);
        $guesser->guess();
        $category = $this->_em->getRepository('ECENetagoraBundle:Category')->findOneByType($guesser->getCategory());

        $knownLink = new KnownLink();
        $knownLink->setCategory($category);
        $knownLink->setUrl($url);
        $knownLink->fromArray($guesser->getMetadata());
        $publication->setKnownLink($knownLink);

        $this->_em->persist($knownLink);
        $this->_em->persist($publication);
    }

    $this->_em->flush();
}
/**
 * Parses the daily menu table into a flat list: a headline item followed by its dishes.
 *
 * A row whose text equals one of the known section headlines opens a section. Following
 * rows with empty leading text are dishes of that section; the first two non-empty cells
 * become title and price. Any other row ends the section. Sections without dishes are omitted.
 *
 * @param string $html
 * @return \Lunchbot\Menu\LunchMenuItem[]
 */
public function parseHtml(string $html) : array
{
    $crawler = new Crawler();
    $crawler->addContent($html);

    // One crawler of child cells per table row.
    $rows = $crawler->filter('table.dailyMenuTable tr')->each(function (Crawler $row) {
        return $row->children();
    });

    $sectionHeadlines = ['POLÉVKY', 'HLAVNÍ JÍDLO', 'MENU 1', 'MENU 2', 'VEGETARIAN'];
    $menus = [];
    $currentKey = null;
    $skip = true;

    foreach ($rows as $cells) {
        $headline = $cells->text();

        // A known headline starts a new section.
        if (in_array($headline, $sectionHeadlines, true)) {
            $currentKey = ucfirst(mb_strtolower(preg_replace('~(\\s+)~', ' ', $headline), 'UTF-8'));
            $menus[$currentKey] = [];
            $skip = false;
            continue;
        }

        // Any other non-empty row closes the section; rows are ignored until the next headline.
        if ($headline !== '' || $skip) {
            $skip = true;
            continue;
        }

        // Dish row: the headline is empty here and a section is open.
        $title = null;
        $price = null;
        foreach ($cells as $cell) {
            if ($cell->nodeValue === '') {
                continue;
            }
            if ($title === null) {
                $title = trim($cell->nodeValue);
            } elseif ($price === null) {
                $price = trim($cell->nodeValue);
            }
        }
        $menus[$currentKey][] = new LunchMenuItem(sprintf('%s %s', $title, $price));
    }

    $result = [];
    foreach ($menus as $headline => $items) {
        if (count($items) === 0) {
            continue;
        }
        $result = array_merge($result, [new LunchMenuItem($headline)], $items);
    }

    return $result;
}
/**
 * Called by the compile method to replace the image sources with image cache sources
 *
 * Every <img ...> tag that declares a width and/or height gets its src replaced by the
 * result of imResize() for the "<width>x<height>" format. Other tags are left untouched.
 *
 * @param string $html
 *
 * @return string
 */
public function convert($html)
{
    preg_match_all('|<img ([^>]+)>|', $html, $matches);

    foreach ($matches[0] as $img) {
        $crawler = new Crawler();
        $crawler->addContent($img);
        $imgTag = $crawler->filter("img");

        $src = $imgTag->attr('src');
        $width = $imgTag->attr('width');
        $height = $imgTag->attr('height');

        // Nothing to rewrite without a src attribute or without any dimension.
        if ($src === null || (empty($width) && empty($height))) {
            continue;
        }

        $format = $width . "x" . $height;
        $resizedSrc = $this->imResize($src, $format);

        // Quote the source so characters such as "?", "." or the "|" delimiter are matched literally.
        $pattern = "| src=[\"']" . preg_quote($src, '|') . "[\"']|";

        // A callback keeps "$1" or "\1" sequences in the new URL from being read as backreferences.
        $updatedTagString = preg_replace_callback($pattern, function () use ($resizedSrc) {
            return " src=\"" . $resizedSrc . "\"";
        }, $img);

        // On a regex failure keep the tag as it is instead of replacing it with nothing.
        if ($updatedTagString === null) {
            continue;
        }

        $html = str_replace($img, $updatedTagString, $html);
    }

    return $html;
}
/**
 * Renders the array data set and checks every table cell against the source data.
 */
public function testRenderArrayData()
{
    $provider = $this->makeArrayDataProvider($this->data);
    $grid = new DatagridKernel($provider, TwigEngineAdapter::getInstance());

    $crawler = new Crawler();
    $crawler->addContent($grid->render());
    $tableRows = $crawler->filter('tr');

    // Each cell's class attribute names the data key whose value it must display.
    $tableRows->each(function ($row, $rowIndex) {
        $row->children()->each(function ($cell) use ($rowIndex) {
            $expected = $this->data['rows'][$rowIndex]['cells'][$cell->attr('class')];
            $this->assertEquals($expected, $cell->text());
        });
    });

    // The number of rendered rows must equal the number of rows in the data set.
    $this->assertEquals(count($this->data['rows']), $tableRows->count());
}
/**
 * Collects the user agents listed on the udger.com detail page of the given bot.
 *
 * The text of every table link whose href contains "/resources/online-parser" is
 * passed to addBotUA().
 *
 * @param string $botName Bot name appended to the detail page URL.
 * @return bool False when the page could not be loaded, true otherwise.
 */
public function parseBotUA($botName)
{
    $dom = $this->getDom('https://udger.com/resources/ua-list/bot-detail?bot=' . $botName);
    if ($dom === false) {
        echo "Can not parse DOM" . PHP_EOL;

        return false;
    }

    $this->currentBotName = $botName;

    $page = new Crawler();
    $page->addContent($dom);

    $links = $page->filter('body #container table tr td > a');
    $links->each(function ($link) {
        if (strpos($link->attr('href'), '/resources/online-parser') === false) {
            return;
        }

        $this->addBotUA($link->text());
    });

    return true;
}
/**
 * addContent() must load HTML and XML strings according to the given type, default to
 * HTML when no type is given, and ignore any other content type.
 *
 * @covers Symfony\Component\DomCrawler\Crawler::addContent
 */
public function testAddContent()
{
    // Explicit HTML type with charset.
    $htmlCrawler = new Crawler();
    $htmlCrawler->addContent('<html><div class="foo"></html>', 'text/html; charset=UTF-8');
    $this->assertEquals('foo', $htmlCrawler->filter('div')->attr('class'), '->addContent() adds nodes from an HTML string');

    // No type given.
    $defaultCrawler = new Crawler();
    $defaultCrawler->addContent('<html><div class="foo"></html>');
    $this->assertEquals('foo', $defaultCrawler->filter('div')->attr('class'), '->addContent() uses text/html as the default type');

    // XML type with charset.
    $xmlCharsetCrawler = new Crawler();
    $xmlCharsetCrawler->addContent('<html><div class="foo"></div></html>', 'text/xml; charset=UTF-8');
    $this->assertEquals('foo', $xmlCharsetCrawler->filter('div')->attr('class'), '->addContent() adds nodes from an XML string');

    // XML type without charset.
    $xmlCrawler = new Crawler();
    $xmlCrawler->addContent('<html><div class="foo"></div></html>', 'text/xml');
    $this->assertEquals('foo', $xmlCrawler->filter('div')->attr('class'), '->addContent() adds nodes from an XML string');

    // Plain text is neither HTML nor XML.
    $plainCrawler = new Crawler();
    $plainCrawler->addContent('foo bar', 'text/plain');
    $this->assertEquals(0, count($plainCrawler), '->addContent() does nothing if the type is not (x|ht)ml');
}
/**
 * Gets the server status for a given server identifier
 *
 * The status page is fetched and parsed. If any \Exception occurs along the way, the
 * model is marked offline and returned with whatever values were set up to that point.
 *
 * @param string $serverId You can find the server ID in your control panel
 *
 * @return MurmurStatusModel
 */
public function getStatus($serverId)
{
    $status = new MurmurStatusModel();

    try {
        $page = new Crawler();
        $page->addContent($this->fetchStatusPage($this->buildUrl($serverId)));

        // Reads the text of the first node matching a selector.
        $textOf = function ($selector) use ($page) {
            return $page->filter($selector)->text();
        };

        $status->setOnline(true);
        $status->setConnectUrl("mumble://" . $textOf('.value-address'));
        $status->setNumberOfClients((int) $textOf('.value-clients .clients-avail'));
        $status->setNumberOfChannels((int) $textOf('.value-channels'));
        // The "/ " prefix is stripped from the maximum before the integer cast.
        $status->setMaxNumberOfClients((int) str_replace('/ ', '', $textOf('.value-clients .clients')));
        $status->setRootChannel($this->parseChannelList($page->filter('.channel-list')->first()));
    } catch (\Exception $e) {
        $status->setOnline(false);
    }

    return $status;
}
/**
 * Extracts today's menu items from the weekly menu table.
 *
 * Rows whose leading word is a key of self::$DAYS act as day headers. Items are collected
 * from the rows after today's header until the next day header or a "Dezert" row.
 *
 * @param string $html
 * @return \Lunchbot\Menu\LunchMenuItem[]
 */
public function parseHtml(string $html) : array
{
    $crawler = new Crawler();
    $crawler->addContent($html);

    $rows = $crawler->filter('div.entry-content table tr')->each(function (Crawler $row) {
        return $row;
    });

    $todayDayOfWeek = (int) $this->today->format('N');
    $insideToday = false;
    $result = [];

    /** @var Crawler $row */
    foreach ($rows as $row) {
        $cells = $row->children();
        $headline = $cells->text();
        $dayOfWeek = \Nette\Utils\Strings::match($headline, '~^(\\w+)\\s~u')[1] ?? null;

        if (isset(self::$DAYS[$dayOfWeek])) {
            if ($insideToday) {
                break; // found tomorrow menu block
            }
            if (self::$DAYS[$dayOfWeek] === $todayDayOfWeek) {
                $insideToday = true;
            }
            continue;
        }

        if (!$insideToday) {
            continue;
        }

        if (\Nette\Utils\Strings::trim($headline) === 'Dezert') {
            break;
        }

        $result[] = new LunchMenuItem($cells->siblings()->text());
    }

    return $result;
}