/**
 * Reads the calendar table and stores the recent games in $this->gamesFromWeb.
 *
 * Rows whose parsed date is not later than two weeks before the current time
 * are ignored. Each remaining row becomes an array with the keys tour,
 * date_time_game, id_team_home, id_team_guest, score_home and score_guest.
 *
 * @param array $teamTournament participants, mapped to alias => id through ArrayHelper::map()
 *
 * @throws \Exception when one of the two team names of a row has no known alias
 */
protected function getGamesFromWeb($teamTournament)
{
    // team alias => participant id
    $aliases = ArrayHelper::map($teamTournament, 'alias', 'id');

    // NOTE(review): the tournament URL is commented out and a local fixture file is
    // loaded instead - confirm this is intended outside of debugging.
    //$html = new Document($this->tournament->autoProcessURL, true);
    $html = new Document('italy.htm', true);
    $table = $html->find('table.table.b-table-sortlist tbody')[0];

    $gamesFromWeb = [];
    foreach ($table->find('tr') as $row) {
        // Parse the date once; it is used both for filtering and as the stored value.
        $time = $this->autoTimeToUnix($row->find('td.sport__calendar__table__date')[0]->text());
        if ($time <= time() - 60 * 60 * 24 * 7 * 2) {
            continue;
        }

        // A single query yields both team links: index 0 is read as home, index 1 as guest.
        $teams = $row->find('td.sport__calendar__table__teams a.sport__calendar__table__team');
        $home = $teams[0]->text();
        $guest = $teams[1]->text();

        if (!isset($aliases[$home]) || !isset($aliases[$guest])) {
            throw new \Exception('Error during alias parsing ' . $home . ' or ' . $guest);
        }

        $game = [];
        $game['tour'] = $row->find('td.sport__calendar__table__tour')[0]->text();
        $game['date_time_game'] = $time;
        $game['id_team_home'] = (int) $aliases[$home];
        $game['id_team_guest'] = (int) $aliases[$guest];

        $homeScore = $row->find('td.sport__calendar__table__result span.sport__calendar__table__result__left')[0]->text();
        $guestScore = $row->find('td.sport__calendar__table__result span.sport__calendar__table__result__right')[0]->text();
        // Both halves of the result are passed through calculateHomeScore(), as before.
        $game['score_home'] = $this->calculateHomeScore($homeScore);
        $game['score_guest'] = $this->calculateHomeScore($guestScore);

        $gamesFromWeb[] = $game;
    }

    $this->gamesFromWeb = $gamesFromWeb;
}
/**
 * Extract comments from DOM.
 *
 * Every ".single-comment" node contributes one entry holding the text of its
 * first ".text" descendant. Nodes without such a descendant are skipped
 * instead of triggering a fatal "call to a member function on null".
 *
 * @param \DiDom\Document $dom
 * @return array list of ['text' => string] entries
 */
public function extractComments($dom)
{
    $comments = [];

    foreach ($dom->find('.single-comment') as $node) {
        $textNodes = $node->find('.text');

        // isset() works for plain arrays as well as ArrayAccess result lists.
        if (!isset($textNodes[0])) {
            continue;
        }

        $comments[] = ['text' => $textNodes[0]->text()];
    }

    return $comments;
}
/**
 * Scrapes an approved-event detail page into $this->data.
 *
 * The page title and every recognised detail panel are copied into
 * $this->data; the result of parse_session() is stored under "session".
 *
 * @param string $url location handed to Document with the file flag set
 * @return array the populated $this->data
 */
function parse_approvedevent($url)
{
    $page = new Document($url, true);

    $rawTitle = $page->find('.descr_approvedevent')[0]->text();
    $this->data['title'] = trim(preg_replace('/\\s{2,}/', '', $rawTitle)); // drop runs of whitespace

    // Panels whose value is simply the text of their first <span>: panel id => data key.
    $spanPanels = [
        'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_OrganiserLocation_Panel' => 'orginiser_location',
        'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_BusinessSectors_Panel' => 'business_sectors',
        'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_Frequency_Panel' => 'frequency',
        'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_EventOpenTo_Panel' => 'event_open_to',
    ];

    foreach ($page->find('div.caracs_approvedevent div') as $panel) {
        $panelId = $panel->attr('id');

        if ($panelId === 'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_OrganizingCompany_Panel') {
            $company = $panel->find('a')[0]->text();
            $this->data['orginizing_company'] = trim(preg_replace('/\\s{2,}/', '', $company));
        } elseif (is_string($panelId) && isset($spanPanels[$panelId])) {
            $this->data[$spanPanels[$panelId]] = $panel->find('span')[0]->text();
        } elseif ($panelId === 'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_Email_Panel') {
            $this->data['email'] = $panel->find('a')[0]->text();
        } elseif ($panelId === 'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_Website_Panel') {
            $this->data['website'] = $panel->find('a')[0]->attr('href');
        } elseif ($panelId === 'ctl00_ContentPlaceHolder1_UFIApprovedEventsDetails1_Links_Panel') {
            foreach ($panel->find('a') as $index => $anchor) {
                $this->data['links'][$index] = $anchor->attr('href');
            }
        }
    }

    $this->data['session'] = $this->parse_session($page);

    return $this->data;
}
/**
 * Re-reads every timed item from its URL and updates title, price and currency.
 *
 * The values are taken from the page's <title> text: the part before the first
 * ":" becomes the item title; the last ":"-separated part, cut at the first "-",
 * is split into an amount and a trailing currency token. When the amount differs
 * from the stored price, the stored price is kept as the old price. All changes
 * are flushed once after the loop.
 *
 * @param InputInterface  $input
 * @param OutputInterface $output
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    $em = $this->getContainer()->get('doctrine')->getManager();
    $items = $em->getRepository("AppBundle:Item")->findTimed();

    $total = sizeof($items);
    $output->writeln('Total count items ' . $total);

    $progress = new ProgressBar($output, $total);
    $progress->start();

    foreach ($items as $item) {
        $progress->advance();

        $page = new Document($item->getUrl(), true);
        $titleNodes = $page->find('title');

        //TODO: this parsing is fragile and depends on the exact title layout
        $colonParts = explode(":", $titleNodes[0]->text());
        $dashParts = explode('-', trim(end($colonParts)));
        $amountWithCurrency = trim($dashParts[0]);

        // The currency is the last space-separated token of the amount string.
        $tokens = explode(' ', $amountWithCurrency);
        $currency = trim(end($tokens));

        $amount = trim(str_replace($currency, '', $amountWithCurrency));
        $amount = trim(str_replace(' ', '', $amount));

        $item->setTitle($colonParts[0]);

        if ($amount != $item->getPrice()) {
            $item->setPriceOld($item->getPrice());
        }

        $item->setPrice($amount);
        $item->setCurrency($currency);
        $item->setLastCheck(new \DateTime());
        $item->setHtmlDocument($page->html());

        $em->persist($item);
    }

    $progress->finish();
    $em->flush();
    $output->writeln('Ok');
}
/**
 * Parses a kat list (a search, user uploads, etc..).
 *
 * Each "tr.odd" / "tr.even" row is turned into a BasicTorrent. Rows whose
 * date lies before $maxAge are left out when $maxAge is non-empty.
 *
 * @param Document $html   the listing page
 * @param int      $maxAge timestamp; rows older than this are skipped
 * @return array list of BasicTorrent objects
 */
public static function parseList($html, $maxAge = null)
{
    $torrents = [];

    foreach ($html->find("tr.odd, tr.even") as $row) {
        $cells = $row->find("td.center");
        $timestamp = strtotime($row->find(".center[title]")[0]->attr("title"));

        if (!empty($maxAge) && $timestamp < $maxAge) {
            continue;
        }

        $mainLink = $row->find("a.cellMainLink")[0];

        // the uploader may be anonymous
        $uploader = null;
        if ($row->has(".lightgrey.block a.plain")) {
            $userLink = $row->find(".lightgrey.block a.plain")[0];
            $uploader = new User([
                "nick" => $userLink->text(),
                "link" => Kat::DOMAIN . $userLink->attr("href"),
            ]);
        }

        $attributes = [
            "name" => $mainLink->text(),
            "size" => $row->find(".nobr.center")[0]->text(),
            "created" => date("Y-m-d H:i:s", $timestamp),
            "magnet" => $row->find("a[data-nop]")[0]->attr("href"),
            "torrent" => "https:" . $row->find("a[data-download]")[0]->attr("href"),
            "creator" => $uploader,
            "files" => $cells[1]->text(),
            "seeders" => $cells[3]->text(),
            "leechers" => $cells[4]->text(),
            "link" => self::DOMAIN . $mainLink->attr("href"),
            "comments" => (int) $row->find('div.iaconbox.center.floatright a')[0]->text(),
        ];

        $torrents[] = new BasicTorrent($attributes);
    }

    return $torrents;
}
/**
 * Class selectors match by a single class and by a combination of classes.
 */
public function testClass()
{
    $markup = ' <span class="odd first">Lorem ipsum dolor.</span> <span class="even second">Tenetur totam, nostrum.</span> <span class="odd third">Iste, doloremque, praesentium.</span> ';
    $document = new Document($markup);

    // Collects the text of every element matched by the given selector.
    $textsFor = function ($selector) use ($document) {
        $texts = [];
        foreach ($document->find($selector) as $match) {
            $texts[] = $match->text();
        }

        return $texts;
    };

    $this->assertEquals(
        ['Lorem ipsum dolor.', 'Iste, doloremque, praesentium.'],
        $textsFor('.odd')
    );

    $this->assertEquals(
        ['Iste, doloremque, praesentium.'],
        $textsFor('.odd.third')
    );
}
/**
 * Reads the per-tour result tables and stores the recent games in $this->gamesFromWeb.
 *
 * For each of the first num_tours table positions that exist, the heading at
 * the same position is converted to a tour number with getTour(). Rows are
 * kept when their parsed date is later than two weeks before now, the tour
 * does not exceed num_tours, and both team links are present.
 *
 * @param array $teamTournament participants, mapped to alias => id through ArrayHelper::map()
 *
 * @throws Exception when one of the two team names of a row has no known alias
 */
protected function getGamesFromWeb($teamTournament)
{
    // team alias => participant id
    $aliases = ArrayHelper::map($teamTournament, 'alias', 'id');
    $count = $this->tournament->num_tours;

    $html = new Document($this->tournament->autoProcessURL, true);
    //$html = new Document('pl.htm', true);
    $results = $html->find('div.mainPart')[0];

    // These lists do not change while iterating, so they are queried once
    // instead of on every pass through the loop.
    $tables = $results->find('div.stat.mB15 table.stat-table');
    $headings = $html->find('h3.titleH3.bordered.mB10');

    $gamesFromWeb = [];
    for ($i = 0; $i < $count; $i++) {
        if (!isset($tables[$i])) {
            continue;
        }

        $tour = $this->getTour($headings[$i]->text());

        foreach ($tables[$i]->find('tbody tr') as $row) {
            // Parse the date once; it is used both for filtering and as the stored value.
            $time = $this->autoTimeToUnix($row->find('td.name-td')[0]->text());
            if (!($time > time() - 60 * 60 * 24 * 7 * 2 && $tour <= $count)) {
                continue;
            }

            $owners = $row->find('td.owner-td a.player');
            $guests = $row->find('td.guests-td a.player');
            if (!isset($owners[0]) || !isset($guests[0])) {
                // Rows without both team links are silently ignored, as before.
                continue;
            }

            $owner = $owners[0]->text();
            $guest = $guests[0]->text();

            if (!isset($aliases[$owner]) || !isset($aliases[$guest])) {
                throw new Exception('Error during alias parsing ' . $owner . ' or ' . $guest);
            }

            $game = [];
            $game['id_team_home'] = (int) $aliases[$owner];
            $game['id_team_guest'] = (int) $aliases[$guest];
            $game['date_time_game'] = (int) $time;
            $game['tour'] = $tour;

            // Both scores are derived from the same score cell text.
            $score = $row->find('td.score-td noindex')[0]->text();
            $game['score_home'] = $this->calculateHomeScore($score);
            $game['score_guest'] = $this->calculateGuestScore($score);

            $gamesFromWeb[] = $game;
        }
    }

    $this->gamesFromWeb = $gamesFromWeb;
}
/**
 * Finds the first non-blank child node that follows a child whose text matches.
 *
 * The direct children of $element are scanned in order. Once a child's text
 * content equals $expression (or, with $grep, matches it as a regular
 * expression), the next child with non-blank text is wrapped in a new
 * document and returned.
 *
 * @param DiDomElement $element    element whose child nodes are scanned
 * @param string       $expression text to compare against, or a PCRE pattern when $grep is true
 * @param bool         $grep       treat $expression as a regular expression
 *
 * @return DiDomDocument|null the document holding the following node, or null when nothing is found
 */
public function findNextNodeByText(DiDomElement $element, $expression, $grep = false)
{
    $childNodes = $element->getNode()->childNodes;
    $length = $childNodes->length;
    $matched = false;

    for ($i = 0; $i < $length; $i++) {
        $node = $childNodes->item($i);
        $text = $node->textContent;

        if ($matched) {
            // Compare against '' explicitly: a node whose text is "0" is not blank,
            // although PHP treats the string "0" as falsy.
            if (trim($text) !== '') {
                $document = new DiDomDocument();
                $document->appendChild($node);

                return $document;
            }

            continue;
        }

        if ($grep) {
            $matched = (bool) preg_match($expression, $text);
        } else {
            // Loose comparison is kept deliberately so existing callers match exactly as before.
            $matched = ($expression == $text);
        }
    }

    return null;
}
/**
 * Loads an HTML string and collects its named meta tags and followable links.
 *
 * Meta tags are keyed by their lower-cased "name" attribute; tags without a
 * name are ignored. Links are the "href" values of anchors that have an href
 * and do not carry the "nofollow" token in their "rel" attribute.
 *
 * The unique links are stored in $this->links and the meta tags in
 * $this->metaTags. The returned "links" entry is the collected list before
 * de-duplication, as it was previously.
 *
 * @param mixed $html markup, cast to string before loading
 * @return array ['links' => string[], 'meta' => array]
 */
public function load($html)
{
    $metaTags = ['canonical' => '', 'robots' => ''];
    $this->crawler->loadHtml((string) $html);

    foreach ($this->crawler->find('meta') as $meta) {
        /** @var Element $meta */
        $name = $meta->attr('name');

        // Tags such as <meta charset> or <meta property> have no name; storing
        // them would pile unrelated values under the '' key.
        if ($name === null || $name === '') {
            continue;
        }

        $metaTags[strtolower($name)] = $meta->attr('content');
    }

    $links = [];
    foreach ($this->crawler->find('a') as $link) {
        /** @var Element $link */
        $href = $link->attr('href');
        if ($href === null) {
            // Anchors without href are not links.
            continue;
        }

        // rel is a space-separated token list, e.g. "nofollow noopener".
        $relTokens = preg_split('/\s+/', strtolower((string) $link->attr('rel')), -1, PREG_SPLIT_NO_EMPTY);
        if (in_array('nofollow', $relTokens, true)) {
            continue;
        }

        $links[] = $href;
    }

    $this->links = array_unique($links);
    $this->metaTags = $metaTags;

    return ['links' => $links, 'meta' => $metaTags];
}
/**
 * An element created by a document reports that document's underlying DOM document.
 */
public function testGetDocument()
{
    $document = new Document($this->loadFixture('posts.html'), false);
    $span = $document->createElement('span', 'value');

    $expected = $document->getDocument();
    $actual = $span->getDocument()->getDocument();

    $this->assertEquals($expected, $actual);
}
/**
 * Builds a new document and appends the current element to it.
 *
 * @return \DiDom\Document the newly created document
 */
public function toDocument()
{
    $wrapper = new Document();
    $wrapper->appendChild($this->domElement);

    return $wrapper;
}
/**
 * Indicates if two documents are the same document.
 *
 * The comparison is made between the root elements; a document without a
 * root element is never considered the same.
 *
 * @param Document|\DOMDocument $document The compared document
 *
 * @return bool
 *
 * @throws \InvalidArgumentException if the provided argument is not an instance of \DOMDocument or \DiDom\Document
 */
public function is($document)
{
    $isWrapped = $document instanceof self;

    if (!$isWrapped && !($document instanceof DOMDocument)) {
        throw new InvalidArgumentException(sprintf('Argument 1 passed to %s must be an instance of %s or DOMDocument, %s given', __METHOD__, __CLASS__, is_object($document) ? get_class($document) : gettype($document)));
    }

    $root = $isWrapped ? $document->getElement() : $document->documentElement;

    return $root !== null && $this->getElement()->isSameNode($root);
}
/**
 * replace() returns the replaced element and puts the new node in its place;
 * the number of remaining <li> elements depends on the second argument.
 */
public function testReplace()
{
    $markup = '<ul><li>One</li><li>Two</li><li>Three</li></ul>';

    // Called with the default second argument: three items remain.
    $document = new Document($markup, false);
    $first = $document->find('li')[0];
    $third = $document->find('li')[2];

    $firstNode = $first->getNode();
    $replaced = $first->replace($third);
    $this->assertEquals($firstNode, $replaced->getNode());

    $itemsAfter = $document->find('li');
    $this->assertEquals($third->getNode(), $itemsAfter[0]->getNode());
    $this->assertCount(3, $document->find('li'));

    // Called with false as the second argument: two items remain.
    $document = new Document($markup, false);
    $first = $document->find('li')[0];
    $third = $document->find('li')[2];

    $firstNode = $first->getNode();
    $replaced = $first->replace($third, false);
    $this->assertEquals($firstNode, $replaced->getNode());

    $itemsAfter = $document->find('li');
    $this->assertEquals($third->getNode(), $itemsAfter[0]->getNode());
    $this->assertCount(2, $document->find('li'));
}
/**
 * For an XML document, __toString() yields the same output as xml().
 */
public function testToStringXml()
{
    $source = $this->loadFixture('books.xml');
    $document = new Document($source, false, 'UTF-8', 'xml');

    $expected = $document->xml();
    $actual = $document->__toString();

    $this->assertEquals($expected, $actual);
}
/**
 * Returns the text of the second ".game-info p" paragraph of a page.
 *
 * @param string $url location handed to Document with the file flag set
 *
 * @return string text of the second matching paragraph
 *
 * @throws \RuntimeException when the page has fewer than two matching paragraphs
 */
function data($url)
{
    $document = new Document($url, true);
    $paragraphs = $document->find('.game-info p');

    // Fail with a clear message rather than "call to a member function text() on null".
    if (!isset($paragraphs[1])) {
        throw new \RuntimeException('Expected at least two ".game-info p" elements at ' . $url);
    }

    return $paragraphs[1]->text();
}
/**
 * Indicates if two documents are the same document.
 *
 * The comparison is made between the root elements. When the compared
 * document has no root element the result is false.
 *
 * @param Document|\DOMDocument $document the compared document
 *
 * @return bool
 *
 * @throws \InvalidArgumentException if the argument is neither a Document nor a DOMDocument
 */
public function is($document)
{
    if ($document instanceof Document) {
        $element = $document->getElement();
    } else {
        if (!$document instanceof DOMDocument) {
            // Report the class name for objects; gettype() alone would only say "object".
            throw new InvalidArgumentException(sprintf('Argument 1 passed to %s must be an instance of %s or %s, %s given', __METHOD__, __CLASS__, 'DOMDocument', is_object($document) ? get_class($document) : gettype($document)));
        }

        $element = $document->documentElement;
    }

    // An empty document has no root element; isSameNode() must not be called with null.
    if ($element === null) {
        return false;
    }

    return $this->getElement()->isSameNode($element);
}
/**
 * The parent of an element created by a document is that very document.
 */
public function testParent()
{
    $document = new Document($this->loadFixture('posts.html'), false);
    $span = $document->createElement('span', 'value');

    $owner = $span->parent();

    $this->assertInstanceOf('DiDom\\Document', $owner);

    $documentRoot = $document->getElement();
    $this->assertTrue($documentRoot->isSameNode($owner->getElement()));
}
/**
 * Gets the full movie crew divided in departments.
 *
 * Section headings ("#fullcredits_content h4") without an "id" or "name"
 * attribute are paired by position with the ".simpleCreditsTable tbody"
 * tables. Every recognised heading yields a key in the result; each linked
 * person whose href contains "name/nm<digits>/" is added under that key.
 *
 * @return array department key => list of ['id' => string, 'name' => string]
 */
public function getCastCredits()
{
    // Heading text => key used in the returned array.
    $departments = [
        "Directed by" => "director",
        "Music by" => "music",
        "Cinematography by" => "cinematography",
        "Film Editing by" => "editing",
        "Casting By" => "casting",
        "Production Design by" => "production_design",
        "Art Direction by" => "art_direction",
        "Set Decoration by" => "set_decoration",
        "Costume Design by" => "costume_design",
        "Makeup Department" => "makeup_department",
        "Production Management" => "production_management",
        "Art Department" => "art_department",
        "Sound Department" => "sound_department",
        "Special Effects by" => "special_effects",
        "Visual Effects by" => "visual_effects",
        "Stunts" => "stunts",
        "Camera and Electrical Department" => "camera_department",
        "Animation Department" => "animation_department",
        "Casting Department" => "casting_department",
        "Costume and Wardrobe Department" => "wardrobe_department",
        "Editorial Department" => "editorial_department",
        "Location Management" => "location_management",
        "Music Department" => "music_department",
        // Previously this label contained a line break and therefore could never
        // equal a trimmed heading.
        "Transportation Department" => "transportation_department",
        "Storyline" => "storyline",
        "Photo & Video" => "photo",
    ];

    $content = new Document($this->getCredits());
    $persons = $content->find(".simpleCreditsTable tbody");

    // skip useless h4s (DiDom doesnt have :not pseudo class, so we filter by hand)
    $titles = [];
    foreach ($content->find("#fullcredits_content h4") as $h4) {
        if ($h4->hasAttribute("id") || $h4->hasAttribute("name")) {
            continue;
        }
        $titles[] = $h4;
    }

    $regex = "/name\\/nm(\\d+)\\/(?:.*)/";
    $crew = [];

    foreach ($titles as $pos => $h4) {
        // Strip ASCII whitespace and the bytes of a UTF-8 non-breaking space; the
        // original character list appears to end with a literal non-breaking space.
        $title = trim($h4->text(), " \t\n\r\v\xC2\xA0");

        if (!isset($departments[$title])) {
            continue;
        }
        $name = $departments[$title];

        // The key is created even when no table is found at this position.
        if (!isset($crew[$name])) {
            $crew[$name] = [];
        }

        if (!isset($persons[$pos])) {
            continue;
        }

        foreach ($persons[$pos]->find("a") as $person) {
            preg_match($regex, $person->attr("href"), $matches);
            if (!isset($matches[1]) || empty($matches[1])) {
                continue;
            }

            $crew[$name][] = ["id" => $matches[1], "name" => trim($person->text())];
        }
    }

    return $crew;
}
<?php

require_once 'vendor/autoload.php';

use DiDom\Document;

// Load the extension listing and print the text of every extension name link,
// one per line of HTML output.
$document = new Document('http://www.opencart.com/index.php?route=extension/extension&filter_license=0', true);
$extensions = $document->find('#content > div.extension-grid > div > div.name > a');

foreach ($extensions as $extension) {
    // The text comes from a remote page and is written into HTML, so it is escaped first.
    echo htmlspecialchars($extension->text(), ENT_QUOTES, 'UTF-8'), "<br>";
}
/**
 * toElement() returns a DiDom\Element instance.
 */
public function testToElement()
{
    $document = new Document($this->loadFixture('posts.html'), false);

    $root = $document->toElement();

    $this->assertInstanceOf('DiDom\\Element', $root);
}
/**
 * For an HTML document, __toString() yields the same output as html().
 */
public function testToString()
{
    $source = $this->loadFixture('posts.html');
    $document = new Document($source, false);

    $expected = $document->html();
    $actual = $document->__toString();

    $this->assertEquals($expected, $actual);
}
/**
 * The parent of an element created by a document is a DiDom\Document.
 */
public function testParent()
{
    $document = new Document('', true);
    $span = $document->createElement('span', 'value');

    $owner = $span->parent();

    $this->assertInstanceOf('DiDom\\Document', $owner);
}
/**
 * Searches for the element in the DOM tree.
 *
 * Delegates to the parent implementation and, unless disabled, wraps the
 * result in a NodeList.
 *
 * @param string $expression XPath expression or CSS selector
 * @param string $type       the type of the expression
 * @param bool   $wrapList   whether to wrap the found nodes in a NodeList
 *
 * @return NodeList|DiDom\Element[]
 */
public function find($expression, $type = Query::TYPE_CSS, $wrapList = true)
{
    $found = parent::find($expression, $type, false);

    if ($wrapList) {
        return new NodeList($found);
    }

    return $found;
}
/**
 * Builds a new document with the given encoding and appends the current node to it.
 *
 * @param string $encoding The document encoding
 *
 * @return \DiDom\Document the newly created document
 */
public function toDocument($encoding = 'UTF-8')
{
    $wrapper = new Document(null, false, $encoding);
    $wrapper->appendChild($this->node);

    return $wrapper;
}
/**
 * Request callback.
 *
 * Invokes $callback immediately with the HTTP response, the result of
 * loading that response as HTML, and this object.
 *
 * @param callable $callback receives ($httpResponse, $dom, $this)
 * @throws \Exception when $callback is not callable
 * @return Curl
 */
public function setCallback($callback)
{
    if (!is_callable($callback)) {
        // Arrays and objects cannot be formatted with %s without a notice or an
        // Error that would hide this exception, so describe them by type.
        if (is_scalar($callback)) {
            $described = (string) $callback;
        } elseif (is_object($callback)) {
            $described = get_class($callback);
        } else {
            $described = gettype($callback);
        }

        throw new \Exception(sprintf('Error: %s is not a valid callable', $described));
    }

    $http_response = $this->getHttpResponse();

    $didom = new Document();
    $dom = $didom->loadHtml($http_response);

    call_user_func_array($callback, [$http_response, $dom, $this]);

    return $this;
}