/**
 * Converts provider output to db's input format.
 *
 * Reads title, description, location, price, surface, floor, canonical URL
 * and the slider images from the parsed page. The "elevator" key is only
 * present when the extras list contains an "Ascensor" entry.
 *
 * @param QueryPath $html Parsed item page.
 *
 * @return mixed (array/boolean) The item data, or false when the surface is 0
 *                               or the description is empty.
 */
public function parseItem($html)
{
    $images = [];

    /*
     transform
         http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg/w_0/c_690x518/p_1/
     to
         http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg
    */
    foreach ($html->find('#containerSlider img') as $img) {
        // Prefer data-src and fall back to src.
        $src = $img->attr("data-src");
        if (empty($src)) {
            $src = $img->attr("src");
        }

        // An <img> without any source used to yield a bogus ".jpg" entry.
        if (empty($src)) {
            continue;
        }

        // Drop the resize suffix that follows the first ".jpg"; URLs without
        // ".jpg" are kept untouched instead of getting ".jpg" appended.
        $cut = strpos($src, ".jpg");
        $images[] = $cut === false ? $src : substr($src, 0, $cut + 4);
    }

    $data = [
        'title' => trim($html->find('.property-title')->text()),
        'description' => trim($html->find('#ctl00_ddDescription .detail-section-content')->text()),
        'images' => $images,
        'location' => trim($html->find('.section.section--noBorder .detail-section-content')->text()),
        'price' => $this->strToNumber($html->find('#priceContainer')->text()),
        'meters' => $this->strToNumber($html->find('#litSurface b')->text()),
        'floor' => (int) $html->find('#litFloor')->text(),
        'url' => $html->find('link[rel="canonical"]')->attr("href"),
    ];

    foreach ($html->find('.detail-extras li') as $li) {
        if (trim($li->text()) === "Ascensor") {
            $data["elevator"] = true;
        }
    }

    // Items without a surface or a description are rejected.
    if ($data["meters"] == 0 || empty($data["description"])) {
        return false;
    }

    return $data;
}
/**
 * Converts provider output to db's input format.
 *
 * The image URLs need the "ch" query parameter taken from the page's
 * og:image URL; without it no usable image URL can be built, so the item is
 * rejected.
 *
 * @param QueryPath $html Parsed item page.
 *
 * @return mixed (array/boolean) The item data, or false when the og:image
 *                               (or its "ch" parameter) is missing, when no
 *                               surface was found, or when the description is
 *                               empty.
 */
public function parseItem($html)
{
    $images = [];

    // get ch var from og image (required to display the images)
    $ogImage = $html->find('[name="og:image"]')->attr("content");
    if (empty($ogImage)) {
        return false;
    }

    // parse_url() gives null/false when there is no query string, so guard
    // before parsing instead of indexing a missing key.
    $queryString = parse_url($ogImage, PHP_URL_QUERY);
    $query = [];
    if (is_string($queryString)) {
        parse_str($queryString, $query);
    }
    if (!isset($query["ch"]) || !is_string($query["ch"]) || $query["ch"] === "") {
        return false;
    }
    $imageCh = $query["ch"];

    /*
     transform
         http://img3.idealista.com/thumbs,W,H,wi,+tSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2
     to
         http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2BtSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2&ch=2106166706
    */
    foreach ($html->find('#main-multimedia img') as $img) {
        $service = $img->attr("data-service");
        // Images without the data-service attribute cannot be converted.
        if (empty($service)) {
            continue;
        }
        $image = str_replace("http://img3.idealista.com/thumbs,W,H,wi,+", "", $service);
        $images[] = "http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2B" . urlencode($image) . "&ch=" . $imageCh;
    }

    $title = trim($html->find('h1.txt-bold span')->text());
    // The location is the title minus its sale/rent prefix.
    $location = str_replace(["Piso en venta en ", "Piso en alquiler en "], "", $title);

    $data = [
        'title' => $title,
        'description' => trim($html->find('.adCommentsLanguage.expandable')->text()),
        'images' => $images,
        'location' => $location,
        'price' => $this->strToNumber($html->find('#main-info .txt-big.txt-bold')->eq(0)->text()),
        'url' => $html->find('#share-link')->attr("href"),
    ];

    // NOTE(review): "meters" is presumably filled in by parseHouseInfo() - confirm.
    foreach ($html->find('#fixed-toolbar .info-data > span') as $item) {
        $text = $item->text();
        $this->parseHouseInfo($text, $data);
    }

    if (!isset($data["meters"]) || $data["meters"] == 0 || empty($data["description"])) {
        return false;
    }

    return $data;
}
/**
 * Run an XSLT transformation against the current source document.
 *
 * The stylesheet is wrapped in a QueryPath object when it is not one already,
 * imported into an XSLT processor, and applied to the document that owns the
 * first top-level match of the source.
 *
 * @param mixed $style
 *  A QueryPath object or <em>any</em> of the types that the
 *  {@link qp()} function can take.
 * @return QueryPath
 *  A QueryPath object wrapping the transformed document. Note that this is a
 *  <em>different</em> document than the original. As such, it has no history.
 *  You cannot call {@link QueryPath::end()} to undo a transformation. (However,
 *  the original source document will remain unchanged.)
 */
public function xslt($style)
{
    $stylesheet = ($style instanceof QueryPath) ? $style : \QueryPath::with($style);

    $source = $this->src->top()->get(0);
    $sourceDocument = $source->ownerDocument;
    $stylesheetDocument = $stylesheet->get(0)->ownerDocument;

    $xslt = new \XSLTProcessor();
    $xslt->importStylesheet($stylesheetDocument);
    $transformed = $xslt->transformToDoc($sourceDocument);

    return \QueryPath::with($transformed);
}
/**
 * Returns the parent iterator's current element wrapped in a QueryPath object.
 *
 * The wrapper is created on first use; on later calls the same wrapper is
 * reused and its matches are replaced by the new current element.
 */
public function current()
{
    if (isset($this->qp)) {
        // Reuse the existing wrapper: point it at the new current element.
        $matches = new \SplObjectStorage();
        $matches->attach(parent::current());
        $this->qp->setMatches($matches);

        return $this->qp;
    }

    $this->qp = \QueryPath::with(parent::current(), NULL, $this->options);

    return $this->qp;
}
/**
 * Parses an HTML document with QueryPath, after re-encoding string input from
 * ISO-8859-1 to UTF-8, and stores the result in self::$qp.
 *
 * Anything the parser prints while loading is captured with an output buffer
 * and sent to Amslib_Debug::log() instead of reaching the output.
 *
 * @param mixed  $document The document to parse. Only non-empty strings are
 *                         re-encoded; NULL and non-string values are passed
 *                         to QueryPath untouched.
 * @param string $selector Optional selector handed to QueryPath::withHTML().
 * @param array  $options  Options handed to QueryPath::withHTML().
 *
 * @return mixed Whatever QueryPath::withHTML() returned (also kept in self::$qp).
 */
public static function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    self::$qp = false;

    // I copied and modified the default options from the htmlqp method to provide a custom version for Amslib
    // NOTE: Hmm....I'm not 100% sure this will work in all circumstances....
    // Only strings can be re-encoded: running iconv() on the default NULL or
    // on an object would discard the document.
    if (is_string($document) && $document !== '') {
        $converted = iconv("ISO8859-1", "UTF-8", $document);

        // iconv() returns false on failure; keep the original text then.
        if ($converted !== false) {
            $document = $converted;
        }
    }

    // NOTE: see output buffer trick comment in qp()
    ob_start();
    self::$qp = QueryPath::withHTML($document, $selector, $options);
    $warnings = ob_get_clean();

    if (strlen($warnings)) {
        Amslib_Debug::log("FAILED TO OBTAIN CLEAN OUTPUT WHEN PROCESSING HTML: error = ", $warnings);
    }

    return self::$qp;
}
/**
 * A child appended to a template that declares a "childViewContainer"
 * selector must be rendered inside the element matched by that selector.
 */
public function testAppendWithChildSelector()
{
    $expected = '<div>This be the rendered content<div class="container"><div class="childitem"><p>child content</p></div></div></div>';

    // Parent template, configured with a container selector for its children.
    $childQuery = \QueryPath::with('<p>child content</p>');
    $this->atts['childViewContainer'] = ".container";
    $this->obj = new \JHM\Template($this->atts, $this->q);

    // Child template wrapping the paragraph.
    $childAtts = json_decode('{ "id": "childItem", "tagName": "div", "attributes": { "className": "childitem" } }', true);
    $child = new \JHM\Template($childAtts, $childQuery);

    $this->obj->appendChild($child);

    $this->assertEquals($expected, $this->obj->body());
}
/**
 * Extracts theatres, with their addresses and movie showtimes, from a
 * showtimes HTML page.
 *
 * @param string $pageData    Raw HTML of the page.
 * @param int    $limit       Maximum number of theatres to return; must be > 0.
 * @param int    &$totalPages Output: number of result pages, derived from the
 *                            "#navbar td" cells (never less than 1). Left
 *                            untouched when $limit is invalid.
 *
 * @return array List of array('theatre' => array('name', 'address'), 'movies' => ...)
 *               entries; empty when $limit is invalid.
 */
private function extractTheatreMovieShowtimes($pageData, $limit, &$totalPages)
{
    $startTime = microtime(true);
    \SystemLogger::debug("Begining extraction of data from file, size = ", strlen($pageData));

    if ($limit <= 0) {
        \SystemLogger::warn("Invalid limit was supplied: ", $limit);
        return array();
    }

    \SystemLogger::debug('Attempting to load into Query Path');
    /* @var $moviePage DOMQuery */
    $moviePage = \QueryPath::withHTML($pageData, null, array('convert_to_encoding' => "UTF-8", 'convert_from_encoding' => "UTF-8"));
    \SystemLogger::debug('Loaded into QueryPath');

    /* @var $theatersDom DOMQuery */
    $theatersDom = $moviePage->find("div.theater");

    // get total pages
    // NOTE(review): the "- 2" presumably discounts the previous/next cells - confirm.
    // Clamp to 1 so a navbar with one or two cells cannot report 0 or -1 pages.
    $paginationDom = $moviePage->find("#navbar td");
    $totalPages = $paginationDom->length ? max(1, $paginationDom->length - 2) : 1;

    \SystemLogger::debug("Found", $theatersDom->length, "theatres");

    $theatreCinemas = array();
    $foundTheatres = 0;

    \SystemLogger::debug('Loading data from Theatres DOM list');
    for ($i = 0; $i < $theatersDom->length && $foundTheatres < $limit; $i++) {
        $theatre = array();
        $theatreDom = new DOMQuery($theatersDom->get($i));

        $theatre['name'] = trim($theatreDom->find("h2.name")->first()->text());
        if (!$theatre['name']) {
            \SystemLogger::warn("Found no theatre at dom level: ", $i);
            continue;
        }
        \SystemLogger::debug("processing theatre: ", $theatre['name']);

        // The info text is split on " - " and its last segment is dropped;
        // the remaining segments form the address.
        $addressText = $theatreDom->find(".info")->first()->text();
        $tmp = explode(" - ", trim($addressText));
        array_pop($tmp);
        $theatre['address'] = join(' ', $tmp);

        $theatreCinemas[] = array(
            'theatre' => $theatre,
            'movies' => $this->extractMovieShowtimes($theatreDom),
        );
        $foundTheatres++;
    }

    // microtime(true) is in seconds; convert so the "ms" label is truthful.
    $elapsedMs = round((microtime(true) - $startTime) * 1000, 2);
    \SystemLogger::info('Extraction done, completed in ', $elapsedMs, 'ms');

    return $theatreCinemas;
}
/**
 * Converts provider output to db's input format.
 *
 * Collects the basic fields, every non-empty itemprop listed in
 * $this->itemProps, the location, the characteristics handled by
 * parseHouseInfo() and, when present, the gallery images.
 *
 * @param QueryPath $html Parsed item page.
 *
 * @return mixed (array/boolean) The item data, or false when no surface of at
 *                               least 1 was found.
 */
public function parseItem($html)
{
    $data = [
        'title' => trim($html->find('h1.title')->text()),
        'description' => trim($html->find('.description')->text()),
        'price' => $this->strToNumber($html->find('.jsPrecioH1')->eq(0)->text()),
        'url' => $html->find('link[rel="canonical"]')->attr("href"),
    ];

    foreach ($this->itemProps as $prop) {
        $value = $html->find('[itemprop="' . $prop . '"]')->attr("content");
        if (empty($value)) {
            continue;
        }
        $data[$prop] = $value;
    }

    // Prefer the exact street address; otherwise derive the location from
    // the item name with its sale/rent prefix removed.
    $location = $html->find('[itemprop="streetAddress"]')->attr("content");
    if (empty($location)) {
        $location = $html->find('meta[itemprop="name"]')->attr("content");
        foreach (["Piso en venta en ", "Piso en alquiler en "] as $prefix) {
            $location = str_replace($prefix, "", $location);
        }
    }
    $data['location'] = $location . ", " . $html->find('h2.position')->text();

    foreach ($html->find('.characteristics .item') as $item) {
        $text = $item->text();
        $this->parseHouseInfo($text, $data);
    }

    // Listings that do not state the apartment's surface are skipped.
    $hasSurface = isset($data["meters"]) && !($data["meters"] < 1);
    if (!$hasSurface) {
        return false;
    }

    /*
     from
         http://fotos.imghs.net/s/1030/129/1030_27926263129_1_2015112416580031250.jpg
     to
         http://fotos.imghs.net/xl/1030/129/1030_27926263129_1_2015112416580031250.jpg
    */
    $placeholders = ["nofoto_mini.jpg", "blank1x1.png", "Images/assets"];
    $images = [];
    foreach ($html->find("#basic img") as $img) {
        $image = str_replace(".net/s/", ".net/xl/", $img->attr("src"));

        // Skip the default photos.
        $isPlaceholder = false;
        foreach ($placeholders as $marker) {
            if (strpos($image, $marker) !== false) {
                $isPlaceholder = true;
                break;
            }
        }

        if (!$isPlaceholder) {
            $images[] = $image;
        }
    }

    if (count($images) > 0) {
        $data["images"] = $images;
    }

    return $data;
}
/**
 * Append every item of this object's "last" set to the destination.
 *
 * Each element held in $this->last is handed, in iteration order, to
 * $dest->append().
 *
 * @param QueryPath $dest
 *  The QueryPath object that receives the items.
 * @return QueryPath
 *  This object (not $dest), so calls can be chained.
 * @see replaceAll()
 * @see replaceWith()
 * @see removeChildren()
 * @since 2.1
 * @author eabrand
 */
public function attach(QueryPath $dest)
{
    foreach ($this->last as $node) {
        $dest->append($node);
    }

    return $this;
}
<?php

/*
 * Demo of UI interaction with jQuery+Uniter
 *
 * MIT license.
 */

// Load the Composer autoloader and register the demo's test namespace.
$autoloader = require __DIR__ . '/../../vendor/autoload.php';
$autoloader->add('Demo\\Tests\\', __DIR__);

// Enable the demo's QueryPath extensions, in this order.
$extensions = array(
    'Demo\\QueryPath\\Extension\\DomEventExtension',
    'Demo\\QueryPath\\Extension\\DomDataExtension',
);
foreach ($extensions as $extension) {
    QueryPath::enable($extension);
}
/**
 * HTML-specific counterpart of {@link qp()}.
 *
 * Valid XHTML can be parsed by {@link qp()} directly. Non-XML HTML, however,
 * needs several extra steps to be parsed reliably because of the way libxml
 * handles HTML. This function is a convenience wrapper that delegates to
 * QueryPath::withHTML(), which configures QueryPath for that case.
 *
 * The following options are automatically set unless overridden:
 *  - ignore_parser_warnings: TRUE
 *  - convert_to_encoding: ISO-8859-1 (the best for the HTML parser).
 *  - convert_from_encoding: auto (autodetect encoding)
 *  - use_parser: html
 *
 * Parser warning messages are also suppressed, so if the parser emits a
 * warning, the application will not be notified. This is equivalent to
 * calling @code@qp()@endcode.
 *
 * Warning: Character set conversions will only work if the Multi-Byte (mb)
 * library is installed and enabled. This is usually enabled, but not always.
 *
 * @ingroup querypath_core
 * @see qp()
 */
function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    $query = QueryPath::withHTML($document, $selector, $options);

    return $query;
}
/**
 * Internal recursive list generator for appendList.
 *
 * Builds the list in $q: scalar items become <li> elements, QueryPath items
 * contribute the result of their get() call, and arrays/Traversables are
 * rendered as a nested <li><ul/></li> filled by a recursive call.
 *
 * @param mixed  $items Items to render.
 * @param string $type  Tag name of the root list element.
 * @param array  $opts  Options; 'list class' is added as class to the root list.
 * @param mixed  $q     List under construction; created when not supplied.
 * @return mixed The list under construction.
 */
protected function listImpl($items, $type, $opts, $q = NULL)
{
    $rootTag = '<' . $type . '/>';
    if ($q === NULL) {
        $q = \QueryPath::with()->append($rootTag)->addClass($opts['list class']);
    }

    foreach ($items as $entry) {
        if ($entry instanceof QueryPath) {
            // Recurse over the matches of the wrapped object.
            $q = $this->listImpl($entry->get(), $type, $opts, $q);
            continue;
        }

        if (is_array($entry) || $entry instanceof Traversable) {
            // Open a nested list, fill it, then step back up one level.
            $q->append('<li><ul/></li>')->find('li:last > ul');
            $q = $this->listImpl($entry, $type, $opts, $q);
            $q->parent();
            continue;
        }

        $q->append('<li>' . $entry . '</li>');
    }

    return $q;
}
/**
 * Converts provider output to db's input format.
 *
 * Besides the basic fields this reads the last-update date, surface, rooms,
 * the labelled characteristics (floor, air conditioning, heating, parking,
 * elevator, furnished) and the photos.
 *
 * @param QueryPath $html Parsed item page.
 *
 * @return mixed (array/boolean) The item data, or false when the page has
 *                               text in '.cajon-pedir-foto' or '.pvpdesde'.
 */
public function parseItem($html)
{
    // Doesn't have images or price.
    $lacksImages = !empty($html->find('.cajon-pedir-foto')->text());
    if ($lacksImages || !empty($html->find('.pvpdesde')->text())) {
        return false;
    }

    // Collapse every run of whitespace in the address into a single space.
    $rawLocation = $html->find('.dir_ex.sprite')->text();
    $location = trim(preg_replace('/(\\v|\\s)+/', ' ', $rawLocation));
    $description = trim($html->find('[itemprop="description"] p')->text());

    $data = [
        'title' => $html->find('.h1ficha')->text(),
        'location' => $location,
        'description' => $description,
        'url' => $html->find('link[rel="canonical"]')->attr("href"),
        "price" => $this->strToNumber($html->find('[itemprop="price"]')->text()),
    ];

    // The update date is the parenthesised run of digits and slashes.
    $lastUpdate = trim($html->find('.actualizado.radius')->text());
    preg_match("/\\(([0-9\\/]+)\\)/", $lastUpdate, $matches);
    if (isset($matches[1])) {
        $data["lastUpdate"] = $matches[1];
    }

    foreach ($html->find('#inificha .bodis ul li') as $li) {
        $text = $li->text();
        if (strpos($text, " m2") !== false) {
            $data["meters"] = $this->strToNumber($li->find("span")->text());
        } elseif (strpos($text, "habitaciones") !== false) {
            $data["rooms"] = (int) $text;
        }
    }

    // Characteristic label => key of the boolean it controls.
    $booleanFeatures = [
        "Aire acondicionado" => "airConditioner",
        "Calefacción" => "heating",
        "Parking" => "parking",
        "Ascensor" => "elevator",
        "Amueblado" => "furnished",
    ];

    foreach ($html->find('.caracteristicas li') as $li) {
        $text = $li->text();
        if (strpos($text, ":") === false) {
            continue;
        }

        $info = explode(":", $text);
        $label = trim($info[0]);

        if ($label === "Número de planta") {
            $data["floor"] = (int) $info[1];
        } elseif (isset($booleanFeatures[$label])) {
            $data[$booleanFeatures[$label]] = $this->stringToBool(trim($info[1]));
        }
    }

    // Replace the "G.jpg" suffix of each photo URL with "XL.jpg".
    $images = [];
    foreach ($html->find(".ficha_foto img") as $img) {
        $images[] = str_replace("G.jpg", "XL.jpg", $img->attr("src"));
    }

    if (count($images) > 0) {
        $data["images"] = $images;
    }

    return $data;
}
/**
 * Builds a QueryPath object from the HTML5 stub document and appends the
 * given content to its body.
 *
 * @param mixed $content Content passed to append().
 * @return mixed The object created from the stub (not the body match).
 */
protected function _getQueryObj($content)
{
    $document = \QueryPath::withHTML5(\QueryPath::HTML5_STUB);

    $body = $document->find('body');
    $body->append($content);

    return $document;
}
/**
 * Enables the QPList extension once, before the tests of this class run.
 */
public static function setUpBeforeClass()
{
    $extension = '\\QueryPath\\Extension\\QPList';

    \QueryPath::enable($extension);
}
/**
 * Insert every current match after the destination.
 *
 * Each element in $this->matches is handed, in iteration order, to
 * $dest->after().
 *
 * @param QueryPath $dest
 *  The QueryPath object after which the matches are inserted.
 * @return QueryPath
 *  This object (not $dest), so calls can be chained.
 */
public function insertAfter(QueryPath $dest)
{
    foreach ($this->matches as $node) {
        $dest->after($node);
    }

    return $this;
}
/**
 * After enabling DummyExtension, its grrrrrrr() method must be callable on a
 * QueryPath object and return TRUE.
 */
public function testEnable()
{
    \QueryPath::enable('\\QueryPath\\Tests\\DummyExtension');

    $query = \QueryPath::with(\QueryPath::XHTML_STUB);
    $result = $query->grrrrrrr();

    $this->assertTrue($result);
}
/**
 * Branch the base DOMQuery into another one with the same matches.
 *
 * A new DOMQuery is created that (initially) points at the same matches and
 * the same document as this one. The branch can be queried without changing
 * the matches of the original; changes made to the elements themselves,
 * however, show up in both, because both wrap the same document.
 *
 * Compare this with {@link cloneAll()}: cloneAll() copies the matched
 * elements and keeps working on the same query object, whereas branch()
 * copies <i>the DOMQuery</i> and keeps valid references to the document and
 * the wrapped elements.
 *
 * In practice, this comes in handy when you want to do multiple queries on a
 * part of the document, but then return to a previous set of matches (see
 * {@link QPTPL} for examples of this in practice).
 *
 * Example:
 *
 * @code
 * <?php
 * $qp = qp( QueryPath::HTML_STUB);
 * $branch = $qp->branch();
 * $branch->find('title')->text('Title');
 * $qp->find('body')->text('This is the body')->writeHTML();
 * ?>
 * @endcode
 *
 * Each of the two objects runs its own query, yet both modify the same
 * document. The result of the above would look something like this:
 *
 * @code
 * <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 * <html xmlns="http://www.w3.org/1999/xhtml">
 * <head>
 *    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
 *    <title>Title</title>
 * </head>
 * <body>This is the body</body>
 * </html>
 * @endcode
 *
 * Notice that while $qp and $branch were performing separate queries, they
 * both modified the same document.
 *
 * In jQuery or a browser-based solution, you generally do not need a
 * branching function because there is (implicitly) only one document. In
 * QueryPath, there is no implicit document. Every document must be explicitly
 * specified (and, in most cases, parsed -- which is costly). Branching makes
 * it possible to work on one document with multiple query objects.
 *
 * @param string $selector
 *  If a selector is passed in, an additional findInPlace() is executed on the
 *  branch before it is returned. (Added in QueryPath 2.0.)
 * @retval object DOMQuery
 *  A copy of the DOMQuery object that points to the same set of elements that
 *  the original DOMQuery was pointing to.
 * @since 1.1
 * @see cloneAll()
 * @see find()
 */
public function branch($selector = NULL)
{
    $branch = \QueryPath::with($this->matches, NULL, $this->options);

    // Share the document so that changes are written back to it.
    $branch->document = $this->document;

    if ($selector !== NULL) {
        $branch->findInPlace($selector);
    }

    return $branch;
}
/**
 * encodeDataURL() must turn 'Hi!' with MIME type text/plain into the
 * corresponding base64 data: URL.
 */
public function testEncodeDataURL()
{
    $expected = 'data:text/plain;base64,SGkh';

    $actual = \QueryPath::encodeDataURL('Hi!', 'text/plain');

    $this->assertEquals($expected, $actual);
}
/**
 * Converts the XML held in $this->output into an array.
 *
 * Every child matching 'bib|*' becomes one row. For each entry of the
 * 'rdf_term' list, the entry's key (with ':' replaced by '|') is used as the
 * selector inside the child, and the matched texts, joined with ', ', are
 * stored under the entry's value.
 *
 * @return array Rows keyed like the iterated children.
 */
public function convert_to_array()
{
    $doc = QueryPath::withXML($this->output);
    $available_rdf = $this->get_rdf_list('rdf_term');

    $processed_data = array();
    foreach ($doc->children('bib|*') as $index => $entry) {
        $row = array();
        foreach ($available_rdf as $field => $term) {
            // Selectors use '|' where the field names use ':'.
            $selector = str_replace(':', '|', $field);
            $row[$term] = $entry->find($selector)->textImplode(', ');
        }
        $processed_data[$index] = $row;
    }

    return $processed_data;
}
/**
 * Tells whether the 'w|u' children of the given object produce any markup.
 *
 * NOTE(review): the result of the first children("w|rPr") call is discarded,
 * so it only has an effect if children() narrows $qp in place - confirm
 * against the QueryPath version in use.
 *
 * @param QueryPath $qp
 * @return bool TRUE when html() of the 'w|u' children is truthy, FALSE otherwise.
 */
function checkUnderline($qp)
{
    $qp->children("w|rPr");

    $underlineMarkup = $qp->children('w|u')->html();

    return (bool) $underlineMarkup;
}