Example #1
0
 /**
  * Converts provider output to db's input format.
  *
  * Collects the slider images (dropping the resize suffix that follows the
  * ".jpg" extension), the basic listing fields and the "Ascensor" extra.
  *
  * @param QueryPath $html Parsed listing page.
  *
  * @return mixed (array/boolean) The listing data, or false when the surface
  *               is 0 or the description is empty.
  */
 public function parseItem($html)
 {
     $images = [];
     /*
         transform http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg/w_0/c_690x518/p_1/
         to        http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg
     */
     foreach ($html->find('#containerSlider img') as $img) {
         // Prefer the lazy-load attribute and fall back to the plain source.
         $src = $img->attr("data-src");
         if (empty($src)) {
             $src = $img->attr("src");
         }
         // An <img> without any source used to produce a bogus ".jpg" entry.
         if (empty($src)) {
             continue;
         }
         // Only cut the resize suffix when the ".jpg" marker is present;
         // other URLs are kept as they are instead of getting ".jpg" appended.
         $cut = strpos($src, ".jpg");
         $images[] = $cut === false ? $src : substr($src, 0, $cut) . ".jpg";
     }
     $data = [
         'title' => trim($html->find('.property-title')->text()),
         'description' => trim($html->find('#ctl00_ddDescription .detail-section-content')->text()),
         'images' => $images,
         'location' => trim($html->find('.section.section--noBorder .detail-section-content')->text()),
         'price' => $this->strToNumber($html->find('#priceContainer')->text()),
         'meters' => $this->strToNumber($html->find('#litSurface b')->text()),
         'floor' => (int) $html->find('#litFloor')->text(),
         'url' => $html->find('link[rel="canonical"]')->attr("href"),
     ];
     // The extras list is plain text; only the elevator entry is mapped.
     foreach ($html->find('.detail-extras li') as $li) {
         if (trim($li->text()) === "Ascensor") {
             $data["elevator"] = true;
         }
     }
     // Listings without surface or description are rejected.
     if ($data["meters"] == 0 || empty($data["description"])) {
         return false;
     }
     return $data;
 }
Example #2
0
 /**
  * Converts provider output to db's input format.
  *
  * Image URLs are rebuilt from each thumbnail's "data-service" attribute and
  * need the "ch" query parameter taken from the og:image URL.
  *
  * @param QueryPath $html Parsed listing page.
  *
  * @return mixed (array/boolean) The listing data, or false when the og:image
  *               (or its "ch" parameter) is missing, or when the item has no
  *               meters or no description.
  */
 public function parseItem($html)
 {
     $images = [];
     // get ch var from og image (required to display the images)
     $ogImage = $html->find('[name="og:image"]')->attr("content");
     if (empty($ogImage)) {
         return false;
     }
     // Ask parse_url() for the query component directly: it yields null/false
     // instead of an undefined-index notice when the URL has no query string.
     $queryString = parse_url($ogImage, PHP_URL_QUERY);
     if (empty($queryString)) {
         return false;
     }
     parse_str($queryString, $query);
     // Without "ch" every rebuilt image URL would be unusable.
     if (!isset($query["ch"]) || $query["ch"] === '') {
         return false;
     }
     $imageCh = $query["ch"];
     /*
         transform http://img3.idealista.com/thumbs,W,H,wi,+tSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2
         to        http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2BtSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2&ch=2106166706
     */
     foreach ($html->find('#main-multimedia img') as $img) {
         $image = str_replace("http://img3.idealista.com/thumbs,W,H,wi,+", "", $img->attr("data-service"));
         $images[] = "http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2B" . urlencode($image) . "&ch=" . $imageCh;
     }
     $title = trim($html->find('h1.txt-bold span')->text());
     // The location is the title without its sale/rent prefix.
     $location = str_replace("Piso en venta en ", "", $title);
     $location = str_replace("Piso en alquiler en ", "", $location);
     $data = [
         'title' => $title,
         'description' => trim($html->find('.adCommentsLanguage.expandable')->text()),
         'images' => $images,
         'location' => $location,
         'price' => $this->strToNumber($html->find('#main-info .txt-big.txt-bold')->eq(0)->text()),
         'url' => $html->find('#share-link')->attr("href"),
     ];
     // parseHouseInfo() receives $data and is expected to add keys such as
     // "meters" to it (checked right below) — presumably by reference.
     foreach ($html->find('#fixed-toolbar .info-data > span') as $item) {
         $text = $item->text();
         $this->parseHouseInfo($text, $data);
     }
     if (!isset($data["meters"]) || $data["meters"] == 0 || empty($data["description"])) {
         return false;
     }
     return $data;
 }
Example #3
0
 /**
  * Run an XSLT transformation over the current source document.
  *
  * The stylesheet is loaded (if it is not already a QueryPath object), imported
  * into an XSLTProcessor and applied to the owner document of the source.
  *
  * @param mixed $style
  *  A QueryPath object or <em>any</em> of the types that the
  *  {@link qp()} function can take.
  * @return QueryPath
  *  A QueryPath object wrapping the transformed document. Note that this is a
  *  <em>different</em> document than the original. As such, it has no history.
  *  You cannot call {@link QueryPath::end()} to undo a transformation. (However,
  *  the original source document will remain unchanged.)
  */
 public function xslt($style)
 {
     $stylesheet = $style instanceof QueryPath ? $style : \QueryPath::with($style);

     // Both sides are resolved to their owning DOM documents.
     $input = $this->src->top()->get(0)->ownerDocument;
     $xsl = $stylesheet->get(0)->ownerDocument;

     $xslt = new \XSLTProcessor();
     $xslt->importStylesheet($xsl);
     $result = $xslt->transformToDoc($input);

     return \QueryPath::with($result);
 }
 /**
  * Return the current iterator element wrapped in a QueryPath object.
  *
  * The wrapper is created on first use and afterwards reused: its matches
  * are replaced with a storage holding only the current element.
  */
 public function current()
 {
     if (isset($this->qp)) {
         // Re-point the cached wrapper at the current element.
         $storage = new \SplObjectStorage();
         $storage->attach(parent::current());
         $this->qp->setMatches($storage);

         return $this->qp;
     }

     $this->qp = \QueryPath::with(parent::current(), NULL, $this->options);

     return $this->qp;
 }
Example #5
0
 /**
  * Parse an HTML document with QueryPath, logging any output the parser emits.
  *
  * String documents are converted from ISO8859-1 to UTF-8 before parsing.
  * Anything printed while parsing is captured with an output buffer and
  * passed to Amslib_Debug::log() instead of reaching the response.
  *
  * @param mixed $document Document to parse; only strings are re-encoded.
  * @param mixed $selector Optional selector handed to QueryPath::withHTML().
  * @param array $options  Options handed to QueryPath::withHTML().
  *
  * @return mixed The object returned by QueryPath::withHTML() (also stored in
  *               self::$qp).
  */
 public static function htmlqp($document = NULL, $selector = NULL, $options = array())
 {
     self::$qp = false;
     //	I copied and modified the default options from the htmlqp method to provide a custom version for Amslib
     //	NOTE: Hmm....I'm not 100% sure this will work in all circumstances....
     //	Only strings can be re-encoded; NULL and object documents are passed
     //	through untouched instead of being fed to iconv().
     if (is_string($document)) {
         $converted = iconv("ISO8859-1", "UTF-8", $document);
         //	Keep the original text if the conversion fails (iconv returns false).
         if ($converted !== false) {
             $document = $converted;
         }
     }
     //	NOTE: see output buffer trick comment in qp()
     ob_start();
     try {
         self::$qp = QueryPath::withHTML($document, $selector, $options);
     } catch (\Exception $e) {
         //	Do not leave the buffer open (swallowing later output) on failure.
         ob_end_clean();
         throw $e;
     }
     $warnings = ob_get_clean();
     if (strlen($warnings)) {
         Amslib_Debug::log("FAILED TO OBTAIN CLEAN OUTPUT WHEN PROCESSING HTML: error = ", $warnings);
     }
     return self::$qp;
 }
 /**
  * A child template appended to a parent that declares "childViewContainer"
  * must be rendered inside the element matching that selector.
  */
 public function testAppendWithChildSelector()
 {
     $childContent = \QueryPath::with('<p>child content</p>');

     // Parent template with a dedicated container for its children.
     $this->atts['childViewContainer'] = ".container";
     $this->obj = new \JHM\Template($this->atts, $this->q);

     $childAtts = json_decode('{
       "id": "childItem",
       "tagName": "div",
       "attributes": {
         "className": "childitem"
       }
     }', true);
     $childTemplate = new \JHM\Template($childAtts, $childContent);
     $this->obj->appendChild($childTemplate);

     $this->assertEquals(
         '<div>This be the rendered content<div class="container"><div class="childitem"><p>child content</p></div></div></div>',
         $this->obj->body()
     );
 }
 /**
  * Extract theatres (name, address) and their movie showtimes from a page.
  *
  * @param string $pageData    Raw HTML of the showtimes page.
  * @param int    $limit       Maximum number of theatres to return; values
  *                            <= 0 are rejected with an empty result.
  * @param int    $totalPages  Output: number of result pages derived from the
  *                            "#navbar td" cells (never less than 1). Not
  *                            assigned when $limit is invalid.
  *
  * @return array List of array('theatre' => array, 'movies' => mixed) entries.
  */
 private function extractTheatreMovieShowtimes($pageData, $limit, &$totalPages)
 {
     $startTime = microtime(true);
     \SystemLogger::debug("Begining extraction of data from file, size = ", strlen($pageData));
     if ($limit <= 0) {
         \SystemLogger::warn("Invalid limit was supplied: ", $limit);
         return array();
     }
     \SystemLogger::debug('Attempting to load into Query Path');
     /* @var $moviePage DOMQuery */
     $moviePage = \QueryPath::withHTML($pageData, null, array('convert_to_encoding' => "UTF-8", 'convert_from_encoding' => "UTF-8"));
     \SystemLogger::debug('Loaded into QueryPath');
     /* @var $theatersDom DOMQuery */
     $theatersDom = $moviePage->find("div.theater");
     //get total pages
     $paginationDom = $moviePage->find("#navbar td");
     // Two navbar cells are not page links (presumably previous/next — confirm);
     // clamp so a short navbar can never report 0 or -1 pages.
     $totalPages = $paginationDom->length ? max(1, $paginationDom->length - 2) : 1;
     \SystemLogger::debug("Found", $theatersDom->length, "theatres");
     $theatreCinemas = array();
     $foundTheatres = 0;
     \SystemLogger::debug('Loading data from Theatres DOM list');
     for ($i = 0; $i < $theatersDom->length && $foundTheatres < $limit; $i++) {
         $theatre = array();
         $theatreDom = new DOMQuery($theatersDom->get($i));
         $theatre['name'] = trim($theatreDom->find("h2.name")->first()->text());
         if (!$theatre['name']) {
             // Nameless entries are skipped and do not count towards $limit.
             \SystemLogger::warn("Found no theatre at dom level: ", $i);
             continue;
         }
         \SystemLogger::debug("processing theatre: ", $theatre['name']);
         $addressText = $theatreDom->find(".info")->first()->text();
         // The info line is " - " separated; the last segment is dropped and
         // the remaining ones form the address.
         $tmp = explode(" - ", trim($addressText));
         array_pop($tmp);
         $theatre['address'] = join(' ', $tmp);
         $theatreCinemas[] = array('theatre' => $theatre, 'movies' => $this->extractMovieShowtimes($theatreDom));
         $foundTheatres++;
     }
     // microtime(true) is in seconds; convert so the logged unit is correct.
     $elapsedMs = round((microtime(true) - $startTime) * 1000, 2);
     \SystemLogger::info('Extraction done, completed in ', $elapsedMs, 'ms');
     return $theatreCinemas;
 }
Example #8
0
 /**
  * Converts provider output to db's input format.
  *
  * Reads the basic fields, the configured itemprop values, the location, the
  * characteristics and finally the photos of a listing page.
  *
  * @param QueryPath $html Parsed listing page.
  *
  * @return mixed (array/boolean) The listing data, or false when the listing
  *               has no meters (or fewer than 1).
  */
 public function parseItem($html)
 {
     $data = [
         'title' => trim($html->find('h1.title')->text()),
         'description' => trim($html->find('.description')->text()),
         'price' => $this->strToNumber($html->find('.jsPrecioH1')->eq(0)->text()),
         'url' => $html->find('link[rel="canonical"]')->attr("href"),
     ];

     // Copy every non-empty microdata value listed in $this->itemProps.
     foreach ($this->itemProps as $prop) {
         $value = $html->find('[itemprop="' . $prop . '"]')->attr("content");
         if (empty($value)) {
             continue;
         }
         $data[$prop] = $value;
     }

     // Prefer the exact address; otherwise fall back to the listing name
     // without its sale/rent prefix.
     $location = $html->find('[itemprop="streetAddress"]')->attr("content");
     if (empty($location)) {
         $location = str_replace(
             ["Piso en venta en ", "Piso en alquiler en "],
             "",
             $html->find('meta[itemprop="name"]')->attr("content")
         );
     }
     $data['location'] = $location . ", " . $html->find('h2.position')->text();

     foreach ($html->find('.characteristics .item') as $item) {
         $text = $item->text();
         $this->parseHouseInfo($text, $data);
     }

     // Listings that do not state the apartment surface are discarded.
     if (!isset($data["meters"]) || $data["meters"] < 1) {
         return false;
     }

     /*
     from http://fotos.imghs.net/s/1030/129/1030_27926263129_1_2015112416580031250.jpg
     to http://fotos.imghs.net/xl/1030/129/1030_27926263129_1_2015112416580031250.jpg
     */
     $placeholders = ["nofoto_mini.jpg", "blank1x1.png", "Images/assets"];
     $images = [];
     foreach ($html->find("#basic img") as $img) {
         $image = str_replace(".net/s/", ".net/xl/", $img->attr("src"));
         // Default/placeholder pictures are not real photos of the listing.
         foreach ($placeholders as $placeholder) {
             if (strpos($image, $placeholder) !== false) {
                 continue 2;
             }
         }
         $images[] = $image;
     }
     if (!empty($images)) {
         $data["images"] = $images;
     }

     return $data;
 }
Example #9
0
 /**
  * Append every item held in this object's "last" list to a destination.
  *
  * Each item is handed to the destination's append() in turn.
  *
  * @param QueryPath $dest
  *  The QueryPath object that receives the items.
  * @return QueryPath
  *  This object, to allow chaining.
  * @see replaceAll()
  * @see replaceWith()
  * @see removeChildren()
  * @since 2.1
  * @author eabrand
  */
 public function attach(QueryPath $dest)
 {
     foreach ($this->last as $item) {
         $dest->append($item);
     }

     return $this;
 }
Example #10
0
<?php

/*
 * Demo of UI interaction with jQuery+Uniter
 *
 * MIT license.
 */
// Load the Composer autoloader; the require expression's return value is kept
// so that extra prefixes can be registered on it.
$autoloader = (require __DIR__ . '/../../vendor/autoload.php');
// Resolve classes under the Demo\Tests\ prefix from this directory.
$autoloader->add('Demo\\Tests\\', __DIR__);
// Enable the demo's DOM event and DOM data QueryPath extensions.
QueryPath::enable('Demo\\QueryPath\\Extension\\DomEventExtension');
QueryPath::enable('Demo\\QueryPath\\Extension\\DomDataExtension');
Example #11
0
/**
 * HTML-specific counterpart of {@link qp()}.
 *
 * Valid XHTML can be parsed by {@link qp()} without trouble. Non-XML HTML,
 * however, needs a few extra steps because of the way libxml handles HTML.
 * This function is a convenience wrapper that delegates to
 * QueryPath::withHTML() so QueryPath is configured for HTML parsing.
 *
 * The following options are automatically set unless overridden:
 *  - ignore_parser_warnings: TRUE
 *  - convert_to_encoding: ISO-8859-1 (the best for the HTML parser).
 *  - convert_from_encoding: auto (autodetect encoding)
 *  - use_parser: html
 *
 * Parser warning messages are also suppressed, so if the parser emits a
 * warning, the application will not be notified. This is equivalent to
 * calling qp() with PHP's error-suppression operator:
 * @code
 * @qp();
 * @endcode
 *
 * Warning: Character set conversions will only work if the Multi-Byte (mb)
 * library is installed and enabled. This is usually enabled, but not always.
 *
 * @ingroup querypath_core
 * @see qp()
 */
function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    $query = QueryPath::withHTML($document, $selector, $options);

    return $query;
}
Example #12
0
 /**
  * Internal recursive list generator for appendList.
  *
  * @param mixed  $items Items to render: scalars become <li> entries, arrays
  *                      and Traversables become nested lists, QueryPath
  *                      objects are expanded through get().
  * @param string $type  Tag name of the outer list element (e.g. "ul").
  * @param array  $opts  Options; 'list class' is applied to a newly created
  *                      outer list.
  * @param mixed  $q     List being built during recursion; created when NULL.
  *
  * @return mixed The QueryPath object holding the generated list.
  */
 protected function listImpl($items, $type, $opts, $q = NULL)
 {
     $ele = '<' . $type . '/>';
     if (!isset($q)) {
         $q = \QueryPath::with()->append($ele)->addClass($opts['list class']);
     }
     foreach ($items as $li) {
         if ($li instanceof QueryPath) {
             $q = $this->listImpl($li->get(), $type, $opts, $q);
         } elseif (is_array($li) || $li instanceof \Traversable) {
             // Fully qualified on purpose: inside a namespace an unqualified
             // "Traversable" resolves to a non-existent namespaced class and
             // the instanceof check silently never matches.
             // NOTE(review): nested lists are always <ul>, whatever $type is —
             // confirm this is intended.
             $q->append('<li><ul/></li>')->find('li:last > ul');
             $q = $this->listImpl($li, $type, $opts, $q);
             $q->parent();
         } else {
             $q->append('<li>' . $li . '</li>');
         }
     }
     return $q;
 }
Example #13
0
 /**
  * Converts provider output to db's input format.
  *
  * Reads the headline fields, the last-update date, surface and rooms, the
  * "label: value" characteristics and the photos of a listing page.
  *
  * @param QueryPath $html Parsed listing page.
  *
  * @return mixed (array/boolean) The listing data, or false when the page has
  *               a photo-request box or a "from" price.
  */
 public function parseItem($html)
 {
     // doesnt have images or price
     if (!empty($html->find('.cajon-pedir-foto')->text())) {
         return false;
     }
     if (!empty($html->find('.pvpdesde')->text())) {
         return false;
     }

     // Collapse any run of whitespace in the address into a single space.
     $location = trim(preg_replace('/(\\v|\\s)+/', ' ', $html->find('.dir_ex.sprite')->text()));
     $description = trim($html->find('[itemprop="description"] p')->text());
     $data = [
         'title' => $html->find('.h1ficha')->text(),
         'location' => $location,
         'description' => $description,
         'url' => $html->find('link[rel="canonical"]')->attr("href"),
         "price" => $this->strToNumber($html->find('[itemprop="price"]')->text()),
     ];

     // The update date is the parenthesised digits/slashes part of the label.
     $updatedLabel = trim($html->find('.actualizado.radius')->text());
     preg_match("/\\(([0-9\\/]+)\\)/", $updatedLabel, $dateMatch);
     if (isset($dateMatch[1])) {
         $data["lastUpdate"] = $dateMatch[1];
     }

     foreach ($html->find('#inificha .bodis ul li') as $entry) {
         $text = $entry->text();
         if (strpos($text, " m2") !== false) {
             $data["meters"] = $this->strToNumber($entry->find("span")->text());
             continue;
         }
         if (strpos($text, "habitaciones") !== false) {
             $data["rooms"] = (int) $text;
         }
     }

     // Characteristic label => key of the yes/no flag it fills in.
     $booleanFeatures = [
         "Aire acondicionado" => "airConditioner",
         "Calefacción" => "heating",
         "Parking" => "parking",
         "Ascensor" => "elevator",
         "Amueblado" => "furnished",
     ];
     foreach ($html->find('.caracteristicas li') as $entry) {
         $text = $entry->text();
         if (strpos($text, ":") === false) {
             continue;
         }
         $info = explode(":", $text);
         $label = trim($info[0]);
         if ($label === "Número de planta") {
             $data["floor"] = (int) $info[1];
         } elseif (isset($booleanFeatures[$label])) {
             $data[$booleanFeatures[$label]] = $this->stringToBool(trim($info[1]));
         }
     }

     // Request the "XL" rendition of every photo instead of the "G" one.
     $images = [];
     foreach ($html->find(".ficha_foto img") as $img) {
         $images[] = str_replace("G.jpg", "XL.jpg", $img->attr("src"));
     }
     if (!empty($images)) {
         $data["images"] = $images;
     }

     return $data;
 }
 /**
  * Build a QueryPath object from the HTML5 stub and append the given content
  * to its <body>.
  *
  * @param mixed $content Content passed to append().
  *
  * @return mixed The QueryPath object created from the stub.
  */
 protected function _getQueryObj($content)
 {
     $document = \QueryPath::withHTML5(\QueryPath::HTML5_STUB);
     $body = $document->find('body');
     $body->append($content);

     return $document;
 }
Example #15
0
 /**
  * Enable the QPList extension once before the tests of this class run.
  */
 public static function setUpBeforeClass()
 {
     $extension = '\\QueryPath\\Extension\\QPList';
     \QueryPath::enable($extension);
 }
Example #16
0
 /**
  * Insert every current match after the destination.
  *
  * Each match is handed to the destination's after() in turn.
  *
  * @param QueryPath $dest
  *  The QueryPath object after which the matches are inserted.
  * @return QueryPath
  *  This object, to allow chaining.
  */
 public function insertAfter(QueryPath $dest)
 {
     foreach ($this->matches as $match) {
         $dest->after($match);
     }

     return $this;
 }
Example #17
0
 /**
  * After enabling DummyExtension, its grrrrrrr() method must be callable on a
  * QueryPath object and return true.
  */
 public function testEnable()
 {
     $extension = '\\QueryPath\\Tests\\DummyExtension';
     \QueryPath::enable($extension);

     $stub = \QueryPath::with(\QueryPath::XHTML_STUB);

     $this->assertTrue($stub->grrrrrrr());
 }
Example #18
0
 /**
  * Branch the base DOMQuery into another one with the same matches.
  *
  * This function makes a copy of the DOMQuery object, but keeps the new copy
  * (initially) pointed at the same matches. This object can then be queried without
  * changing the original DOMQuery. However, changes to the elements inside of this
  * DOMQuery will show up in the DOMQuery from which it is branched.
  *
  * Compare this operation with {@link cloneAll()}. The cloneAll() call takes
  * the current DOMNode object and makes a copy of all of its matches. You continue
  * to operate on the same DOMNode object, but the elements inside of the DOMQuery
  * are copies of those before the call to cloneAll().
  *
  * This, on the other hand, copies <i>the DOMQuery</i>, but keeps valid
  * references to the document and the wrapped elements. A new query branch is
  * created, but any changes will be written back to the same document.
  *
  * In practice, this comes in handy when you want to do multiple queries on a part
  * of the document, but then return to a previous set of matches. (see {@link QPTPL}
  * for examples of this in practice).
  *
  * Example:
  *
  * @code
  * <?php
  * $qp = qp( QueryPath::HTML_STUB);
  * $branch = $qp->branch();
  * $branch->find('title')->text('Title');
  * $qp->find('body')->text('This is the body')->writeHTML();
  * ?>
  * @endcode
  *
  * Notice that in the code, each of the DOMQuery objects is doing its own
  * query. However, both are modifying the same document. The result of the above
  * would look something like this:
  *
  * @code
  * <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  * <html xmlns="http://www.w3.org/1999/xhtml">
  * <head>
  *    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"></meta>
  *    <title>Title</title>
  * </head>
  * <body>This is the body</body>
  * </html>
  * @endcode
  *
  * Notice that while $qp and $branch were performing separate queries, they
  * both modified the same document.
  *
  * In jQuery or a browser-based solution, you generally do not need a branching
  * function because there is (implicitly) only one document. In QueryPath, there
  * is no implicit document. Every document must be explicitly specified (and,
  * in most cases, parsed -- which is costly). Branching makes it possible to
  * work on one document with multiple DOMNode objects.
  *
  * @param string $selector
  *  If a selector is passed in, an additional {@link find()} will be executed
  *  on the branch before it is returned. (Added in QueryPath 2.0.)
  * @retval object DOMQuery
  *  A copy of the DOMQuery object that points to the same set of elements that
  *  the original DOMQuery was pointing to.
  * @since 1.1
  * @see cloneAll()
  * @see find()
  */
 public function branch($selector = NULL)
 {
     // Wrap the current matches in a new object that shares these options.
     $branch = \QueryPath::with($this->matches, NULL, $this->options);
     // Point the copy at the same document as this object.
     $branch->document = $this->document;

     if (!isset($selector)) {
         return $branch;
     }

     // Narrow the branch itself when a selector was supplied.
     $branch->findInPlace($selector);

     return $branch;
 }
Example #19
0
 /**
  * encodeDataURL() must produce a base64 data: URL for plain text.
  */
 public function testEncodeDataURL()
 {
     $expected = 'data:text/plain;base64,SGkh';
     $actual = \QueryPath::encodeDataURL('Hi!', 'text/plain');

     $this->assertEquals($expected, $actual);
 }
 /**
  *  @method:
  *    Converts the XML held in $this->output into an array with one row per
  *    "bib" child element. For every entry of the 'rdf_term' list, the text
  *    of the matching elements is joined with ", " and stored under the name
  *    that entry maps to.
  */
 public function convert_to_array()
 {
     $doc = QueryPath::withXML($this->output);
     $available_rdf = $this->get_rdf_list('rdf_term');

     $rows = array();
     foreach ($doc->children('bib|*') as $index => $record) {
         $row = array();
         foreach ($available_rdf as $field => $column) {
             // Selectors use "|" where the field name uses ":" for the namespace.
             $selector = str_replace(':', '|', $field);
             $row[$column] = $record->find($selector)->textImplode(', ');
         }
         $rows[$index] = $row;
     }

     return $rows;
 }
/**
 * Tell whether an underline element ("w|u") is found for the given element.
 *
 * @param QueryPath $qp
 * @return bool true when the "w|u" children produce non-empty HTML.
 */
function checkUnderline($qp)
{
    // Kept as a separate call: presumably it narrows $qp to its "w|rPr"
    // children in place before the lookup below — confirm against the
    // QueryPath version in use.
    $qp->children("w|rPr");

    return (bool) $qp->children('w|u')->html();
}