withHTML() public static method

public static withHTML ( $source = null, $selector = null, $options = [] )
Exemplo n.º 1
0
 public static function htmlqp($document = NULL, $selector = NULL, $options = array())
 {
     self::$qp = false;
     //	I copied and modified the default options from the htmlqp method to provide a custom version for Amslib
     //	NOTE: Hmm....I'm not 100% sure this will work in all circumstances....
     $document = iconv("ISO8859-1", "UTF-8", $document);
     //	NOTE: see output buffer trick comment in qp()
     ob_start();
     self::$qp = QueryPath::withHTML($document, $selector, $options);
     $warnings = ob_get_clean();
     if (strlen($warnings)) {
         Amslib_Debug::log("FAILED TO OBTAIN CLEAN OUTPUT WHEN PROCESSING HTML: error = ", $warnings);
     }
     return self::$qp;
 }
 private function extractTheatreMovieShowtimes($pageData, $limit, &$totalPages)
 {
     $startTime = microtime(true);
     \SystemLogger::debug("Begining extraction of data from file, size = ", strlen($pageData));
     if ($limit <= 0) {
         \SystemLogger::warn("Invalid limit was supplied: ", $limit);
         return array();
     }
     \SystemLogger::debug('Attempting to load into Query Path');
     /* @var $moviePage DOMQuery */
     $moviePage = \QueryPath::withHTML($pageData, null, array('convert_to_encoding' => "UTF-8", 'convert_from_encoding' => "UTF-8"));
     \SystemLogger::debug('Loaded into QueryPath');
     /* @var $theatersDom DOMQuery */
     $theatersDom = $moviePage->find("div.theater");
     //get total pages
     $paginationDom = $moviePage->find("#navbar td");
     $totalPages = $paginationDom->length ? $paginationDom->length - 2 : 1;
     \SystemLogger::debug("Found", $theatersDom->length, "theatres");
     $theatreCinemas = array();
     $foundTheatres = 0;
     \SystemLogger::debug('Loading data from Theatres DOM list');
     for ($i = 0; $i < $theatersDom->length && $foundTheatres < $limit; $i++) {
         $theatre = array();
         $theatreDom = new DOMQuery($theatersDom->get($i));
         $theatre['name'] = trim($theatreDom->find("h2.name")->first()->text());
         if (!$theatre['name']) {
             \SystemLogger::warn("Found no theatre at dom level: ", $i);
             continue;
         }
         \SystemLogger::debug("processing theatre: ", $theatre['name']);
         $addressText = $theatreDom->find(".info")->first()->text();
         //echo  $addressText, "<br>";
         $tmp = explode(" - ", trim($addressText));
         array_pop($tmp);
         $theatre['address'] = join(' ', $tmp);
         $theatreCinemas[] = array('theatre' => $theatre, 'movies' => $this->extractMovieShowtimes($theatreDom));
         $foundTheatres++;
     }
     \SystemLogger::info('Extraction done, completed in ', microtime(true) - $startTime, 'ms');
     return $theatreCinemas;
 }
Exemplo n.º 3
0
/**
 * A special-purpose version of {@link qp()} designed specifically for HTML.
 *
 * XHTML (if valid) can be easily parsed by {@link qp()} with no problems. However,
 * because of the way that libxml handles HTML, there are several common steps that
 * need to be taken to reliably parse non-XML HTML documents. This function is
 * a convenience tool for configuring QueryPath to parse HTML.
 *
 * The following options are automatically set unless overridden:
 *  - ignore_parser_warnings: TRUE
 *  - convert_to_encoding: ISO-8859-1 (the best for the HTML parser).
 *  - convert_from_encoding: auto (autodetect encoding)
 *  - use_parser: html
 *
 * Parser warning messages are also suppressed, so if the parser emits a warning,
 * the application will not be notified. This is equivalent to
 * calling @code@qp()@endcode.
 *
 * Warning: Character set conversions will only work if the Multi-Byte (mb) library
 * is installed and enabled. This is usually enabled, but not always.
 *
 * @ingroup querypath_core
 * @see qp()
 */
function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    return QueryPath::withHTML($document, $selector, $options);
}