/**
 * Amslib-specific variant of htmlqp(): parses an HTML document with QueryPath,
 * stores the resulting object in self::$qp and returns it.
 *
 * A string document is transcoded from ISO-8859-1 to UTF-8 before being parsed.
 * Anything the parser prints while working is captured in an output buffer and
 * forwarded to Amslib_Debug::log() instead of leaking into the response.
 *
 * @param mixed $document The HTML to parse. Only non-empty strings are
 *                        transcoded; any other value (including the NULL
 *                        default) is handed to QueryPath untouched.
 * @param mixed $selector Optional selector forwarded to QueryPath::withHTML().
 * @param array $options  Optional options forwarded to QueryPath::withHTML().
 *
 * @return mixed Whatever QueryPath::withHTML() returned (also kept in self::$qp).
 */
public static function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    // Reset first so a stale object is never left behind if parsing blows up.
    self::$qp = false;

    // NOTE: this assumes the incoming markup is ISO-8859-1, which may not hold
    // in all circumstances.
    // Only strings are transcoded: running iconv() on NULL or on an object
    // would destroy the document before QueryPath ever sees it.
    // NOTE(review): presumably QueryPath also accepts DOM/SimpleXML objects
    // here - confirm against the QueryPath version in use.
    if (is_string($document) && $document !== '') {
        $converted = iconv("ISO8859-1", "UTF-8", $document);

        // iconv() returns false on failure; keep the original markup then.
        if ($converted !== false) {
            $document = $converted;
        }
    }

    // NOTE: see output buffer trick comment in qp()
    ob_start();
    self::$qp = QueryPath::withHTML($document, $selector, $options);
    $warnings = ob_get_clean();

    // ob_get_clean() yields false when no buffer was active; treat that as "no output".
    if ($warnings !== false && strlen($warnings)) {
        Amslib_Debug::log("FAILED TO OBTAIN CLEAN OUTPUT WHEN PROCESSING HTML: error = ", $warnings);
    }

    return self::$qp;
}
/**
 * Parses a showtimes HTML page and extracts up to $limit theatres together
 * with the movies listed for each of them.
 *
 * Each "div.theater" element is read for its name ("h2.name") and address
 * (".info"); the movie list is delegated to $this->extractMovieShowtimes().
 * Theatre elements without a name are skipped and do not count towards $limit.
 *
 * @param string $pageData   Raw HTML of the page.
 * @param int    $limit      Maximum number of theatres to return; values <= 0
 *                           yield an empty array.
 * @param int    $totalPages Output: number of result pages derived from the
 *                           "#navbar td" cells, never less than 1. Left
 *                           untouched when $limit is invalid.
 *
 * @return array List of array('theatre' => array('name' => ..., 'address' => ...),
 *               'movies' => <result of extractMovieShowtimes()>).
 */
private function extractTheatreMovieShowtimes($pageData, $limit, &$totalPages)
{
    $startTime = microtime(true);
    \SystemLogger::debug("Begining extraction of data from file, size = ", strlen($pageData));

    if ($limit <= 0) {
        \SystemLogger::warn("Invalid limit was supplied: ", $limit);
        return array();
    }

    \SystemLogger::debug('Attempting to load into Query Path');
    /* @var $moviePage DOMQuery */
    $moviePage = \QueryPath::withHTML(
        $pageData,
        null,
        array('convert_to_encoding' => "UTF-8", 'convert_from_encoding' => "UTF-8")
    );
    \SystemLogger::debug('Loaded into QueryPath');

    /* @var $theatersDom DOMQuery */
    $theatersDom = $moviePage->find("div.theater");

    // Total pages: the "- 2" presumably discounts the previous/next cells of
    // the navbar (TODO confirm against the page markup). Clamp to 1 so a
    // navbar with only one or two cells can never report 0 or -1 pages.
    $paginationDom = $moviePage->find("#navbar td");
    $totalPages = $paginationDom->length ? max(1, $paginationDom->length - 2) : 1;

    \SystemLogger::debug("Found", $theatersDom->length, "theatres");

    $theatreCinemas = array();
    $foundTheatres = 0;

    \SystemLogger::debug('Loading data from Theatres DOM list');
    for ($i = 0; $i < $theatersDom->length && $foundTheatres < $limit; $i++) {
        $theatre = array();
        $theatreDom = new DOMQuery($theatersDom->get($i));

        $theatre['name'] = trim($theatreDom->find("h2.name")->first()->text());
        if (!$theatre['name']) {
            \SystemLogger::warn("Found no theatre at dom level: ", $i);
            continue;
        }
        \SystemLogger::debug("processing theatre: ", $theatre['name']);

        // The info text is split on " - " and its last segment is dropped;
        // the remaining segments are glued back together as the address.
        // NOTE(review): the dropped segment is presumably a phone number - confirm.
        $addressText = $theatreDom->find(".info")->first()->text();
        $tmp = explode(" - ", trim($addressText));
        array_pop($tmp);
        $theatre['address'] = join(' ', $tmp);

        $theatreCinemas[] = array(
            'theatre' => $theatre,
            'movies' => $this->extractMovieShowtimes($theatreDom),
        );
        $foundTheatres++;
    }

    // microtime(true) works in seconds; convert so the value matches the 'ms' label.
    $elapsedMs = round((microtime(true) - $startTime) * 1000, 2);
    \SystemLogger::info('Extraction done, completed in ', $elapsedMs, 'ms');

    return $theatreCinemas;
}
/**
 * HTML-oriented counterpart of {@link qp()}.
 *
 * Well-formed XHTML can go straight through {@link qp()}. Ordinary, non-XML
 * HTML is a different story: because of how libxml treats HTML, a handful of
 * settings are needed to parse it reliably. This helper applies them so the
 * caller does not have to.
 *
 * Unless the caller overrides them, these options are set automatically:
 * - ignore_parser_warnings: TRUE
 * - convert_to_encoding: ISO-8859-1 (the best for the HTML parser).
 * - convert_from_encoding: auto (autodetect encoding)
 * - use_parser: html
 *
 * Parser warnings are suppressed as well, so the application is not notified
 * when the parser emits one; the effect is the same as calling qp() with the
 * error-suppression operator, i.e. @qp().
 *
 * Warning: character set conversion only works when the Multi-Byte (mb)
 * extension is installed and enabled. It usually is, but not always.
 *
 * @ingroup querypath_core
 * @see qp()
 */
function htmlqp($document = NULL, $selector = NULL, $options = array())
{
    // Pure delegation: QueryPath::withHTML() does all of the work.
    $query = QueryPath::withHTML($document, $selector, $options);

    return $query;
}