/**
 * Read the captcha image at $imglocation, OCR it and type the result into $typeinbox.
 *
 * The current page source is normalised with tidy, the element matched by
 * $imglocation is looked up inside <body>, the image its "src" attribute points
 * to is downloaded to /tmp/mycaptcha.png and passed to TesseractOCR. The
 * recognised text is then sent to the element matched by $typeinbox.
 *
 * @param string $imglocation CSS selector of the captcha image element.
 * @param string $typeinbox   CSS selector of the input receiving the recognised text.
 *
 * @throws \RuntimeException When no image source is found, or the image cannot
 *                           be downloaded or written to disk.
 */
function captcha($imglocation, $typeinbox)
{
    $html = $this->getSource();
    $tidy = tidy_parse_string($html)->html()->value;

    $captchaurl = htmlqp($tidy, 'body')->branch($imglocation)->attr('src');
    if (empty($captchaurl)) {
        // Without this guard a null/empty src was passed straight to file_get_contents().
        throw new \RuntimeException('No captcha image source found for selector ' . $imglocation);
    }

    $image = file_get_contents($captchaurl);
    if ($image === false) {
        throw new \RuntimeException('Unable to download captcha image from ' . $captchaurl);
    }

    // NOTE(review): fixed, shared temp path kept for compatibility; concurrent runs overwrite each other.
    $saveimg = '/tmp/mycaptcha.png';
    if (file_put_contents($saveimg, $image) === false) {
        throw new \RuntimeException('Unable to store captcha image at ' . $saveimg);
    }

    $tesseract = new TesseractOCR($saveimg);
    $crackedvalue = $tesseract->recognize();

    $this->driver->findElement(WebDriverBy::CssSelector($typeinbox))->sendKeys($crackedvalue);
}
/**
 * Resolve the value of a mapped item field, caching the result per name.
 *
 * The mapping for $name comes from $this->itemMapping (falling back to $name
 * itself); a plain string mapping is treated as a selector. When the mapping
 * names a 'source', the value is extracted from the raw content fetched for
 * that source instead of from $this->item.
 *
 * @param string $name
 * @return string
 * @throws \RuntimeException When the mapping has neither a 'selector' nor a 'defaultValue'.
 */
public function extractValue($name)
{
    // Served from the per-name cache when already resolved.
    if (isset($this->extractedValues[$name])) {
        return $this->extractedValues[$name];
    }

    if (isset($this->itemMapping[$name])) {
        $config = $this->itemMapping[$name];
    } else {
        $config = $name;
    }
    if (is_string($config)) {
        $config = array('selector' => $config);
    }

    $hasDefault = !empty($config['defaultValue']);
    $hasSelector = !empty($config['selector']);

    if (!$hasSelector && !$hasDefault) {
        throw new \RuntimeException('Missing \'selector\' or \'defaultValue\' for ' . htmlentities($name) . ' mapping');
    }

    $result = $hasDefault ? $config['defaultValue'] : '';

    if ($hasSelector) {
        if (!empty($config['source'])) {
            $raw = $this->extractorService->extractValue($this->item, $config['source']);
            $raw = $this->extractorService->fetchRawContent($raw);
            try {
                $context = qp($raw);
            } catch (\QueryPath\Exception $e) {
                // Fall back to the HTML parser when the XML parser rejects the content.
                $context = htmlqp($raw);
            }
        } else {
            $context = $this->item;
        }
        $result = $this->extractorService->extractValue($context, $config, $result);
    }

    $this->extractedValues[$name] = $result;

    return $this->extractedValues[$name];
}
/**
 * Collect the trimmed text of every cell inside '.table-striped' tables.
 *
 * @param iterable $data_raw HTML documents to scan.
 * @return array Flat list of cell texts, in document order across all inputs.
 */
protected function processRawData($data_raw)
{
    $cells = [];

    foreach ($data_raw as $document) {
        $nodes = htmlqp($document)->find('.table-striped td')->get();
        foreach ($nodes as $node) {
            $cellText = htmlqp($node)->text();
            $cells[] = trim($cellText);
        }
    }

    return $cells;
}
/**
 * Download one article page and build an Article from its heading and first paragraph.
 *
 * A non-absolute $article_info is appended to $base_uri. The response is
 * converted from WINDOWS-1251 to UTF-8 for parsing, and the extracted strings
 * are converted back to WINDOWS-1251 before being handed to Article.
 *
 * @param \GuzzleHttp\Client $http_client
 * @param string $base_uri
 * @param mixed  $section      Not used by this implementation.
 * @param mixed  $section_uri  Not used by this implementation.
 * @param mixed  $full_uri     Not used by this implementation.
 * @param string $article_info Absolute or site-relative article URL.
 * @return Article
 */
protected function parseArticle(\GuzzleHttp\Client $http_client, $base_uri, $section, $section_uri, $full_uri, $article_info)
{
    $is_absolute = preg_match('/^https?:\\/\\//', $article_info);
    if (!$is_absolute) {
        $article_info = rtrim($base_uri, '/') . $article_info;
    }

    $response = $http_client->get($article_info);
    $markup = $response->getBody()->getContents();
    $markup = mb_convert_encoding($markup, 'UTF-8', 'WINDOWS-1251');

    $document = htmlqp($markup, null, ['convert_to_encoding' => 'UTF-8']);
    $title_node = $document->find('div.main-container div.content article h1')->get(0);
    $text_node = $document->find('div.main-container div.content div.article-content p')->get(0);

    $title_string = '';
    if ($title_node) {
        $title_string = trim(htmlqp($title_node)->text());
    }
    $text_string = '';
    if ($text_node) {
        $text_string = trim(htmlqp($text_node)->text());
    }

    $title_string = mb_convert_encoding($title_string, 'WINDOWS-1251', 'UTF-8');
    $text_string = mb_convert_encoding($text_string, 'WINDOWS-1251', 'UTF-8');

    return new Article($title_string, $text_string, $article_info);
}
/**
 * Rebuild the postings from the h2 headings of the page body.
 *
 * Author and date are derived from the '.documentByLine' text; every h2 that
 * is not classified as pseudo information becomes one posting linked to
 * $this->source.
 */
protected function processItems()
{
    // Start from a clean slate so repeated calls do not accumulate postings.
    $this->SetPostingsToEmpty();

    $page = htmlqp($this->GetRequestData());
    $byline = $page->find('.documentBottomLine')->children('.documentByLine');
    $author = $this->getAuthor($byline->text());
    $date = $this->convertDate($byline->text());
    $link = $this->source;

    $headings = $page->find('.documentContent')->find('div#bodyContent.plain')->children('h2');
    foreach ($headings as $heading) {
        if (!$this->isPseudoInfo($heading->text())) {
            $text = $this->prependText($heading->text());
            $text = $this->tidyText($text);
            $this->AppendToPostings($date, $author, $text, $link);
        }
        // pseudo "information" headings are skipped
    }
}
/**
 * Build endpoint definitions from the '.super_group' sections of an HTML page.
 *
 * Only groups whose id starts with "query-" are considered. Each definition is
 * keyed by the group's h2 text and holds 'fields', 'method' and 'urlPath'.
 *
 * @param string $html
 * @return array
 */
public function extractDataFromHtml($html)
{
    $result = array();

    /** @var \QueryPath $groups */
    $groups = htmlqp($html, '.super_group', array('ignore_parser_warnings' => true));

    /** @var \QueryPath\DOMQuery $section */
    foreach ($groups as $section) {
        if (strpos($section->attr('id'), "query-") !== 0) {
            continue;
        }

        $heading = $section->find('h2')->text();
        $path = $section->find('.method_details_list .url')->text();
        $verb = $section->find('.method_details_list .action')->text();
        $paramTables = $section->find('.parameters > .data_table');

        $result[$heading] = array(
            'fields' => $this->extractFieldsFromTables($paramTables),
            'method' => $verb,
            'urlPath' => $path,
        );
    }

    return $result;
}
/**
 * Scrape the cinema page and attach every listed movie to the cinema object.
 *
 * For each '.movie' element a Movie is created with its name, its meta
 * information (via set_movie_meta) and its showtimes, then added to the cinema.
 *
 * @return mixed The object returned by get_cinema(), populated with movies.
 */
public function scrape()
{
    $cinema_url = $this->get_url();
    $cinema = $this->get_cinema();
    $html = htmlqp($cinema_url);

    // Looked up but not used further; the calls are kept so the traversal sequence is unchanged.
    $theater = $html->find('h2')->text();
    $address = $html->top()->find('div.info', 0)->text();

    $movies = $html->top()->find('.movie');
    foreach ($movies as $entry) {
        $film = new Movie();
        $film->name = $entry->children('.name a')->text();

        $meta = $entry->parent()->children('.info')->text();
        $this->set_movie_meta($meta, $film);

        $timesText = $entry->parent()->find('.times')->text();
        $slots = explode("Â ", htmlentities($timesText));
        foreach ($slots as $slot) {
            $film->set_showtime($slot);
        }

        $cinema->set_movie($film);
    }

    return $cinema;
}
/**
 * Turn raw proxy-list pages into a flat list of proxy descriptions.
 *
 * Each input item is a pair: [0] the list type, [1] the HTML of the page.
 * Every row of 'table#proxylisttable' yields one array with the keys
 * protocol, ip, port, country_code, anonymity, google and https.
 *
 * @param iterable $data_raw
 * @return array
 * @throws ProxyProviderException When a row belongs to an unknown list type.
 */
protected function processRawData($data_raw)
{
    $result = [];

    foreach ($data_raw as $entry) {
        $kind = $entry[0];
        $markup = $entry[1];

        $rows = htmlqp($markup)->find('table#proxylisttable tbody tr')->get();
        foreach ($rows as $row) {
            $cells = htmlqp($row)->find('td')->get();
            $columns = [];
            foreach ($cells as $position => $cell) {
                $columns[$position] = trim(htmlqp($cell)->text());
            }

            if ($kind == FreeProxyListNetBackend::TYPE_HTTP) {
                $result[] = [
                    'protocol' => 'http',
                    'ip' => $columns[self::COLUMN_HTTP_IP],
                    'port' => (int) $columns[self::COLUMN_HTTP_PORT],
                    'country_code' => $columns[self::COLUMN_HTTP_COUNTRY_CODE],
                    'anonymity' => $columns[self::COLUMN_HTTP_ANONYMITY],
                    'google' => 'yes' === strtolower($columns[self::COLUMN_HTTP_GOOGLE]),
                    'https' => 'yes' === strtolower($columns[self::COLUMN_HTTP_HTTPS]),
                ];
            } elseif ($kind == FreeProxyListNetBackend::TYPE_SOCKS) {
                $isSocks5 = 'socks5' === strtolower($columns[self::COLUMN_SOCKS_VERSION]);
                $result[] = [
                    'protocol' => $isSocks5 ? 'socks5' : 'socks4',
                    'ip' => $columns[self::COLUMN_SOCKS_IP],
                    'port' => (int) $columns[self::COLUMN_SOCKS_PORT],
                    'country_code' => $columns[self::COLUMN_SOCKS_COUNTRY_CODE],
                    'anonymity' => $columns[self::COLUMN_SOCKS_ANONYMITY],
                    'google' => false,
                    'https' => 'yes' === strtolower($columns[self::COLUMN_SOCKS_HTTPS]),
                ];
            } else {
                throw new ProxyProviderException('Raw data is invalid');
            }
        }
    }

    return $result;
}
/**
 * Rebuild the postings from the event list inside '#tudevent_box'.
 *
 * Only '.portletContent' entries with exactly one date element and exactly one
 * linked-text element are used; entries whose date cannot be parsed by
 * getDates() are skipped. The author is always "n/a".
 */
protected function processItems()
{
    // Start from a clean slate so repeated calls do not accumulate postings.
    $this->SetPostingsToEmpty();

    $entries = htmlqp($this->GetRequestData(), '#tudevent_box', $this->overrideEncoding())->find('.portletContent');

    foreach ($entries as $entry) {
        if ($entry->children('.tudeventlist-eventdate')->count() != 1) {
            continue;
        }
        if ($entry->children('.tudeventlist-linkedtext')->count() != 1) {
            continue;
        }

        $rawDate = $entry->children('.tudeventlist-eventdate')->text();
        $date = $this->getDates($rawDate);
        if ($date === false) {
            // Event without a usable date; skip it.
            continue;
        }

        $link = $entry->children('.tudeventlist-linkedtext')->children('a')->attr('href');
        $label = $entry->children('.tudeventlist-linkedtext')->text();
        $text = $this->tidyText($this->prependText($label));

        // TODO feature enhancement: the author could be determined by following the linked
        // calendar entries; since the author is not printed anyway, this is not implemented.
        $author = "n/a";

        $this->AppendToPostings($date, $author, $text, $link);
    }
}
* @author M Butcher <*****@*****.**>
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 */

/** Include QueryPath. */
require_once '../src/QueryPath/QueryPath.php';

/**
 * Filter callback: keep only nodes whose text content contains 'Release'.
 *
 * Returning FALSE from a filter callback makes QueryPath drop the node from
 * the matches. $item is a DOMNode (actually a DOMElement), so it is wrapped in
 * `qp()` to read its text.
 *
 * @param int     $index Position of the node in the match list (unused).
 * @param DOMNode $item  The node being tested.
 * @return bool TRUE when 'Release' occurs in the node text, FALSE otherwise.
 */
function exampleCallback($index, $item)
{
    $position = strpos(qp($item)->text(), 'Release');

    return $position !== FALSE;
}

/*
 * The QueryPath call itself.
 *
 * Fetch the remote page, parse it and select the `a` tags inside the summary
 * heading, filter them through the callback above, then print the text of all
 * remaining matches joined by newlines.
 *
 * NOTE: On PHP 5.3 and later the callback can be defined inline instead of as
 * a stand-alone function.
 */
$summaryLinks = htmlqp('http://php.net/', 'h1.summary a');
$releaseLinks = $summaryLinks->filterCallback('exampleCallback');
print $releaseLinks->textImplode(PHP_EOL);
/**
 * Attach the article's thumbnail image to the given post.
 *
 * The first '.thumbinner img' source of the article content is used. The
 * 640px variant is preferred; when it does not pass file_exists() the 220px
 * variant is saved instead.
 *
 * NOTE(review): the source is prefixed with "http:" before saving, which
 * suggests a protocol-relative URL; file_exists() on such a value checks the
 * local filesystem, so the 640px branch may never be taken — confirm.
 *
 * @param int $the_post_id Post the image is attached to.
 */
private function _attach_image($the_post_id)
{
    $thumb = htmlqp($this->_my_article->content, '.thumbinner img')->attr("src");
    if (!isset($thumb)) {
        // No thumbnail in the article: nothing to attach.
        return;
    }

    $large = str_replace("220px-", '640px-', $thumb);
    $chosen = file_exists($large) ? $large : str_replace("640px-", '220px-', $large);

    $this->_save_image("http:" . $chosen, $the_post_id);
}
/**
 * Reduce a raw project record to the fields that are kept, rewriting its modules.
 *
 * Copies id, name, description, url, published_on (as 'published'), fields and
 * tags, caches the cover image, and then walks $data['modules']:
 *  - "text" modules have their markup rewritten (22px spans become <h2>, bold
 *    spans become <h3>), are stripped down to a small tag whitelist, and lose
 *    their 'text_plain' entry;
 *  - "image" modules lose their 'sizes' entry and get a cached 'src'.
 *
 * @param array $data Raw project record.
 * @return array The filtered project.
 */
private function _filterProject($data)
{
    $project = false;

    // keep only the important / filtered values
    $arr = array();
    $arr['id'] = $data['id'];
    $arr['name'] = $data['name'];
    $arr['description'] = $data['description'];
    // The cover stored under the highest key is used — presumably the largest size; verify against the data source.
    $imgxl = max(array_keys($data['covers']));
    $arr['cover'] = $this->_cacheImage($data['covers'][$imgxl], $data['id'], 'covers');
    $arr['url'] = $data['url'];
    $arr['published'] = $data['published_on'];
    $arr['fields'] = $data['fields'];
    $arr['tags'] = $data['tags'];

    // QueryPath is used from here on; $index tracks the position of the current module in $data['modules'].
    $index = 0;
    foreach ($data['modules'] as $module) {
        if ($module['type'] == "text") {
            $string = $module['text'];
            $qp = htmlqp($string);

            // Turn the div around a 22px span into an h2, keeping the style of its coloured span.
            $div = $qp->find('span[style="font-size: 22px;"]')->parent('div');
            $color = $div->find('span[style~="color:"]')->attr('style');
            $inner = $div->text();
            if (!$inner) {
                // Retry with the variant of the style attribute that has no space after the colon.
                $div = $qp->find('span[style="font-size:22px;"]')->parent('div');
                $color = $div->find('span[style~="color:"]')->attr('style');
                $inner = $div->text();
            }
            $div->html('<h2 style="' . $color . '">' . $inner . '</h2>');

            // Turn bold titles/subtitles into h3 elements.
            foreach ($qp->find('span.bold')->parent('div') as $div) {
                $newdiv = $div->find('span[style="font-size: 12px;"]')->parent('div');
                // NOTE(review): find()/parent() appear to return a query object rather than null — confirm.
                // If so, this fallback never runs and the check below is always true.
                if (is_null($newdiv)) {
                    $newdiv = $div->find('span[style="font-size:12px;"]')->parent('div');
                }
                if (!is_null($newdiv)) {
                    $inner = $div->text();
                    $color = $div->find('span[style~="color:"]')->attr('style');
                    $div->html('<h3 style="' . $color . '">' . $inner . '</h3>');
                }
            }

            // Only the listed tags survive in the stored markup.
            $markup = strip_tags($qp->html(), '<div><h2><h3><span><a><p>');
            unset($data['modules'][$index]['text_plain']);
            $data['modules'][$index]['text'] = $markup;
        } elseif ($module['type'] == "image") {
            unset($data['modules'][$index]['sizes']);
            $data['modules'][$index]['src'] = $this->_cacheImage($module['src'], $module['id'], 'images');
        }
        // next module
        $index++;
    }
    $arr['modules'] = $data['modules'];

    // array_push($project, $arr);
    $project = $arr;

    return $project;
}
/**
 * Test alternate constructors.
 *
 * Covers htmlqp() on the HTML stub, on sloppy markup, on HTML carrying an XML
 * declaration, on an HTML file that looks like XML, and on a DOM produced by
 * the HTML5 parser, and checks that CR/LF characters are not emitted as
 * numeric entities.
 *
 * @group basic
 */
public function testDOMQueryHtmlConstructors()
{
    $qp = htmlqp(\QueryPath::HTML_STUB);
    $this->assertEquals(1, count($qp->get()));
    $this->assertTrue($qp->get(0) instanceof \DOMNode);

    // Bad BR tag.
    $borken = '<html><head></head><body><br></body></html>';
    $qp = htmlqp($borken);
    $this->assertEquals(1, count($qp->get()));
    $this->assertTrue($qp->get(0) instanceof \DOMNode);

    // XHTML Faker
    $borken = '<?xml version="1.0"?><html><head></head><body><br></body></html>';
    $qp = htmlqp($borken);
    $this->assertEquals(1, count($qp->get()));
    $this->assertTrue($qp->get(0) instanceof \DOMNode);

    // HTML in a file that looks like XML.
    $qp = htmlqp(HTML_IN_XML_FILE);
    $this->assertEquals(1, count($qp->get()));
    $this->assertTrue($qp->get(0) instanceof \DOMNode);

    // HTML5
    $html5 = new \Masterminds\HTML5();
    $dom = $html5->loadHTML(\QueryPath::HTML_STUB);
    qp($dom, 'html');

    // Stripping #13 (CR) from HTML: the serialized output must not contain the
    // numeric entity for CR. Searching for a plain space (as before) matched
    // ordinary markup and did not test what the message describes.
    $borken = '<html><head></head><body><p>' . chr(13) . '</p><div id="after"/></body></html>';
    $this->assertFalse(strpos(htmlqp($borken)->html(), '&#13;'), 'Test that CRs are not encoded.');

    // Regression for #58: make sure line feeds are not emitted as &#10;.
    // The fixture needs real LF characters for the check to be meaningful.
    $borken = "<html><head><style>\n.BlueText {\n  color:red;\n}</style><body></body></html>";
    $this->assertFalse(strpos(htmlqp($borken)->html(), '&#10;'), 'Test that LF is not encoded.');

    // Low ASCII in a file
    $borken = '<html><head></head><body><p>' . chr(27) . '</p><div id="after"/></body></html>';
    $this->assertEquals(1, htmlqp($borken, '#after')->size());
}
/**
 * Preprocess menu blocks: wrap them in a <nav> and expose a top-level item count.
 *
 * Applies to blocks provided by the 'menu' or 'berklee_site_section' modules,
 * to the system main menu block, and to blocks whose css_class contains
 * 'main-menu-block' (the latter only when render elements are present).
 *
 * The number of menu links is taken from $variables['elements']; when that
 * yields nothing and the block carries the 'main-menu-block' class, the
 * already-rendered content is parsed with QueryPath (if available) and its
 * list items are counted after nested lists have been removed. A positive
 * count is added as an 'item-count-N' class and an 'item-count' attribute.
 *
 * @param array $variables Block template variables, modified in place.
 */
function jjamerson_preprocess_block(&$variables)
{
    $block = $variables['block'];

    // Explicit strict check instead of "strpos() > -1", which only works through
    // PHP's bool/int comparison rules; also tolerates a block without css_class.
    $has_main_menu_class = isset($block->css_class)
        && strpos($block->css_class, 'main-menu-block') !== false;

    // Parenthesised to make the existing precedence visible: the isset() guard
    // has only ever applied to the css_class alternative.
    $is_menu_block = $block->module === 'menu'
        || $variables['block_html_id'] === 'block-system-main-menu'
        || $block->module === 'berklee_site_section'
        || ($has_main_menu_class && isset($variables['elements']));

    if (!$is_menu_block) {
        return;
    }

    if (isset($block->subject) && $block->subject > '') {
        $aria_label = "aria-label='" . strip_tags($block->subject) . "'";
    } else {
        $aria_label = '';
    }

    /* Add an item count to the menu itself */
    $counter = 0;
    // The first three alternatives above do not guarantee 'elements' exists.
    if (isset($variables['elements']) && is_array($variables['elements'])) {
        foreach ($variables['elements'] as $element) {
            if (is_array($element) && isset($element['#original_link'])) {
                $counter++;
            }
        }
    }

    // This may be a block where the menu is already rendered in the content region.
    // If so, parse the content region for parent-level list items.
    if ($counter === 0 && isset($variables['content']) && in_array('main-menu-block', $variables['classes_array'])) {
        // QueryPath does the parsing. https://www.drupal.org/project/querypath | http://querypath.org/
        if (function_exists('htmlqp')) {
            try {
                $content_qp = htmlqp($variables['content']);
                // Drop nested lists so only parent-level items are counted.
                $content_qp->remove('ul ul');
                $content_qp->top('ul');
                $counter = $content_qp->find('li')->length;
            } catch (Exception $e) {
                // Best effort: unparsable content simply leaves the block without an item count.
            }
        }
    }

    if ($counter > 0) {
        /* Add it both as a class and as an attribute. The attribute is easier to grab & work with in JS. */
        $variables['classes_array'][] = 'item-count-' . $counter;
        $variables['attributes_array']['item-count'] = $counter;
    }

    $variables['content'] = "<nav role='navigation' {$aria_label}>" . $variables['content'] . '</nav>';
}
<?php
/**
 * Urban Dictionary Random Word Generator
 *
 * Picks a random listing page, then prints one randomly chosen word from it
 * followed by the definition at the same position.
 *
 * @author Emily Brand
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 * @see http://www.urbandictionary.com/
 */
require_once '../src/QueryPath/QueryPath.php';

echo '<h3>Urban Dictionary Random Word Generator</h3>';

// Random listing page, parsed starting at the '#home' element.
$pageNumber = rand(0, 288);
$listing = htmlqp('http://www.urbandictionary.com/?page=' . $pageNumber, '#home');

// Random entry position on that page; word and definition share the index.
$position = rand(0, 7);

$word = $listing->find('.word')->eq($position)->text();
echo $word . '<br />';

$definition = $listing->top()->find('.definition')->eq($position)->text();
echo $definition;
/**
 * Fetch a page and extract basic post metadata from it.
 *
 * The returned array always contains 'url' and 'title' and, when they can be
 * determined, 'image', 'date_published' and 'author'.
 *
 * - title: og:title, then twitter:title, then the <title> text.
 * - image: og:image, then twitter:image:src. The image is copied into
 *   FCPATH/images/cache/xx/yy/ and 'image' becomes its public
 *   "/images/cache/..." path; if the copy does not succeed 'image' is removed.
 * - date_published: article:published_time meta, otherwise the single <time>
 *   element matched by 'time', 'article time' or 'article header time'.
 * - author: a single '.author' element, a single author meta tag, or the
 *   element whose class attribute contains "author" and is unique on the page.
 *
 * @param string $url Page to inspect.
 * @return array
 */
static function get_post($url)
{
    require_once APPPATH . 'third_party/querypath-3.0.4/src/qp.php';
    libxml_use_internal_errors(true);
    $qp = htmlqp($url);
    $data = array('url' => $url);

    // Title
    $title = $qp->find("meta[property='og:title']");
    if (!$title->count()) {
        $title = $qp->find("meta[property='twitter:title']");
    }
    $data['title'] = $title->count() ? $title->attr('content') : $qp->find("title")->text();

    // Image
    $image = $qp->find("meta[property='og:image']");
    if (!$image->count()) {
        $image = $qp->find("meta[property='twitter:image:src']");
    }
    if ($image->count()) {
        $data['image'] = $image->attr('content');
    }

    // Cache the image locally. !empty() avoids the undefined-index notice the
    // previous "$data['image']" test raised when no image meta tag was present.
    if (!empty($data['image'])) {
        $original_url = $data['image'];

        if (!preg_match('#^https?://#i', $original_url)) {
            // The value comes from the remote page: never let copy() read local
            // paths or other stream wrappers into the public cache directory.
            unset($data['image']);
        } else {
            $md5 = substr(md5($original_url . mt_rand()), 0, 12);
            $dir_a = substr($md5, 0, 2);
            $dir_b = substr($md5, 2, 2);
            $file = substr($md5, 4);

            // Extension of the URL path, ignoring any query string.
            $path = $original_url;
            $qpos = strpos($path, "?");
            if ($qpos !== false) {
                $path = substr($path, 0, $qpos);
            }
            $extension = pathinfo($path, PATHINFO_EXTENSION);
            // Only keep known image extensions; anything else (e.g. ".php") must not
            // end up as an executable file name in a web-accessible directory.
            $allowed = array('jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp');
            if ($extension != "" && in_array(strtolower($extension), $allowed, true)) {
                $file .= "." . $extension;
            }

            $local_dir = FCPATH . "images" . DIRECTORY_SEPARATOR . "cache" . DIRECTORY_SEPARATOR
                . $dir_a . DIRECTORY_SEPARATOR . $dir_b . DIRECTORY_SEPARATOR;
            $local = $local_dir . $file;
            $public_url = "/images/cache/" . $dir_a . '/' . $dir_b . '/' . $file;

            if (!is_dir($local_dir)) {
                mkdir($local_dir, 0777, TRUE);
            }
            copy($original_url, $local);

            if (file_exists($local)) {
                $data['image'] = $public_url;
            } else {
                unset($data['image']);
            }
        }
    }

    // Date. ">= 1" rather than "> 1": a page normally carries exactly one
    // article:published_time tag, which the previous test skipped.
    $date = $qp->find("meta[property='article:published_time']");
    if ($date->count() >= 1) {
        $data['date_published'] = $date->attr('content');
    } else {
        // Progressively more specific selectors; the first one matching exactly one element wins.
        foreach (array('time', 'article time', 'article header time') as $selector) {
            $date = $qp->find($selector);
            if ($date->count() == 1) {
                $data['date_published'] = $date->attr('datetime') ? $date->attr('datetime') : $date->text();
                break;
            }
        }
    }

    // Author
    $author = $qp->find(".author");
    if ($author->count() == 1) {
        $data['author'] = $author->text();
    }
    if (!isset($data['author'])) {
        $author = $qp->find("meta[name=author]");
        if ($author->count() == 1) {
            $data['author'] = $author->attr('content');
        }
    }
    if (!isset($data['author'])) {
        // Look for a class attribute containing "author" that occurs exactly once.
        $author = $qp->find("*[class*='author']");
        $classes = [];
        foreach ($author->get() as $el) {
            $classes[] = $el->getAttribute('class');
        }
        $classes = array_count_values(array_map('strtolower', $classes));
        foreach ($classes as $class => $count) {
            if ($count == 1) {
                $data['author'] = $qp->find('.' . str_replace(' ', '.', $class))->text();
                break;
            }
        }
    }

    return $data;
}
/**
 * Parse a string into a DOMQuery, trying qp() first and htmlqp() as fallback.
 *
 * @param string $string
 * @return \QueryPath\DOMQuery
 */
public function stringToDOMQuery($string)
{
    try {
        return qp($string);
    } catch (\QueryPath\Exception $e) {
        // qp() rejected the input; parse it as HTML instead.
        return htmlqp($string);
    }
}
<?php
/**
 * Basic example of QueryPath usage.
 *
 * The first statement creates a new HTML document from the stub, puts the
 * typical 'Hello World' text into the body and writes the result to standard
 * out (which is flushed to a web browser in most cases). The remaining
 * statements append a few elements to a second stub document and print
 * 'print' or 'dont print' depending on whether a nested paragraph query
 * produces any HTML.
 *
 * The important methods covered here are {@link qp()}, which is the {@link QueryPath}
 * factory function, {@link QueryPath::find()}, which is the primary searching
 * function, and {@link QueryPath::writeHTML()}, which is a utility function.
 *
 * This file is fully explained in the official QueryPath tutorial, located
 * at {@link https://fedorahosted.org/querypath/wiki/QueryPathTutorial}
 *
 * @author M Butcher <*****@*****.**>
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 * @see qp()
 * @see QueryPath::find()
 * @see QueryPath::writeHTML()
 * @see html.php
 * @see https://fedorahosted.org/querypath/wiki/QueryPathTutorial The Official Tutorial
 */
require_once '../src/QueryPath/QueryPath.php';

// Hello World: stub document, body text, write out.
$hello = qp(QueryPath::HTML_STUB);
$hello->find('body')->text('Hello World')->writeHTML();

// Second document, positioned on <body>.
$qp = htmlqp(QueryPath::HTML_STUB, 'body');
$appended = $qp->append('<div></div><p id="cool">Hello</p><p id="notcool">Goodbye</p>');
$appended->children('p')->after('<p id="new">new paragraph</p>');

// Paragraphs nested directly inside paragraphs; their HTML decides what is printed.
$nestedHtml = $qp->find('p')->children('p')->html();
if ($nestedHtml) {
    echo 'print';
} else {
    echo 'dont print';
}
// ->writeHTML();
// }, $url); $url = urlencode($url); } // 測試檔案 //$url = "query_test/found_book_link.html"; //$url = "query_test/found_book_available.html"; //$url = "query_test/found_book_multi_available.html"; //$url = "query_test/isbn_not_found.html"; //$url = "query_test/found_book_not_available.html"; //echo $url; //$content = file_get_contents($url); //echo $content; //exit(); // -------------------------------------------------- require 'lib/querypath/src/qp.php'; $qp = htmlqp($url); //echo $url; //echo $qp->html(); if ($qp->find('.msg td:contains("無查獲符合查詢條件的館藏;相近 國際標準號碼 是:")')->size() > 0 || $qp->find('.msg td:contains("無查獲符合的,可用相近 國際標準號碼 的是:")')->size() > 0) { // --------------------------------------------- // isbn_not_found // --------------------------------------------- $data = array("error" => "NOT_FOUND"); } else { if ($qp->find('.bibItemsEntry td:contains("可流通")')->size() === 0) { // --------------------------------------------- // found_book_not_available // --------------------------------------------- $full_title = $qp->find('.bibInfoLabel:contains("題名/作者")')->eq(0)->next()->find("strong:first")->text(); $title = substr($full_title, 0, strpos($full_title, " / ")); $title = trim($title);
<?php
/**
 * Helper for bracket.php. Pulls the "official" challonge bracket, removes the stuff we don't need
 * and styles it to match our theme.
 *
 * @license http://www.gnu.org/licenses/gpl-3.0.txt GNU General Public License 3
 * @author Sylae Jiendra Corell <*****@*****.**>
 */
require_once 'config.php';
require 'qp.php'; // QueryPath 3.x entry point; the 2.x line is deliberately not used.

// Download the bracket module page for the configured tournament.
$bracketUrl = "http://challonge.com/" . $config['challonge_id'] . "/module?theme=2&&match_width_multiplier=0.8";
$data = file_get_contents($bracketUrl);

// Strip scripts, the promo box and the "live" stamp.
$info = htmlqp($data)->remove("script, #challonge_promo, .live_stamp");

// writeHTML() prints directly, so capture its output in a buffer.
ob_start();
$info->writeHTML();
$html = ob_get_clean();

// Inject our stylesheet before </head> and simplify the match identifier links.
$rewrites = array(
    "</head>" => '<link rel="stylesheet" href="css/bracket_over.css" type="text/css" /></head>',
    '<a class="btn btn-link match_identifier dropdown-toggle">' => '<a class="match_identifier">',
);
echo str_replace(array_keys($rewrites), array_values($rewrites), $html);
/**
 * Download one article page and build an Article from its heading and first paragraph.
 *
 * When $article_info is not an absolute http(s) URL, every '/pda/' in it is
 * replaced by $full_uri before the request is made.
 *
 * @param \GuzzleHttp\Client $http_client
 * @param mixed  $base_uri     Not used by this implementation.
 * @param mixed  $section      Not used by this implementation.
 * @param mixed  $section_uri  Not used by this implementation.
 * @param string $full_uri     Replacement for the '/pda/' part of relative links.
 * @param string $article_info Absolute or relative article URL.
 * @return Article
 */
protected function parseArticle(\GuzzleHttp\Client $http_client, $base_uri, $section, $section_uri, $full_uri, $article_info)
{
    $is_absolute = preg_match('/^https?:\\/\\//', $article_info);
    if (!$is_absolute) {
        $article_info = str_replace('/pda/', $full_uri, $article_info);
    }

    $response = $http_client->get($article_info);
    $markup = $response->getBody()->getContents();

    $document = htmlqp($markup, null, ['convert_to_encoding' => 'UTF-8']);
    $title_node = $document->find('div#maincontent div.body div.h h2')->get(0);
    $text_node = $document->find('div#maincontent div.body p')->get(0);

    $title_string = '';
    if ($title_node) {
        $title_string = trim(htmlqp($title_node)->text());
    }
    $text_string = '';
    if ($text_node) {
        $text_string = trim(htmlqp($text_node)->text());
    }

    return new Article($title_string, $text_string, $article_info);
}
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Web Crawler</title>
</head>
<body>
<h1>Web Crawler Project - 1</h1>
<?php
include __DIR__ . '/qp/qp.php';

// Page whose track list is printed.
$initial_url = 'https://pro.beatport.com/genre/deep-house/12/tracks';
$content = file_get_contents($initial_url);

// Parse the fetched markup, starting at the body tag.
$qp = htmlqp($content, 'body');

echo '<pre>';
if ($qp->length > 0) {
    // There is a body to work with: print the primary title of every track.
    foreach ($qp->find('.track') as $track) {
        $trackTitle = $track->find('.buk-track-primary-title')->first()->text();
        echo 'Track Found:' . $trackTitle . "\r\n";
    }
}
?>
</body>
</html>
require __DIR__ . '/spotify-web-api-php-master/src/Session.php'; require __DIR__ . '/spotify-web-api-php-master/src/SpotifyWebAPIException.php'; $api = new SpotifyWebAPI\SpotifyWebAPI(); $pages = array('https://pro.beatport.com/genre/deep-house/12/tracks'); $done = array(); $final_page = isset($_GET['pages']) ? $_GET['pages'] : 1; //echo '<pre>'; $ipCount = 0; while ($pages) { set_time_limit(0); $link = array_shift($pages); $done[] = $link; //$content = file_get_contents($link); $content = getFile($link); //load qp with content fetched and initialise from body tag $htmlqp = @htmlqp($content, 'body'); if ($htmlqp->length > 0) { //we have some data to parse. $tracks = $htmlqp->find('.track'); foreach ($tracks as $track) { $title = $track->find('.buk-track-primary-title')->first()->text(); $artist = $track->find('.buk-track-artists > a')->first()->text(); $link_to_track = 'https://pro.beatport.com' . $track->find('.buk-track-title > a')->first()->attr('href'); //CHECK IF ARTIST ALREADY EXIST IN DATABASE, PRIOR TO SEARCHING ON SPOTIFY $artist_spotify_id = $db->querySingle("select Artist_spotify_id from artist where Artist_name='" . SQLite3::escapeString($artist) . "'"); //Like msq_realescape. if (!$artist_spotify_id) { // If not in database -- get id via Spotify API. $spotify_artist = $api->search($artist, 'artist'); //Geting artist id via Spotify Api foreach ($spotify_artist->artists->items as $spotify_id) {
/**
 * Fetch a page, rewrite its URLs, run its headings through inBedElement(), print it and exit.
 *
 * - href/src values of link, script and img elements that do not contain
 *   'http' are prefixed with the page's scheme and host.
 * - Root-relative @import rules in style elements are pointed at the page's host.
 * - Anchors: same-host absolute links and relative links are routed through
 *   http://withbaconfy.com/.
 *
 * Returns without output when the page cannot be loaded.
 *
 * @param string $url Page to transform.
 */
function inBedify($url)
{
    require 'QueryPath/QueryPath.php';

    $page = htmlqp($url);
    if (!$page) {
        return;
    }

    $url_parts = parse_url($url);
    // Check response code @todo
    $origin = $url_parts['scheme'] . '://' . $url_parts['host'];

    // Convert relative URLs to absolute: link[href] first, then script[src].
    foreach (array('link' => 'href', 'script' => 'src') as $tag => $attribute) {
        foreach (qp($page, $tag) as $node) {
            if (!$node->hasAttr($attribute)) {
                continue;
            }
            $value = $node->attr($attribute);
            if (strpos($value, 'http') === false) {
                $node->attr($attribute, $origin . '/' . ltrim($value, '/'));
            }
        }
    }

    foreach (qp($page, 'style') as $style) {
        $found = preg_match('/@import ?["\']\/(.*)/', $style->text(), $matches);
        if ($found && count($matches) > 1) {
            $style->text('@import "' . $origin . '/' . $matches[1]);
        }
    }

    foreach (qp($page, 'img') as $img) {
        $src = $img->attr('src');
        if (strpos($src, 'http') === false) {
            $img->attr('src', $origin . '/' . ltrim($src, '/'));
        }
    }

    // Rewrite same-domain URLs to run through InBedify.
    foreach (qp($page, 'a') as $a) {
        $href = $a->attr('href');
        if (strpos($href, 'http') === false) {
            // Relative URL.
            $a->attr('href', 'http://withbaconfy.com/' . $origin . '/' . ltrim($href, '/'));
            continue;
        }
        // Absolute URL: only rewrite links pointing at the same host.
        $host = parse_url($href, PHP_URL_HOST);
        if ($host == $url_parts['host']) {
            //@todo
            $a->attr('href', 'http://withbaconfy.com/' . $href);
        }
    }

    // InBedify!
    // Speed this up @todo
    foreach (array('h1', 'h2', 'h3') as $level) {
        foreach (qp($page, $level) as $header) {
            inBedElement($header);
        }
    }

    print $page->html();
    exit;
}
/**
 * Download a page and parse it; in debug mode the HTML is cached on disk.
 *
 * In debug mode the HTML is stored under the configured "tmp" location, keyed
 * by the md5 of the URL, and reused on later calls instead of downloading
 * again.
 *
 * Supported $options keys (only their presence is checked):
 *  - "onlyBody": keep only what lies between '<body>' and '</body>' to reduce
 *    the size of the string that is parsed. Applies only when the page contains
 *    a literal '<body>' tag without attributes.
 *  - "encoding": run the HTML through fixEncoding() before parsing.
 *
 * @param string $url
 * @param array $options
 *
 * @return QueryPath
 */
function getContent($url, $options = [])
{
    if (App::config("debug")) {
        $fileCache = App::config("tmp") . md5($url);
        if (file_exists($fileCache)) {
            $html = file_get_contents($fileCache);
            if (isset($options["encoding"])) {
                $html = $this->fixEncoding($html);
            }
            return htmlqp($html);
        }
    }

    $html = Utils::curl($url);

    // lower string size
    if (isset($options["onlyBody"])) {
        // strpos() takes the haystack first; the arguments used to be swapped,
        // so '<body>' was never found and this trimming never happened.
        if (strpos($html, '<body>') !== false) {
            $tmp = explode('<body>', $html);
            $tmp = explode('</body>', $tmp[1]);
            $html = $tmp[0];
        }
    }

    if (App::config("debug")) {
        file_put_contents($fileCache, $html);
    }

    if (isset($options["encoding"])) {
        $html = $this->fixEncoding($html);
    }

    return htmlqp($html);
}
/**
 * Rebuild the postings from the page's first heading or from the configured heading.
 *
 * Author and date are derived from the '.documentByLine' text. When
 * $this->heading is false, every 'h1.documentFirstHeading' becomes a posting;
 * otherwise a single posting is created from $this->heading.
 */
protected function processItems()
{
    // Start from a clean slate so repeated calls do not accumulate postings.
    $this->SetPostingsToEmpty();

    $page = htmlqp($this->GetRequestData());
    $byline = $page->find('.documentBottomLine')->children('.documentByLine');
    $author = $this->getAuthor($byline->text());
    $date = $this->convertDate($byline->text());
    $link = $this->source;

    if ($this->heading !== false) {
        // A fixed heading is configured: use it as-is (it is not run through tidyText()).
        $this->AppendToPostings($date, $author, $this->prependText($this->heading), $link);
        return;
    }

    foreach ($page->find('h1.documentFirstHeading') as $headline) {
        $text = $this->prependText($headline->text());
        $text = $this->tidyText($text);
        $this->AppendToPostings($date, $author, $text, $link);
    }
}
/**
 * Download the page at the given URL and parse it as HTML.
 *
 * @param string $URL Address of the page to fetch.
 * @return QueryPath\DOMQuery The parsed document.
 * @throws Exception When the download yields nothing or the result cannot be parsed.
 */
protected function GetDocumentFromURL(string $URL) : QueryPath\DOMQuery
{
    $this->PrintLine(">> Fetching {$URL}");

    $Source = file_get_contents($URL);
    if (!$Source) {
        throw new Exception("unable to fetch {$URL}");
    }

    // Parser warnings are silenced; only the parse result is checked.
    $Parsed = @htmlqp($Source);
    if ($Parsed) {
        return $Parsed;
    }

    throw new Exception("unable to parse {$URL}");
}
/**
 * Inspect the page given in the "url" query parameter and output JSON suggestions.
 *
 * Responds with status 401 (and no body) when no user or user company is set.
 * Otherwise the JSON output may contain:
 *  - ga: 1 when analytics.js is referenced, 2 when ga.js is referenced;
 *  - author_text / author_class: the detected author and the selector that found it;
 *  - url_text / url_option: the page URL from og:url (1), the canonical link (2)
 *    or the request parameter itself (3);
 *  - error: when no "url" parameter was supplied.
 */
function get_url_suggestions()
{
    if (!isset($this->user, $this->user_company)) {
        set_status_header(401);
        return;
    }

    require_once APPPATH . 'third_party/querypath-3.0.4/src/qp.php';

    $data = [];
    $url = $this->input->get("url");

    if (!$url) {
        $data["error"] = "We couldn't retrieve the page.";
    } else {
        libxml_use_internal_errors(true);
        $qp = htmlqp($url);

        // Google Analytics flavour, detected on the lower-cased markup.
        $html = strtolower($qp->html());
        if (strpos($html, 'www.google-analytics.com/analytics.js') !== FALSE) {
            $data['ga'] = 1;
        } elseif (strpos($html, 'www.google-analytics.com/ga.js') !== FALSE) {
            $data['ga'] = 2;
        }

        // Author: '.author' element, then the author meta tag, then a unique "author"-like class.
        $byClass = $qp->find(".author");
        if ($byClass->count() == 1) {
            $data['author_text'] = $byClass->text();
            $data['author_class'] = '.author';
        } else {
            $byMeta = $qp->find("meta[name=author]");
            if ($byMeta->count() == 1) {
                $data['author_text'] = $byMeta->attr('content');
                $data['author_class'] = 'meta[name=author]';
            } else {
                $candidates = $qp->find("*[class*='author']");
                $classes = [];
                foreach ($candidates->get() as $el) {
                    $classes[] = $el->getAttribute('class');
                }
                $classes = array_count_values(array_map('strtolower', $classes));
                foreach ($classes as $class => $count) {
                    if ($count == 1) {
                        $selector = '.' . str_replace(' ', '.', $class);
                        $data['author_text'] = $qp->find($selector)->text();
                        $data['author_class'] = $selector;
                        break;
                    }
                }
            }
        }

        // URL: og:url, then the canonical link, then the requested URL itself.
        $ogUrl = $qp->find("meta[property='og:url']");
        if ($ogUrl->count() >= 1) {
            $data['url_text'] = $ogUrl->attr('content');
            $data['url_option'] = 1;
        } else {
            $canonical = $qp->find("link[rel='canonical']");
            if ($canonical->count() >= 1) {
                $data['url_text'] = $canonical->attr('href');
                $data['url_option'] = 2;
            } else {
                $data['url_text'] = $this->input->get("url");
                $data['url_option'] = 3;
            }
        }
    }

    $this->output->set_content_type('application/json')->set_output(json_encode($data));
}
<?php
#require_once APPPATH."/third_party/querypath-2.1.2/QueryPath/QueryPath.php";
require_once APPPATH . "/libraries/QueryPath2.php";
#qp('http://127.0.0.1:8090/common/main/sndmail_00700211')->find('test1')->text('Hello World')->writeHTML();

/**
 * Dump a value with print_r() inside an <xmp> element so it is shown verbatim.
 *
 * @param mixed $arr Value to dump.
 */
function xmp_print($arr)
{
    echo '<xmp>' . print_r($arr, true) . '</xmp>';
}

// Sample document: two paragraphs carry class "a", the third has no class.
$html = '<!DOCTYPE html> <html> <head> <title>예제</title> </head> <body> <p class="a" id="test1">다람쥐 헌 쳇바퀴<br>타고파.</p> <p class="a" id="test2">다람쥐가노래를한<b>다</b> 람쥐.</p> <p>다람쥐</p> </body> </html>';

// Dump every p.a child of <body>.
$paragraphs = htmlqp($html, 'body', array('convert_to_encoding' => 'utf-8'))->children('p.a');
foreach ($paragraphs as $paragraph) {
    xmp_print($paragraph);
}
/**
 * Given a QTI XML file, extract questions.
 *
 * Each <item> element yields one array with the keys 'title', 'type',
 * 'content', 'answers' and 'num_of_correct_answers'. Every answer is an array
 * with 'text', 'index', 'is_correct', 'feedback' and 'weight'.
 *
 * @param mixed $file Passed straight to qp() — presumably a path to, or the content of, a QTI XML document; confirm with callers.
 * @return array List of question arrays, in document order.
 */
function qti_extract_info($file) {
  $items = array();
  foreach(qp($file, 'item') as $item) {
    // Negative score: the setvar of the respcondition whose condition is a negated varequal.
    $negative_score = $item->branch()->xpath('/questestinterop/item/resprocessing/respcondition/conditionvar/not/varequal/../../../setvar')->text();
    // Feedback for incorrect answers: follow the linkrefid of the respcondition whose setvar is <= 0.
    // NOTE(review): these XPath expressions are absolute (start at /questestinterop), so they
    // may match across all items of the document rather than only the current one — confirm.
    $feedback_incorrect_linkrefid = $item->branch()->xpath('/questestinterop/item/resprocessing/respcondition/setvar[text()<=0]/../displayfeedback/@linkrefid')->text();
    $feedback_incorrect = node_to_text($item->branch()->xpath('/questestinterop/item/itemfeedback[@ident="' . $feedback_incorrect_linkrefid . '"]/material'), true);
    $title = $item->attr('title');
    $type = $item->find('itemmetadata>qmd_itemtype')->text();
    $body = $item->end()->find('presentation>material');
    if ($body->attr('texttype') == 'text/html') {
      $bodytext = $body->text();
      // Wrap fragments so loadHTML() receives a complete document.
      if (strpos($bodytext, '<html') === FALSE) {
        $bodytext = '<html>' . $bodytext . '</html>';
      }
      $doc = new DOMDocument();
      // Suppress the warnings loadHTML() emits for malformed markup.
      @$doc->loadHTML($bodytext);
      $html = htmlqp($doc, 'body');
      // Handles emphasized text
      $contents = $html->get(0)->childNodes;
      // Extract HTML content
      $newdoc = qp();
      $i = 0;
      // NOTE(review): the loop appends the whole $contents list on every iteration
      // and never uses $node — verify whether $node was intended.
      while ($node = $contents->item($i++)) {
        $newdoc->append($contents);
      }
      $out = strip_tags($newdoc->html()); // This leaves off XML declaration.
    }
    else {
      //$out = strip_tags($body->text());
      $out = node_to_text($body, false);
    }
    $new_item = array(
      'title' => $title,
      'type' => $type,
      'content' => $out,
      'answers' => array()
    );
    $answers = array();
    // Get all answers and loop through them.
    $answerstexts = $item->parent('item')->find('response_lid>render_choice>response_label>material>mattext');
    $num_of_correct_answers = 0;
    $answers = array();
    foreach ($answerstexts as $answertext) {
      $text = $answertext->text();
      // Identifier of the answer's response_label; used to locate its scoring condition.
      $index = $answertext->parent('response_label')->attr('ident');
      $filter_weight = 'resprocessing>respcondition>conditionvar>varequal:contains(' . $index . ')';
      $weight = $answertext->parent('item')->find($filter_weight)->parent('respcondition')->find('setvar')->text();
      $index_feedback = $answertext->end()->parent('item')->find($filter_weight)->parent('respcondition')->find('displayfeedback')->attr('linkrefid');
      $filter_feedback = '//itemfeedback[@ident="' . $index_feedback . '"]';
      $feedback = "";
      $feedback = $answertext->end()->parent('item')->xpath($filter_feedback)->text();
      // No explicit weight for this answer: fall back to the item's negative score and incorrect feedback.
      if($weight == '') {
        $feedback = $feedback_incorrect;
        $weight = $negative_score;
      }
      // NOTE(review): $is_correct is assigned but not read afterwards; the stored flag is computed from $weight directly.
      $is_correct = false;
      if($weight > 0) {
        $is_correct = true;
        $num_of_correct_answers++;
      }
      $answers[] = array(
        'text' => $text,
        'index' => $index,
        'is_correct' => $weight>0,
        'feedback' => $feedback,
        'weight' => $weight
      );
      // Store answers
      $new_item['answers'] = $answers;
    }
    $new_item['num_of_correct_answers'] = $num_of_correct_answers;
    // Store questions
    $items[] = $new_item;
  }
  return $items;
}