/**
  * Read the CAPTCHA image on the current page, OCR it and type the result into a field.
  *
  * The page source is cleaned with tidy, the image URL is taken from the `src`
  * attribute of the element matched by $imglocation, the image is downloaded to
  * /tmp/mycaptcha.png and run through TesseractOCR. The recognised text is then
  * sent as keystrokes to the element matched by $typeinbox.
  *
  * @param string $imglocation CSS selector of the captcha <img> element.
  * @param string $typeinbox   CSS selector of the input that receives the recognised text.
  *
  * @throws \RuntimeException When the page cannot be parsed, the image has no
  *                           source, or the image cannot be downloaded or stored.
  */
 function captcha($imglocation, $typeinbox)
 {
     $html = $this->getSource();
     $document = tidy_parse_string($html);
     if ($document === false) {
         throw new \RuntimeException('Unable to parse the page source with tidy');
     }
     $tidy = $document->html()->value;
     $searchqp = htmlqp($tidy, 'body');
     $captchaurl = $searchqp->branch($imglocation)->attr('src');
     if (empty($captchaurl)) {
         throw new \RuntimeException('No captcha image source found for selector ' . $imglocation);
     }
     // Fail loudly instead of handing an empty file to the OCR engine.
     $imagedata = file_get_contents($captchaurl);
     if ($imagedata === false || $imagedata === '') {
         throw new \RuntimeException('Unable to download captcha image from ' . $captchaurl);
     }
     $saveimg = '/tmp/mycaptcha.png';
     if (file_put_contents($saveimg, $imagedata) === false) {
         throw new \RuntimeException('Unable to store captcha image at ' . $saveimg);
     }
     $tesseract = new TesseractOCR($saveimg);
     $crackedvalue = $tesseract->recognize();
     $this->driver->findElement(WebDriverBy::CssSelector($typeinbox))->sendKeys($crackedvalue);
 }
 /**
  * Resolve the value configured for a mapping name, caching the result.
  *
  * A mapping is looked up in $this->itemMapping; when none exists the name itself
  * is used as the selector. A plain string mapping is shorthand for
  * array('selector' => <string>).
  *
  * @param string $name
  * @return string
  * @throws \RuntimeException When the mapping has neither a selector nor a default value.
  */
 public function extractValue($name)
 {
     // Already resolved earlier: reuse the stored result.
     if (isset($this->extractedValues[$name])) {
         return $this->extractedValues[$name];
     }

     if (isset($this->itemMapping[$name])) {
         $mapping = $this->itemMapping[$name];
     } else {
         $mapping = $name;
     }
     if (is_string($mapping)) {
         $mapping = array('selector' => $mapping);
     }

     $hasDefault = !empty($mapping['defaultValue']);
     $hasSelector = !empty($mapping['selector']);
     if (!$hasSelector && !$hasDefault) {
         throw new \RuntimeException('Missing \'selector\' or \'defaultValue\' for ' . htmlentities($name) . ' mapping');
     }

     $resolved = $hasDefault ? $mapping['defaultValue'] : '';
     if ($hasSelector) {
         if (empty($mapping['source'])) {
             // No separate source configured: query the current item directly.
             $item = $this->item;
         } else {
             // Resolve the source reference, fetch its raw content and parse it:
             // XML first, falling back to the HTML parser when that fails.
             $reference = $this->extractorService->extractValue($this->item, $mapping['source']);
             $rawContent = $this->extractorService->fetchRawContent($reference);
             try {
                 $item = qp($rawContent);
             } catch (\QueryPath\Exception $e) {
                 $item = htmlqp($rawContent);
             }
         }
         $resolved = $this->extractorService->extractValue($item, $mapping, $resolved);
     }

     $this->extractedValues[$name] = $resolved;
     return $this->extractedValues[$name];
 }
 /**
  * Collect the trimmed text of every `.table-striped td` cell found in the given HTML chunks.
  *
  * @param array $data_raw List of raw HTML strings.
  * @return array Flat list of cell texts, in document order per chunk.
  */
 protected function processRawData($data_raw)
 {
     $cellTexts = [];
     foreach ($data_raw as $markup) {
         $cells = htmlqp($markup)->find('.table-striped td')->get();
         $texts = array_map(function ($cell) {
             return trim(htmlqp($cell)->text());
         }, $cells);
         foreach ($texts as $text) {
             $cellTexts[] = $text;
         }
     }
     return $cellTexts;
 }
 /**
  * Download one article page and extract its title and first paragraph.
  *
  * The response body is converted from WINDOWS-1251 to UTF-8 for parsing, and the
  * extracted strings are converted back to WINDOWS-1251 before being returned.
  *
  * @param \GuzzleHttp\Client $http_client  Client used to fetch the article.
  * @param string             $base_uri     Site root, prefixed to relative article links.
  * @param mixed              $section      Not used by this implementation.
  * @param mixed              $section_uri  Not used by this implementation.
  * @param mixed              $full_uri     Not used by this implementation.
  * @param string             $article_info Absolute or site-relative article URL.
  *
  * @return Article
  */
 protected function parseArticle(\GuzzleHttp\Client $http_client, $base_uri, $section, $section_uri, $full_uri, $article_info)
 {
     if (!preg_match('/^https?:\\/\\//', $article_info)) {
         // Join with exactly one slash so links without a leading "/" still resolve.
         $article_info = rtrim($base_uri, '/') . '/' . ltrim($article_info, '/');
     }
     $article_html = $http_client->get($article_info)->getBody()->getContents();
     $article_html = mb_convert_encoding($article_html, 'UTF-8', 'WINDOWS-1251');
     $article_object = htmlqp($article_html, null, ['convert_to_encoding' => 'UTF-8']);
     $title = $article_object->find('div.main-container div.content article h1')->get(0);
     $text = $article_object->find('div.main-container div.content div.article-content p')->get(0);
     // A missing node yields an empty string rather than an error.
     $title_string = $title ? trim(htmlqp($title)->text()) : '';
     $text_string = $text ? trim(htmlqp($text)->text()) : '';
     $title_string = mb_convert_encoding($title_string, 'WINDOWS-1251', 'UTF-8');
     $text_string = mb_convert_encoding($text_string, 'WINDOWS-1251', 'UTF-8');
     return new Article($title_string, $text_string, $article_info);
 }
 /**
  * Rebuild the postings list from the `h2` headings of the requested page.
  *
  * Author and date are both derived from the byline text; every heading that is
  * not flagged as pseudo information becomes one posting linking to $this->source.
  */
 protected function processItems()
 {
     // Start from a clean slate so repeated calls do not accumulate postings.
     $this->SetPostingsToEmpty();

     $entirepage = htmlqp($this->GetRequestData());
     $byline = $entirepage->find('.documentBottomLine')->children('.documentByLine');
     $author = $this->getAuthor($byline->text());
     $date = $this->convertDate($byline->text());
     $link = $this->source;

     $headings = $entirepage->find('.documentContent')->find('div#bodyContent.plain')->children('h2');
     foreach ($headings as $heading) {
         if (!$this->isPseudoInfo($heading->text())) {
             $text = $this->tidyText($this->prependText($heading->text()));
             $this->AppendToPostings($date, $author, $text, $link);
         }
     }
 }
 /**
  * Build a definition map from the `.super_group` sections of a documentation page.
  *
  * Only sections whose id starts with "query-" are used. Each one contributes an
  * entry keyed by its <h2> text, holding 'fields', 'method' and 'urlPath'.
  *
  * @param string $html
  * @return array
  */
 public function extractDataFromHtml($html)
 {
     $definitions = array();
     $groups = htmlqp($html, '.super_group', array('ignore_parser_warnings' => true));
     foreach ($groups as $group) {
         // Skip every section that is not a "query-*" group.
         if (strpos($group->attr('id'), "query-") !== 0) {
             continue;
         }
         $title = $group->find('h2')->text();
         $definitions[$title] = array();
         $urlPath = $group->find('.method_details_list .url')->text();
         $method = $group->find('.method_details_list .action')->text();
         $parameterTables = $group->find('.parameters > .data_table');
         $definitions[$title]['fields'] = $this->extractFieldsFromTables($parameterTables);
         $definitions[$title]['method'] = $method;
         $definitions[$title]['urlPath'] = $urlPath;
     }
     return $definitions;
 }
 /**
  * Load the cinema page and register every listed movie, with its showtimes, on the cinema.
  *
  * @return mixed The object returned by get_cinema(), populated with movies.
  */
 public function scrape()
 {
     $cinema_url = $this->get_url();
     $cinema = $this->get_cinema();
     $html = htmlqp($cinema_url);
     // Page-level lookups; their results are not used further down.
     $theater = $html->find('h2')->text();
     $address = $html->top()->find('div.info', 0)->text();

     $listing = $html->top()->find('.movie');
     foreach ($listing as $movie) {
         $entry = new Movie();
         $entry->name = $movie->children('.name a')->text();

         $details = $movie->parent()->children('.info')->text();
         $this->set_movie_meta($details, $entry);

         // Showtimes arrive as one space-separated string.
         $timesText = $movie->parent()->find('.times')->text();
         $times = explode(" ", htmlentities($timesText));
         foreach ($times as $time) {
             $entry->set_showtime($time);
         }

         $cinema->set_movie($entry);
     }
     return $cinema;
 }
 /**
  * Turn raw proxy-list pages into a flat list of proxy descriptions.
  *
  * Each input item is a pair: [0] the list type (TYPE_HTTP or TYPE_SOCKS) and
  * [1] the page HTML. Every row of `table#proxylisttable` becomes one array with
  * the keys protocol, ip, port, country_code, anonymity, google and https.
  *
  * @param array $data_raw
  * @return array
  * @throws ProxyProviderException When an item carries an unknown list type.
  */
 protected function processRawData($data_raw)
 {
     $proxies = [];
     foreach ($data_raw as $item) {
         $kind = $item[0];
         $markup = $item[1];
         $rows = htmlqp($markup)->find('table#proxylisttable tbody tr')->get();
         foreach ($rows as $row) {
             // Trimmed text of each cell, indexed by column position.
             $cells = [];
             foreach (htmlqp($row)->find('td')->get() as $cell) {
                 $cells[] = trim(htmlqp($cell)->text());
             }

             $proxy = [
                 'protocol' => null,
                 'ip' => null,
                 'port' => null,
                 'country_code' => null,
                 'anonymity' => null,
                 'google' => null,
                 'https' => null,
             ];
             switch ($kind) {
                 case FreeProxyListNetBackend::TYPE_HTTP:
                     $proxy['protocol'] = 'http';
                     $proxy['ip'] = $cells[self::COLUMN_HTTP_IP];
                     $proxy['port'] = (int) $cells[self::COLUMN_HTTP_PORT];
                     $proxy['country_code'] = $cells[self::COLUMN_HTTP_COUNTRY_CODE];
                     $proxy['anonymity'] = $cells[self::COLUMN_HTTP_ANONYMITY];
                     $proxy['google'] = strtolower($cells[self::COLUMN_HTTP_GOOGLE]) === 'yes';
                     $proxy['https'] = strtolower($cells[self::COLUMN_HTTP_HTTPS]) === 'yes';
                     break;
                 case FreeProxyListNetBackend::TYPE_SOCKS:
                     $proxy['ip'] = $cells[self::COLUMN_SOCKS_IP];
                     $proxy['port'] = (int) $cells[self::COLUMN_SOCKS_PORT];
                     $proxy['country_code'] = $cells[self::COLUMN_SOCKS_COUNTRY_CODE];
                     $proxy['anonymity'] = $cells[self::COLUMN_SOCKS_ANONYMITY];
                     $proxy['google'] = false;
                     $proxy['https'] = strtolower($cells[self::COLUMN_SOCKS_HTTPS]) === 'yes';
                     // Anything that is not explicitly socks5 is reported as socks4.
                     if (strtolower($cells[self::COLUMN_SOCKS_VERSION]) === 'socks5') {
                         $proxy['protocol'] = 'socks5';
                     } else {
                         $proxy['protocol'] = 'socks4';
                     }
                     break;
                 default:
                     throw new ProxyProviderException('Raw data is invalid');
             }
             $proxies[] = $proxy;
         }
     }
     return $proxies;
 }
 /**
  * Rebuild the postings list from the event entries inside `#tudevent_box`.
  *
  * An entry is used only when it has exactly one date element and exactly one
  * linked-text element, and when getDates() can interpret the date.
  */
 protected function processItems()
 {
     // Start from a clean slate so repeated calls do not accumulate postings.
     $this->SetPostingsToEmpty();

     $eventBox = htmlqp($this->GetRequestData(), '#tudevent_box', $this->overrideEncoding());
     foreach ($eventBox->find('.portletContent') as $item) {
         if ($item->children('.tudeventlist-eventdate')->count() != 1) {
             continue;
         }
         if ($item->children('.tudeventlist-linkedtext')->count() != 1) {
             continue;
         }

         $rawDate = $item->children('.tudeventlist-eventdate')->text();
         $date = $this->getDates($rawDate);
         if ($date === false) {
             // Event without a usable date: skip it.
             continue;
         }

         $link = $item->children('.tudeventlist-linkedtext')->children('a')->attr('href');
         $caption = $item->children('.tudeventlist-linkedtext')->text();
         $text = $this->tidyText($this->prependText($caption));

         // TODO feature enhancement: the author could be determined by following the
         // linked calendar entries; it is not printed anyway, so a placeholder is used.
         $author = "n/a";
         $this->AppendToPostings($date, $author, $text, $link);
     }
 }
 * @author M Butcher <*****@*****.**>
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 */
/** Include QueryPath. */
require_once '../src/QueryPath/QueryPath.php';
/**
 * Filter callback: keep only nodes whose text content contains 'Release'.
 *
 * QueryPath drops a match whenever its filter callback returns FALSE, so this
 * returns TRUE when the word is present and FALSE otherwise.
 *
 * $item is wrapped in `qp()` here so that QueryPath methods can be used on it.
 */
function exampleCallback($index, $item)
{
    $content = qp($item)->text();
    if (strpos($content, 'Release') === FALSE) {
        return FALSE;
    }
    return TRUE;
}
/*
 * The QueryPath call itself.
 *
 * Fetch and parse the remote page, select the `a` tags inside the summary heading,
 * narrow them down with the callback above, then join the text of the remaining
 * matches with newlines and write it out.
 *
 * NOTE: On PHP 5.3 the callback may be written inline as a closure instead of a
 * separate named function.
 */
echo htmlqp('http://php.net/', 'h1.summary a')
    ->filterCallback('exampleCallback')
    ->textImplode(PHP_EOL);
 /**
  * Attach the article's thumbnail image to the given post.
  *
  * The first `.thumbinner img` source in the article content is used. A 640px
  * rendition is preferred when the server answers for it with a 2xx status;
  * otherwise the source exactly as it appears in the article is saved.
  *
  * @param mixed $the_post_id Identifier passed through to _save_image().
  */
 private function _attach_image($the_post_id)
 {
     // get the image source
     $image_src = htmlqp($this->_my_article->content, '.thumbinner img')->attr("src");
     if (!isset($image_src) || $image_src === '') {
         return;
     }

     $chosen_src = $image_src;
     $large_src = str_replace("220px-", '640px-', $image_src);
     if ($large_src !== $image_src) {
         // file_exists() cannot test a remote, scheme-less URL, so ask the server.
         $headers = get_headers("http:" . $large_src);
         if (is_array($headers)) {
             // After redirects the last status line describes the final response.
             $status_line = '';
             foreach ($headers as $header) {
                 if (strpos($header, 'HTTP/') === 0) {
                     $status_line = $header;
                 }
             }
             if (preg_match('#^HTTP/\S+\s+2\d\d#', $status_line) === 1) {
                 $chosen_src = $large_src;
             }
         }
     }

     // download the image and attach it to the post
     $this->_save_image("http:" . $chosen_src, $the_post_id);
 }
Exemple #12
0
 /**
  * Reduce a raw project record to the fields that are kept, rewriting its modules.
  *
  * Reads the keys id, name, description, covers, url, published_on, fields, tags
  * and modules from $data. Text modules have their markup rewritten (large spans
  * become <h2>, bold spans become <h3>) and stripped to a small tag whitelist;
  * image modules have their source passed through _cacheImage().
  *
  * @param array $data Raw project record.
  * @return array Record with the keys id, name, description, cover, url,
  *               published, fields, tags and modules.
  */
 private function _filterProject($data)
 {
     $project = false;
     // copy only the values that are kept
     $arr = array();
     $arr['id'] = $data['id'];
     $arr['name'] = $data['name'];
     $arr['description'] = $data['description'];
     // use the cover stored under the highest key of 'covers'
     $imgxl = max(array_keys($data['covers']));
     $arr['cover'] = $this->_cacheImage($data['covers'][$imgxl], $data['id'], 'covers');
     $arr['url'] = $data['url'];
     $arr['published'] = $data['published_on'];
     $arr['fields'] = $data['fields'];
     $arr['tags'] = $data['tags'];
     // QueryPath is used from here on to rewrite the module markup.
     // $index tracks the module position so changes are written back into $data['modules'].
     $index = 0;
     foreach ($data['modules'] as $module) {
         if ($module['type'] == "text") {
             $string = $module['text'];
             $qp = htmlqp($string);
             // turn the 22px-font block into an h2 element
             $div = $qp->find('span[style="font-size: 22px;"]')->parent('div');
             $color = $div->find('span[style~="color:"]')->attr('style');
             $inner = $div->text();
             if (!$inner) {
                 // nothing found: retry with the style written without a space
                 $div = $qp->find('span[style="font-size:22px;"]')->parent('div');
                 $color = $div->find('span[style~="color:"]')->attr('style');
                 $inner = $div->text();
             }
             $div->html('<h2 style="' . $color . '">' . $inner . '</h2>');
             // turn bold titles/subtitles into h3 elements
             foreach ($qp->find('span.bold')->parent('div') as $div) {
                 $newdiv = $div->find('span[style="font-size: 12px;"]')->parent('div');
                 // NOTE(review): find()/parent() appear to return a query object rather
                 // than null, so this fallback may never run - confirm against QueryPath.
                 if (is_null($newdiv)) {
                     $newdiv = $div->find('span[style="font-size:12px;"]')->parent('div');
                 }
                 if (!is_null($newdiv)) {
                     $inner = $div->text();
                     $color = $div->find('span[style~="color:"]')->attr('style');
                     $div->html('<h3 style="' . $color . '">' . $inner . '</h3>');
                 }
             }
             // keep only the whitelisted tags in the stored markup
             $markup = strip_tags($qp->html(), '<div><h2><h3><span><a><p>');
             unset($data['modules'][$index]['text_plain']);
             $data['modules'][$index]['text'] = $markup;
         } elseif ($module['type'] == "image") {
             // drop the size variants and replace the source with the _cacheImage() result
             unset($data['modules'][$index]['sizes']);
             $data['modules'][$index]['src'] = $this->_cacheImage($module['src'], $module['id'], 'images');
         }
         // next module
         $index++;
     }
     $arr['modules'] = $data['modules'];
     // array_push($project, $arr);
     $project = $arr;
     return $project;
 }
Exemple #13
0
 /**
  * Exercise htmlqp() against well-formed, broken and unusual HTML inputs.
  * @group basic
  */
 public function testDOMQueryHtmlConstructors()
 {
     // Each of these inputs must parse into exactly one DOM node.
     $documents = array(
         'HTML stub' => \QueryPath::HTML_STUB,
         'bad BR tag' => '<html><head></head><body><br></body></html>',
         'XHTML faker' => '<?xml version="1.0"?><html><head></head><body><br></body></html>',
         'HTML in a file that looks like XML' => HTML_IN_XML_FILE,
     );
     foreach ($documents as $label => $document) {
         $parsed = htmlqp($document);
         $this->assertEquals(1, count($parsed->get()), $label);
         $this->assertTrue($parsed->get(0) instanceof \DOMNode, $label);
     }

     // A DOM produced by the HTML5 parser is accepted by qp().
     $parser = new \Masterminds\HTML5();
     $html5Dom = $parser->loadHTML(\QueryPath::HTML_STUB);
     qp($html5Dom, 'html');

     // Carriage returns (#13) must not come back as entities.
     $withCr = '<html><head></head><body><p>' . chr(13) . '</p><div id="after"/></body></html>';
     $this->assertFalse(strpos(htmlqp($withCr)->html(), '&#13;'), 'Test that CRs are not encoded.');

     // Regression for #58: line feeds must not come back as &#10;.
     $withLf = '<html><head><style>
     .BlueText {
       color:red;
     }</style><body></body></html>';
     $this->assertFalse(strpos(htmlqp($withLf)->html(), '&#10;'), 'Test that LF is not encoded.');

     // A low-ASCII control character must not prevent later elements from being found.
     $withEscape = '<html><head></head><body><p>' . chr(27) . '</p><div id="after"/></body></html>';
     $this->assertEquals(1, htmlqp($withEscape, '#after')->size());
 }
/**
 * Preprocess menu blocks: wrap the content in a <nav> and expose the item count.
 *
 * A block is handled when its module is 'menu' or 'berklee_site_section', when
 * its HTML id is 'block-system-main-menu', or when its css_class contains
 * 'main-menu-block' and 'elements' is set. The number of top-level menu links is
 * added as an `item-count-N` class and an `item-count` attribute, and the block
 * subject (tags stripped, HTML-escaped) becomes the aria-label of the <nav>.
 *
 * @param array $variables Block template variables, modified in place.
 */
function jjamerson_preprocess_block(&$variables)
{
    $is_menu_block = $variables['block']->module === 'menu'
        || $variables['block_html_id'] === 'block-system-main-menu'
        || $variables['block']->module === 'berklee_site_section'
        || (isset($variables['block']->css_class)
            && strpos($variables['block']->css_class, 'main-menu-block') !== false
            && isset($variables['elements']));
    if (!$is_menu_block) {
        return;
    }

    if (isset($variables['block']->subject) && $variables['block']->subject > '') {
        // Escape quotes too: the attribute is single-quoted, so a bare apostrophe
        // in the subject would otherwise terminate it.
        $label = htmlspecialchars(strip_tags($variables['block']->subject), ENT_QUOTES, 'UTF-8');
        $aria_label = "aria-label='" . $label . "'";
    } else {
        $aria_label = '';
    }

    /* Count the menu links among the render elements, when there are any. */
    $counter = 0;
    if (isset($variables['elements']) && is_array($variables['elements'])) {
        foreach ($variables['elements'] as $element) {
            if (is_array($element) && isset($element['#original_link'])) {
                $counter++;
            }
        }
    }

    // This may be a block where the menu is already rendered in the content region. If so,
    // we parse the content region for parent-level list items.
    if ($counter === 0 && isset($variables['content']) && in_array('main-menu-block', $variables['classes_array'], true)) {
        // we'll use querypath to parse. https://www.drupal.org/project/querypath | http://querypath.org/
        if (function_exists('htmlqp')) {
            try {
                $content_qp = htmlqp($variables['content']);
                // Drop nested lists so only parent-level items remain to be counted.
                $content_qp->remove('ul ul');
                $content_qp->top('ul');
                $counter = $content_qp->find('li')->length;
            } catch (Exception $e) {
                // Best effort only: when the content cannot be parsed the block
                // simply gets no item count.
            }
        }
    }

    if ($counter > 0) {
        /* Add it both as a class and as an attribute. The attribute is easier to
           grab & work with in JS. */
        $variables['classes_array'][] = 'item-count-' . $counter;
        $variables['attributes_array']['item-count'] = $counter;
    }
    $variables['content'] = "<nav role='navigation' {$aria_label}>" . $variables['content'] . '</nav>';
}
<?php

/**
 * Urban Dictionary Random Word Generator
 *
 * 
 * @author Emily Brand
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 * @see http://www.urbandictionary.com/
 */
require_once '../src/QueryPath/QueryPath.php';
echo '<h3>Urban Dictionary Random Word Generator</h3>';
// Pick a random listing page, then a random entry position on that page.
$page = rand(0, 288);
$listingUrl = 'http://www.urbandictionary.com/?page=' . $page;
$qp = htmlqp($listingUrl, '#home');
$rand = rand(0, 7);
// Word first, then the definition found at the same position.
$word = $qp->find('.word')->eq($rand)->text();
echo $word . '<br />';
$definition = $qp->top()->find('.definition')->eq($rand)->text();
echo $definition;
Exemple #16
0
 /**
  * Scrape basic metadata (title, image, publication date, author) for a URL.
  *
  * Open Graph / Twitter meta tags are preferred; the <title>, <time> and
  * author-classed elements serve as fallbacks. A discovered image is copied into
  * the local image cache and its cache URL is reported instead of the remote one.
  *
  * @param string $url Address of the page to load.
  *
  * @return array Contains 'url' and 'title'; 'image', 'date_published' and
  *               'author' are present only when they could be determined.
  */
 static function get_post($url)
 {
     require_once APPPATH . 'third_party/querypath-3.0.4/src/qp.php';
     libxml_use_internal_errors(true);
     $qp = htmlqp($url);
     $data = array('url' => $url);
     //title
     $title = $qp->find("meta[property='og:title']");
     if ($title->count()) {
         $data['title'] = $title->attr('content');
     } else {
         $title = $qp->find("meta[property='twitter:title']");
         if ($title->count()) {
             $data['title'] = $title->attr('content');
         } else {
             $data['title'] = $qp->find("title")->text();
         }
     }
     //image
     $image = $qp->find("meta[property='og:image']");
     if ($image->count()) {
         $data['image'] = $image->attr('content');
     } else {
         $image = $qp->find("meta[property='twitter:image:src']");
         if ($image->count()) {
             $data['image'] = $image->attr('content');
         }
     }
     //save images
     // 'image' is absent when neither meta tag exists, so test with empty().
     if (!empty($data['image'])) {
         $original_url = $data['image'];
         // Random 12-character name, sharded into two directory levels.
         $md5 = substr(md5($original_url . mt_rand()), 0, 12);
         $local_dir = substr($md5, 0, 2) . DIRECTORY_SEPARATOR . substr($md5, 2, 2) . DIRECTORY_SEPARATOR;
         $local = $local_dir . substr($md5, 4);
         $public_path = substr($md5, 0, 2) . '/' . substr($md5, 2, 2) . '/' . substr($md5, 4);
         // Take the file extension from the URL path, ignoring any query string.
         $path = $original_url;
         $qpos = strpos($path, "?");
         if ($qpos !== false) {
             $path = substr($path, 0, $qpos);
         }
         $extension = pathinfo($path, PATHINFO_EXTENSION);
         if ($extension != "") {
             $local .= "." . $extension;
             $public_path .= "." . $extension;
         }
         $local = FCPATH . "images" . DIRECTORY_SEPARATOR . "cache" . DIRECTORY_SEPARATOR . $local;
         $local_dir = FCPATH . "images" . DIRECTORY_SEPARATOR . "cache" . DIRECTORY_SEPARATOR . $local_dir;
         $public_path = "/images/cache/" . $public_path;
         if (!is_dir($local_dir)) {
             mkdir($local_dir, 0777, TRUE);
         }
         copy($original_url, $local);
         if (file_exists($local)) {
             $data['image'] = $public_path;
         } else {
             // The copy failed: report no image rather than a remote URL.
             unset($data['image']);
         }
     }
     //Date
     $date = $qp->find("meta[property='article:published_time']");
     // A single published_time tag is the normal case and must be honoured.
     if ($date->count() > 0) {
         $data['date_published'] = $date->attr('content');
     } else {
         // Otherwise use the first selector that matches exactly one <time> element.
         foreach (array('time', 'article time', 'article header time') as $selector) {
             $date = $qp->find($selector);
             if ($date->count() == 1) {
                 if ($date->attr('datetime')) {
                     $data['date_published'] = $date->attr('datetime');
                 } else {
                     $data['date_published'] = $date->text();
                 }
                 break;
             }
         }
     }
     //Author
     $author = $qp->find(".author");
     if ($author->count() == 1) {
         $data['author'] = $author->text();
     }
     if (!isset($data['author'])) {
         $author = $qp->find("meta[name=author]");
         if ($author->count() == 1) {
             $data['author'] = $author->attr('content');
         }
     }
     if (!isset($data['author'])) {
         // Last resort: the first class attribute containing "author" that occurs
         // exactly once on the page.
         $author = $qp->find("*[class*='author']");
         $classes = [];
         foreach ($author->get() as $el) {
             $classes[] = $el->getAttribute('class');
         }
         $classes = array_count_values(array_map('strtolower', $classes));
         foreach ($classes as $class => $count) {
             if ($count == 1) {
                 $data['author'] = $qp->find('.' . str_replace(' ', '.', $class))->text();
                 break;
             }
         }
     }
     return $data;
 }
 /**
  * Parse a string into a DOMQuery: as XML first, as HTML when that fails.
  *
  * @param string $string
  * @return \QueryPath\DOMQuery
  */
 public function stringToDOMQuery($string)
 {
     try {
         return qp($string);
     } catch (\QueryPath\Exception $e) {
         // Strict parsing rejected the input; use the HTML parser instead.
         return htmlqp($string);
     }
 }
<?php

/**
 * Basic example of QueryPath usage.
 *
 * This two-line example exhibits basic use of QueryPath. It creates a new 
 * HTML document and adds the typical 'Hello World' text to the body. It then writes
 * that information to standard out (which is flushed to a web browser in most cases.)
 *
 * The important methods covered here are {@link qp()}, which is the {@link QueryPath}
 * factory function, {@link QueryPath::find()}, which is the primary searching 
 * function, and {@link QueryPath::writeHTML()}, which is a utility function.
 *
 * This file is fully explained in the official QueryPath tutorial, located 
 * at {@link https://fedorahosted.org/querypath/wiki/QueryPathTutorial}
 *
 * 
 * @author M Butcher <*****@*****.**>
 * @license LGPL The GNU Lesser GPL (LGPL) or an MIT-like license.
 * @see qp()
 * @see QueryPath::find()
 * @see QueryPath::writeHTML()
 * @see html.php
 * @see https://fedorahosted.org/querypath/wiki/QueryPathTutorial The Official Tutorial
 */
require_once '../src/QueryPath/QueryPath.php';
// Build the stub document, put the greeting into <body> and write it out.
$hello = qp(QueryPath::HTML_STUB);
$hello->find('body')->text('Hello World')->writeHTML();

// Second document: append some markup to <body>, then add a paragraph after each <p>.
$qp = htmlqp(QueryPath::HTML_STUB, 'body');
$appended = $qp->append('<div></div><p id="cool">Hello</p><p id="notcool">Goodbye</p>');
$appended->children('p')->after('<p id="new">new paragraph</p>');

// Report whether any markup comes back for <p> children of <p> elements.
$nestedMarkup = $qp->find('p')->children('p')->html();
if ($nestedMarkup) {
    echo 'print';
} else {
    echo 'dont print';
}
    //    }, $url);
    $url = urlencode($url);
}
// 測試檔案
//$url = "query_test/found_book_link.html";
//$url = "query_test/found_book_available.html";
//$url = "query_test/found_book_multi_available.html";
//$url = "query_test/isbn_not_found.html";
//$url = "query_test/found_book_not_available.html";
//echo $url;
//$content = file_get_contents($url);
//echo $content;
//exit();
// --------------------------------------------------
require 'lib/querypath/src/qp.php';
$qp = htmlqp($url);
//echo $url;
//echo $qp->html();
if ($qp->find('.msg td:contains("無查獲符合查詢條件的館藏;相近 國際標準號碼 是:")')->size() > 0 || $qp->find('.msg td:contains("無查獲符合的,可用相近 國際標準號碼 的是:")')->size() > 0) {
    // ---------------------------------------------
    // isbn_not_found
    // ---------------------------------------------
    $data = array("error" => "NOT_FOUND");
} else {
    if ($qp->find('.bibItemsEntry td:contains("可流通")')->size() === 0) {
        // ---------------------------------------------
        // found_book_not_available
        // ---------------------------------------------
        $full_title = $qp->find('.bibInfoLabel:contains("題名/作者")')->eq(0)->next()->find("strong:first")->text();
        $title = substr($full_title, 0, strpos($full_title, " / "));
        $title = trim($title);
Exemple #20
0
<?php

/**
 * Helper for bracket.php. Pulls the "official" challonge bracket, removes the stuff we don't need
 * and styles it to match our theme.
 *
 * @license http://www.gnu.org/licenses/gpl-3.0.txt GNU General Public License 3
 * @author Sylae Jiendra Corell <*****@*****.**>
 */
require_once 'config.php';
require 'qp.php';
// don't fall for that 2.x crap.
// Fetch the hosted bracket module for the configured tournament.
$bracketUrl = "http://challonge.com/" . $config['challonge_id'] . "/module?theme=2&&match_width_multiplier=0.8";
$data = file_get_contents($bracketUrl);

// Remove the parts that are not wanted.
$info = htmlqp($data)->remove("script, #challonge_promo, .live_stamp");

// writeHTML() prints directly, so capture its output in a buffer.
ob_start();
$info->writeHTML();
$html = ob_get_clean();

// Inject the local stylesheet and simplify the match identifier links.
$search = array("</head>", '<a class="btn btn-link match_identifier dropdown-toggle">');
$replacement = array('<link rel="stylesheet" href="css/bracket_over.css" type="text/css" /></head>', '<a class="match_identifier">');
echo str_replace($search, $replacement, $html);
Exemple #21
0
 /**
  * Download one article page and extract its heading and first paragraph.
  *
  * Links that are not absolute http(s) URLs have their '/pda/' segment replaced
  * by $full_uri before the request is made.
  *
  * @return Article
  */
 protected function parseArticle(\GuzzleHttp\Client $http_client, $base_uri, $section, $section_uri, $full_uri, $article_info)
 {
     $is_absolute = preg_match('/^https?:\\/\\//', $article_info);
     if (!$is_absolute) {
         $article_info = str_replace('/pda/', $full_uri, $article_info);
     }

     $response = $http_client->get($article_info);
     $article_html = $response->getBody()->getContents();
     $article_object = htmlqp($article_html, null, ['convert_to_encoding' => 'UTF-8']);

     $title = $article_object->find('div#maincontent div.body div.h h2')->get(0);
     $text = $article_object->find('div#maincontent div.body p')->get(0);

     // Missing nodes translate to empty strings.
     $title_string = '';
     if ($title) {
         $title_string = trim(htmlqp($title)->text());
     }
     $text_string = '';
     if ($text) {
         $text_string = trim(htmlqp($text)->text());
     }

     return new Article($title_string, $text_string, $article_info);
 }
<!DOCTYPE html>
<html>
    <head>
        <meta charset="UTF-8">
        <title>Web Crawler</title>
    </head>
    <body>
        <h1>Web Crawler Project - 1</h1>
        <?php 
include __DIR__ . '/qp/qp.php';
$initial_url = 'https://pro.beatport.com/genre/deep-house/12/tracks';
$content = file_get_contents($initial_url);
echo '<pre>';
if ($content === false) {
    // The download failed: say so instead of parsing a boolean.
    echo 'Unable to fetch ' . htmlspecialchars($initial_url, ENT_QUOTES, 'UTF-8') . "\r\n";
} else {
    //load qp with content fetched and initialise from body tag
    $qp = htmlqp($content, 'body');
    if ($qp->length > 0) {
        //we have some data to parse.
        $tracks = $qp->find('.track');
        foreach ($tracks as $track) {
            $title = $track->find('.buk-track-primary-title')->first()->text();
            // The title comes from a remote page, so escape it before output.
            echo 'Track Found:' . htmlspecialchars($title, ENT_QUOTES, 'UTF-8') . "\r\n";
        }
    }
}
echo '</pre>';
?>
    </body>
</html>
require __DIR__ . '/spotify-web-api-php-master/src/Session.php';
require __DIR__ . '/spotify-web-api-php-master/src/SpotifyWebAPIException.php';
$api = new SpotifyWebAPI\SpotifyWebAPI();
$pages = array('https://pro.beatport.com/genre/deep-house/12/tracks');
$done = array();
$final_page = isset($_GET['pages']) ? $_GET['pages'] : 1;
//echo '<pre>';
$ipCount = 0;
while ($pages) {
    set_time_limit(0);
    $link = array_shift($pages);
    $done[] = $link;
    //$content = file_get_contents($link);
    $content = getFile($link);
    //load qp with content fetched and initialise from body tag
    $htmlqp = @htmlqp($content, 'body');
    if ($htmlqp->length > 0) {
        //we have some data to parse.
        $tracks = $htmlqp->find('.track');
        foreach ($tracks as $track) {
            $title = $track->find('.buk-track-primary-title')->first()->text();
            $artist = $track->find('.buk-track-artists > a')->first()->text();
            $link_to_track = 'https://pro.beatport.com' . $track->find('.buk-track-title > a')->first()->attr('href');
            //CHECK IF ARTIST ALREADY EXIST IN DATABASE, PRIOR TO SEARCHING ON SPOTIFY
            $artist_spotify_id = $db->querySingle("select Artist_spotify_id from artist where Artist_name='" . SQLite3::escapeString($artist) . "'");
            //Like msq_realescape.
            if (!$artist_spotify_id) {
                // If not in database -- get id via Spotify API.
                $spotify_artist = $api->search($artist, 'artist');
                //Geting artist id via Spotify Api
                foreach ($spotify_artist->artists->items as $spotify_id) {
Exemple #24
0
/**
 * Fetch a page, make its asset URLs absolute, route its links through
 * withbaconfy.com, hand every h1/h2/h3 to inBedElement() and print the result.
 *
 * On success the rewritten HTML is printed and the script terminates via
 * exit. The function only returns (NULL) when the page could not be loaded.
 *
 * @param string $url
 *   Address of the page to load. Its scheme and host form the base that
 *   relative references are resolved against.
 */
function inBedify($url) {
  // require_once: a plain require would re-run the library file (and try to
  // re-declare its symbols) if this function is entered a second time.
  require_once 'QueryPath/QueryPath.php';
  $page = htmlqp($url);
  if (!$page) {
    return;
  }
  $url_parts = parse_url($url);
  // Check response code @todo
  $base = $url_parts['scheme'] . '://' . $url_parts['host'];

  // TRUE when a reference must not be prefixed with $base: it either carries
  // its own scheme (http:, https:, data:, mailto:, ...) or is protocol-relative
  // (//host/path). Testing the prefix avoids misreading a relative URL that
  // merely contains "http" somewhere, e.g. in a query string.
  $is_absolute = function ($value) {
    return (bool) preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $value);
  };

  // Convert relative URLs to absolute.
  $asset_attributes = array('link' => 'href', 'script' => 'src', 'img' => 'src');
  foreach ($asset_attributes as $tag => $attribute) {
    foreach (qp($page, $tag) as $element) {
      // Elements lacking the attribute (inline scripts, src-less images) are
      // left alone instead of being given a bogus one.
      if ($element->hasAttr($attribute) && !$is_absolute($element->attr($attribute))) {
        $element->attr($attribute, $base . '/' . ltrim($element->attr($attribute), '/'));
      }
    }
  }
  foreach (qp($page, 'style') as $style) {
    if (preg_match('/@import ?["\']\/(.*)/', $style->text(), $matches) && count($matches) > 1) {
      $style->text('@import "' . $base . '/' . $matches[1]);
    }
  }

  // Rewrite same-domain URLs to run through InBedify.
  foreach (qp($page, 'a') as $a) {
    // Anchors without an href have nothing to rewrite.
    if (!$a->hasAttr('href')) {
      continue;
    }
    $href = $a->attr('href');
    if ($is_absolute($href)) {
      // Only rewrite same-domain URLs. Links with no host (mailto:, javascript:)
      // never match and therefore stay untouched.
      $host = parse_url($href, PHP_URL_HOST);
      if ($host == $url_parts['host']) { //@todo
        // A protocol-relative link borrows the scheme of the page itself.
        $target = strpos($href, '//') === 0 ? $url_parts['scheme'] . ':' . $href : $href;
        $a->attr('href', 'http://withbaconfy.com/' . $target);
      }
    }
    else {
      // Relative URL.
      $a->attr('href', 'http://withbaconfy.com/' . $base . '/' . ltrim($href, '/'));
    }
  }

  // InBedify!
  // Speed this up @todo
  foreach (array('h1', 'h2', 'h3') as $heading_tag) {
    foreach (qp($page, $heading_tag) as $header) {
      inBedElement($header);
    }
  }

  print $page->html();
  exit;
}
Exemple #25
0
 /**
  * Downloads a page and returns it as a QueryPath object.
  *
  * When the "debug" config flag is set, the downloaded HTML is stored in the
  * configured tmp directory under the md5 of the URL and reused on later calls.
  *
  * @param string $url     Address of the page to download.
  * @param array  $options Presence of the "onlyBody" key keeps only the markup
  *                        between <body> and </body>; presence of the
  *                        "encoding" key runs the HTML through fixEncoding().
  *
  * @return QueryPath
  */
 function getContent($url, $options = [])
 {
     if (App::config("debug")) {
         $fileCache = App::config("tmp") . md5($url);
         if (file_exists($fileCache)) {
             $html = file_get_contents($fileCache);
             if (isset($options["encoding"])) {
                 $html = $this->fixEncoding($html);
             }
             return htmlqp($html);
         }
     }
     $html = Utils::curl($url);
     // lower string size
     if (isset($options["onlyBody"])) {
         // strpos() takes the haystack first. With the arguments swapped the
         // test looked for the whole document inside '<body>' and never matched.
         if (strpos($html, '<body>') !== false) {
             $tmp = explode('<body>', $html);
             $tmp = explode('</body>', $tmp[1]);
             $html = $tmp[0];
         }
     }
     if (App::config("debug")) {
         // The cache stores the (possibly trimmed) HTML before encoding fixes,
         // which is why the cached branch above applies fixEncoding() itself.
         file_put_contents($fileCache, $html);
     }
     if (isset($options["encoding"])) {
         $html = $this->fixEncoding($html);
     }
     return htmlqp($html);
 }
Exemple #26
0
 /**
  * Rebuilds the postings list from the requested page.
  *
  * Author and date are derived from the text of the byline element. When no
  * heading override is set ($this->heading === false) one posting is added per
  * h1.documentFirstHeading element; otherwise a single posting is added from
  * the override.
  */
 protected function processItems()
 {
     // Start from an empty list so repeated calls never append to stale data.
     $this->SetPostingsToEmpty();

     $document = htmlqp($this->GetRequestData());
     $byline = $document->find('.documentBottomLine')->children('.documentByLine');
     $bylineText = $byline->text();
     $postedBy = $this->getAuthor($bylineText);
     $postedOn = $this->convertDate($bylineText);
     $permalink = $this->source;

     // A heading override replaces whatever headings the page contains.
     if ($this->heading !== false) {
         $this->AppendToPostings($postedOn, $postedBy, $this->prependText($this->heading), $permalink);
         return;
     }

     foreach ($document->find('h1.documentFirstHeading') as $headingNode) {
         $body = $this->tidyText($this->prependText($headingNode->text()));
         $this->AppendToPostings($postedOn, $postedBy, $body, $permalink);
     }
 }
Exemple #27
0
 /**
  * Downloads the page at the given URL and parses it as HTML.
  *
  * @param string $URL Address of the page to download.
  *
  * @return QueryPath\DOMQuery The parsed document.
  *
  * @throws Exception When the download yields nothing or the markup cannot be parsed.
  */
 protected function GetDocumentFromURL(string $URL) : QueryPath\DOMQuery
 {
     $this->PrintLine(">> Fetching {$URL}");

     // An empty or failed download is treated as fatal for this URL.
     $Markup = file_get_contents($URL);
     if (empty($Markup)) {
         throw new Exception("unable to fetch {$URL}");
     }

     // @ silences any warnings the parser raises; only the result is checked.
     $Parsed = @htmlqp($Markup);
     if ($Parsed) {
         return $Parsed;
     }

     throw new Exception("unable to parse {$URL}");
 }
Exemple #28
0
 /**
  * JSON endpoint that inspects a remote page and suggests tracking settings.
  *
  * Reads the "url" GET parameter, loads that page and reports:
  *  - ga:           1 when analytics.js is referenced, 2 when ga.js is referenced
  *  - author_text / author_class: author value and the selector it was found with
  *  - url_text / url_option:      canonical URL and where it came from
  *                                (1 = og:url meta, 2 = canonical link, 3 = the requested URL)
  * or an "error" entry when the page could not be retrieved.
  *
  * Responds with HTTP 401 and no body when no user/company is set on the controller.
  */
 function get_url_suggestions()
 {
     if (!isset($this->user) || !isset($this->user_company)) {
         set_status_header(401);
         return;
     }
     require_once APPPATH . 'third_party/querypath-3.0.4/src/qp.php';
     $data = [];
     $url = $this->input->get("url");
     $qp = null;
     // The URL comes straight from the request and htmlqp() will happily open
     // local paths and other stream wrappers, so only plain web addresses are
     // let through.
     // NOTE(review): this does not stop requests to internal hosts; add a host
     // allow/deny list if this endpoint is reachable by untrusted users.
     if (is_string($url) && $url !== ''
         && in_array(strtolower((string) parse_url($url, PHP_URL_SCHEME)), ['http', 'https'], true)) {
         libxml_use_internal_errors(true);
         try {
             $qp = htmlqp($url);
         } catch (\Exception $e) {
             // Unreachable or unparsable page: report it through the JSON
             // error below instead of letting the request die.
             $qp = null;
         }
     }
     if ($qp) {
         //Check GA
         $html = strtolower($qp->html());
         if (strpos($html, 'www.google-analytics.com/analytics.js') !== FALSE) {
             $data['ga'] = 1;
         } elseif (strpos($html, 'www.google-analytics.com/ga.js') !== FALSE) {
             $data['ga'] = 2;
         }
         // Check Author: first a unique .author element...
         $author1 = $qp->find(".author");
         if ($author1->count() == 1) {
             $data['author_text'] = $author1->text();
             $data['author_class'] = '.author';
         }
         // ...then a unique author meta tag...
         if (!isset($data['author_class'])) {
             $author2 = $qp->find("meta[name=author]");
             if ($author2->count() == 1) {
                 $data['author_text'] = $author2->attr('content');
                 $data['author_class'] = 'meta[name=author]';
             }
         }
         // ...and finally the first class attribute containing "author" that
         // occurs exactly once in the document.
         if (!isset($data['author_class'])) {
             $author3 = $qp->find("*[class*='author']");
             $classes = [];
             foreach ($author3->get() as $el) {
                 $classes[] = $el->getAttribute('class');
             }
             $classes = array_count_values(array_map('strtolower', $classes));
             foreach ($classes as $class => $count) {
                 if ($count == 1) {
                     // Split on any run of whitespace so repeated or padding
                     // spaces in the attribute cannot produce an invalid
                     // selector such as ".a..b".
                     $selector = '.' . implode('.', preg_split('/\s+/', trim((string) $class)));
                     $data['author_text'] = $qp->find($selector)->text();
                     $data['author_class'] = $selector;
                     break;
                 }
             }
         }
         //check URL
         $url1 = $qp->find("meta[property='og:url']");
         if ($url1->count() >= 1) {
             $data['url_text'] = $url1->attr('content');
             $data['url_option'] = 1;
         }
         if (!isset($data['url_option'])) {
             $url2 = $qp->find("link[rel='canonical']");
             if ($url2->count() >= 1) {
                 $data['url_text'] = $url2->attr('href');
                 $data['url_option'] = 2;
             }
         }
         if (!isset($data['url_option'])) {
             $data['url_text'] = $url;
             $data['url_option'] = 3;
         }
     } else {
         $data["error"] = "We couldn't retrieve the page.";
     }
     $this->output->set_content_type('application/json')->set_output(json_encode($data));
 }
Exemple #29
0
<?php

#require_once APPPATH."/third_party/querypath-2.1.2/QueryPath/QueryPath.php";
require_once APPPATH . "/libraries/QueryPath2.php";
#qp('http://127.0.0.1:8090/common/main/sndmail_00700211')->find('test1')->text('Hello World')->writeHTML();
/**
 * Echo a print_r() dump of the given value wrapped in <xmp> tags.
 *
 * @param mixed $arr Value to dump.
 */
function xmp_print($arr)
{
    // Render the dump to a string first so the whole block goes out in one echo.
    $dump = print_r($arr, true);
    echo '<xmp>' . $dump . '</xmp>';
}
// Sample document with non-ASCII (Korean) text used to exercise the parser.
$html = '<!DOCTYPE html>
<html>
        <head>
                <title>예제</title>
        </head>
        <body>
                <p class="a" id="test1">다람쥐 헌 쳇바퀴<br>타고파.</p>
                <p class="a" id="test2">다람쥐가노래를한<b>다</b>
                람쥐.</p>
                <p>다람쥐</p>
        </body>
</html>';
// Parse from <body> with the convert_to_encoding option set to utf-8 and keep
// only the direct <p class="a"> children.
$options = array('convert_to_encoding' => 'utf-8');
$children = htmlqp($html, 'body', $options)->children('p.a');
// Dump every matched paragraph.
foreach ($children as $child) {
    xmp_print($child);
}
Exemple #30
0
/**
 * Given a QTI XML file, extract questions.
 *
 * Every <item> element becomes one entry in the returned list.
 *
 * @param mixed $file
 *   Passed straight to qp() as the document to query.
 *
 * @return array
 *   List of questions. Each question is an array with the keys 'title',
 *   'type', 'content', 'answers' and 'num_of_correct_answers'; each answer is
 *   an array with the keys 'text', 'index', 'is_correct', 'feedback' and
 *   'weight'.
 */
function qti_extract_info($file) {

	$items = array();
	foreach(qp($file, 'item') as $item) {

		// Text of the setvar that belongs to the respcondition whose conditionvar
		// contains a <not><varequal>; used below as the weight of answers that
		// have no explicit weight.
		$negative_score = $item->branch()->xpath('/questestinterop/item/resprocessing/respcondition/conditionvar/not/varequal/../../../setvar')->text();

		// Feedback for incorrect answers: take the displayfeedback linkrefid next
		// to a setvar whose value is <= 0 ...
		$feedback_incorrect_linkrefid = $item->branch()->xpath('/questestinterop/item/resprocessing/respcondition/setvar[text()<=0]/../displayfeedback/@linkrefid')->text();

		// ... and resolve it to the material of the itemfeedback with that ident.
		// NOTE(review): node_to_text() is defined outside this excerpt.
		$feedback_incorrect =  node_to_text($item->branch()->xpath('/questestinterop/item/itemfeedback[@ident="' . $feedback_incorrect_linkrefid . '"]/material'), true);

		$title = $item->attr('title');
		$type = $item->find('itemmetadata>qmd_itemtype')->text();

		// end() steps back from the find() above before the question body is looked up.
		$body = $item->end()->find('presentation>material');

		if ($body->attr('texttype') == 'text/html') {

			$bodytext = $body->text();
			
			// Wrap fragments that lack an <html> tag before parsing them.
			if (strpos($bodytext, '<html') === FALSE) {
				$bodytext =  '<html>' . $bodytext . '</html>';
			}

			$doc = new DOMDocument();
			// Suppress any warnings raised by loadHTML().
			@$doc->loadHTML($bodytext);
			$html = htmlqp($doc, 'body');
			// Child nodes of the parsed <body>, so inline markup (e.g. emphasized text) is included.
			$contents = $html->get(0)->childNodes;
			// Extract HTML content
			$newdoc = qp();
			$i = 0;
			// NOTE(review): $node is never used; the whole $contents node list is
			// passed to append() on every iteration. Confirm whether append($node)
			// was intended.
			while ($node = $contents->item($i++)) {
				$newdoc->append($contents);
			}
			$out = strip_tags($newdoc->html()); // This leaves off XML declaration.
		}
		else {
			//$out = strip_tags($body->text());
			$out = node_to_text($body, false);
		}
		
		$new_item = array(
			'title' => $title,
			'type' => $type,
			'content' => $out,
			'answers' => array()
			);
		
		$answers = array();
		// Get all answers and loop through them.
		$answerstexts = $item->parent('item')->find('response_lid>render_choice>response_label>material>mattext');
		$num_of_correct_answers = 0;
		$answers = array();

		foreach ($answerstexts as $answertext) {

			$text = $answertext->text();
			// The ident of the enclosing response_label identifies this answer.
			$index = $answertext->parent('response_label')->attr('ident');
			// Selects the varequal condition whose text contains this answer's ident.
			$filter_weight = 'resprocessing>respcondition>conditionvar>varequal:contains(' . $index . ')';
			// Weight: text of the setvar in the respcondition matched by that condition.
			$weight = $answertext->parent('item')->find($filter_weight)->parent('respcondition')->find('setvar')->text();    
			// Feedback reference: linkrefid of the displayfeedback in the same respcondition.
			$index_feedback = $answertext->end()->parent('item')->find($filter_weight)->parent('respcondition')->find('displayfeedback')->attr('linkrefid');  
			$filter_feedback = '//itemfeedback[@ident="' . $index_feedback . '"]';
			$feedback = "";
			$feedback = $answertext->end()->parent('item')->xpath($filter_feedback)->text();

			// Answers without an explicit weight fall back to the generic
			// incorrect-answer feedback and the negative score gathered above.
			if($weight == '') {
				$feedback = $feedback_incorrect;
				$weight = $negative_score;
			}

			// NOTE(review): $is_correct is only assigned here; the answer array
			// below evaluates $weight>0 again instead of reading it.
			$is_correct = false;

			if($weight > 0) {
				$is_correct = true;
				$num_of_correct_answers++;
			}

			$answers[] = array(
				'text' => $text,
				'index' => $index,
				'is_correct' => $weight>0,
				'feedback' => $feedback,
				'weight' => $weight
				);

			// Store the answers collected so far (reassigned on every iteration).
			$new_item['answers'] = $answers;
		}
		$new_item['num_of_correct_answers'] = $num_of_correct_answers;

		// Store questions
		$items[] = $new_item;
	}
	return $items;
}