find() публичный Метод

Paperg: the optional $lowercase argument lets the caller request case-insensitive matching of the selector's value.
public find ( $selector, $idx = null, $lowercase = false )
Пример #1
0
/**
 * Scrape one product-listing page, append a CSV row per product and then
 * follow the page's "next" link recursively.
 *
 * Each row written to the global file handle $o is
 * [product name, type flag, category, product URL]; the type flag is 1 when
 * the item contains a 'p.minimal-price' element and 0 otherwise.
 *
 * @param string $u   URL of the listing page to scrape.
 * @param mixed  $cat Category value copied verbatim into every row.
 */
function getProducts($u, $cat)
{
    global $o;

    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($u));

    $gridItems = $page->find('li.grid-item');
    if (count($gridItems) <= 0) {
        // Nothing listed on this page: no rows and no pagination to follow.
        return;
    }

    foreach ($gridItems as $gridItem) {
        $link = $gridItem->find('p.product-name > a', 0);
        $name = trim($link->innertext);
        $typeFlag = is_null($gridItem->find('p.minimal-price', 0)) ? 0 : 1;

        fputcsv($o, array($name, $typeFlag, $cat, $link->href));
        echo $name . "\n";
    }

    // Pagination: recurse into the next page while one is advertised.
    $next = $page->find('p.next', 0);
    if (!is_null($next)) {
        getProducts($next->href, $cat);
    }
}
Пример #2
0
 /**
  * Extract image, title, price and pagination data from a search-result page.
  *
  * Every 'li[class=s-result-item]' element that yields at least one field
  * becomes one entry of the result, keyed by the element's position in the
  * find() result. The result is also stored in $this->aResult.
  *
  * @param string $scrappedData Raw HTML of the result page.
  * @return array Map of item index => array with any of the keys
  *               'image', 'title', 'price', 'numPrice', 'totalPage'.
  */
 private function parsing($scrappedData)
 {
     $result = [];
     $html = new simple_html_dom();
     $html->load($scrappedData);

     // The disabled pagination label belongs to the page, not to an item,
     // so look it up once instead of once per result item.
     $totalPage = null;
     foreach ($html->find('span[class=pagnDisabled]') as $maxPage) {
         $totalPage = $maxPage->innertext();
     }

     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         // Start from a clean slate so fields missing on this item are not
         // silently inherited from the previous one.
         $atmp = [];

         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         foreach ($innerData->find('span[class=s-price]') as $priceNode) {
             $priceText = $priceNode->innertext();
             $atmp['price'] = $priceText;
             // Drop the leading currency character and thousands separators.
             $atmp['numPrice'] = str_replace(",", '', substr($priceText, 1));
         }
         if ($totalPage !== null) {
             $atmp['totalPage'] = $totalPage;
         }

         if ($atmp !== []) {
             $result[$key] = $atmp;
         }
     }

     return $this->aResult = $result;
 }
/**
 * Scrape a club page and save one attendance record per season.
 *
 * The club name is read from the third row of the first table, appended to
 * the global 'clubs' list and echoed. Every row of the third table whose
 * first cell is numeric is saved through scraperwiki::save() with the unique
 * key (club, year).
 *
 * @param string $url URL of the club page.
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    $clubName = trim(str_replace('&nbsp;', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));

    // $GLOBALS is the real superglobal; the former `$_GLOBAL` was just a
    // local variable, so the club list was discarded on return.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    foreach ($dom->find('table', 2)->find('tr') as $row) {
        $yearCell = $row->find('td', 0);
        // Rows without any <td> or with a non-numeric first cell carry no
        // season data.
        if ($yearCell === null || !is_numeric($yearCell->plaintext)) {
            continue;
        }

        $year = trim($yearCell->plaintext);
        $position = trim(str_replace('&nbsp;', '', $row->find('td', 1)->plaintext));
        if ($position === 'Champion') {
            $position = 1;
        }
        $leagueLevel = trim($row->find('td', 2)->plaintext);
        $overallPosition = trim($row->find('td', 3)->plaintext);
        // Dots are stripped from the attendance figures before saving.
        $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
        $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));

        $dataset = array(
            'club' => $formatClubName,
            'year' => $year,
            'finishedPosition' => $position,
            'league' => $leagueLevel,
            'overallPosition' => $overallPosition,
            'avgAttendance' => $avgAttendance,
            'totalAttendance' => $totalAttendance,
        );
        scraperwiki::save(array('club', 'year'), $dataset);
    }

    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
/**
 * Scrape one musiklegal.com result page, save every song row and continue
 * with the page referenced by the "Next" link.
 *
 * Terminates the script with exit when there is no further page.
 *
 * @param int|string $q_num Offset appended to the search-result URL.
 */
function run_ml($q_num = 0)
{
    $html = scraperWiki::scrape("http://musiklegal.com/search/result/a/" . $q_num);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find("tr") as $data) {
        $tds = $data->find("td");
        // Rows with fewer than four cells cannot yield No/Code/Artist/Album
        // and would only store an empty record.
        if (count($tds) < 4) {
            continue;
        }

        // NOTE(review): the two search strings below look like mangled
        // '<a href=...' / '</a>' literals - confirm against the upstream script.
        $temp_data = explode('">', str_replace('</<strong>a</strong>>', '', str_replace('<<strong>a</strong> href="http://musiklegal.com/song/detail/', '', $tds[1]->plaintext)));
        $record = array(
            'No' => str_replace('.', '', $tds[0]->plaintext),
            'Code' => $temp_data[0],
            'Song Title' => isset($temp_data[1]) ? $temp_data[1] : null,
            'Artist' => $tds[2]->plaintext,
            'Album' => $tds[3]->plaintext,
        );
        /*
         *  Stores results
         */
        scraperwiki::save_sqlite(array("No"), $record);
    }

    // Default to "no next page" so the check below never reads an undefined
    // variable. As before, the last "Next" link on the page wins.
    $tmp_a = 0;
    foreach ($dom->find("a") as $a) {
        if ($a->plaintext == 'Next') {
            $tmp_a = str_replace('http://musiklegal.com/search/result/a/', '', $a->href);
        }
    }

    // Release the parsed page before recursing; the recursion depth grows
    // with the number of result pages.
    $dom->clear();
    unset($dom);

    if ((int) $tmp_a != 0) {
        run_ml($tmp_a);
    } else {
        exit;
    }
}
Пример #5
0
/**
 * Recursively walk a category tree and record every sub-category found in
 * the page's refinement container.
 *
 * For each sub-category the name, breadcrumb path and absolute URL are
 * written either to the CSV handle $f (when the global $local is truthy) or
 * to the ScraperWiki SQLite store keyed by URL, then the sub-category page is
 * scraped in turn.
 *
 * @param string $u URL of the category page to scrape.
 */
function getCategories($u)
{
    // $local was previously read without being imported, so it was always
    // undefined here and the CSV branch could never run.
    global $baseurl, $f, $local;

    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";

    $facets = $d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0);
    if (!$facets) {
        // No refinement container on this page: nothing to record.
        return;
    }

    $breadcrumb = $d->find('div[id=breadcrumb]', 0);
    if (!is_null($breadcrumb)) {
        foreach ($breadcrumb->children() as $crumb) {
            $path .= trim($crumb->innertext) . "/";
        }
        // The current category is the text after the last ">" separator.
        $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
    }

    $refinements = $facets->find('div[class=S2refinementsContainer]', 0);
    if (is_null($refinements)) {
        return;
    }

    foreach ($refinements->children() as $div) {
        // The link text is cut at the first "(" before being used as the name.
        $name = trim(strstr($div->children(0)->innertext, "(", true));
        $url = $baseurl . $div->children(0)->href;
        $data = array("Name" => $name, "Path" => $path, "URL" => $url);
        echo $path . "/" . $name . "\n";
        if ($local) {
            fputcsv($f, array($name, $path, $url));
        } else {
            scraperwiki::save_sqlite(array("URL"), $data);
        }
        getCategories($url);
    }
}
Пример #6
0
 /**
  * Compile a template file: read it, parse it, convert the DOM with
  * {@see convert()} and finally apply macros with {@see transform()}.
  *
  * When the template contains an element matching '[j:main]' only that
  * element is converted; otherwise every node whose parent is the document
  * root is converted in document order.
  *
  * @param string $template Template file path.
  * @return string PHP template content.
  * @throws InvalidTemplateException If template is inaccessible or invalid.
  */
 public function compile($template)
 {
     $dom = new \simple_html_dom();
     $this->currentTemplate = $template;

     $source = file_get_contents($template);
     if ($source === false) {
         throw new InvalidTemplateException(tr('Could not read template: %1', $template));
     }
     if (!$dom->load($source, true, false)) {
         throw new InvalidTemplateException(tr('Could not parse template: %1', $template));
     }

     $root = new InternalNode();
     $main = $dom->find('[j:main]', 0);
     if (!isset($main)) {
         // No explicit main element: take all top-level nodes instead.
         foreach ($dom->find('*, text') as $node) {
             if ($node->parent->tag == 'root') {
                 $root->append($this->convert($node));
             }
         }
     } else {
         $root->append($this->convert($main));
     }

     $this->transform($root);
     return $root->__toString();
 }
Пример #7
0
/**
 * Scrape a Gatherer card page and store the card in the ScraperWiki SQLite
 * store.
 *
 * All text values are converted from UTF-8 to ISO-8859-1 (with
 * transliteration) and trimmed before saving.
 *
 * @param string $url URL of the card detail page.
 */
function getCardInfo($url)
{
    $baseURL = 'http://gatherer.wizards.com/Pages/Card/';
    // Every element of interest shares this ASP.NET control-id prefix.
    $idPrefix = 'ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_';

    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($url));

    $toLatin1 = function ($value) {
        return iconv("UTF-8", "ISO-8859-1//TRANSLIT", $value);
    };
    // Value cell of a named detail row. The node itself (not its plaintext)
    // is passed to iconv(), exactly as in the original implementation.
    $rowValue = function ($row) use ($dom, $idPrefix, $toLatin1) {
        return $toLatin1($dom->find('div[id=' . $idPrefix . $row . '] div[class=value]', 0));
    };

    $imageSrc = $dom->find('img[id=' . $idPrefix . 'cardImage]', 0)->src;
    $imgURL = $baseURL . str_replace("amp;", "", $imageSrc);

    $name = $toLatin1($dom->find('div[id=' . $idPrefix . 'nameRow] div[class=value]', 0)->plaintext);

    // The mana cost is the first letter of each mana symbol's alt text.
    $mana = "";
    foreach ($dom->find('div[id=' . $idPrefix . 'manaRow] div[class=value] img') as $symbol) {
        $mana .= substr($symbol->alt, 0, 1);
    }
    $mana = $toLatin1($mana);

    scraperwiki::save_sqlite(array("card"), array(
        "Card" => trim($name),
        "Image" => $imgURL,
        "Mana" => trim($mana),
        "CMC" => trim($rowValue('cmcRow')),
        "Type" => trim($rowValue('typeRow')),
        "Card Text" => trim($rowValue('textRow')),
        "Flavor Text" => trim($rowValue('flavorRow')),
        "Artist" => trim($rowValue('artistRow')),
        "Card Number" => trim($rowValue('numberRow')),
        "Rarity" => trim($rowValue('rarityRow')),
        "Expansion" => trim($rowValue('setRow')),
    ));
}
Пример #8
0
 /**
  * Rewrite the URLs of an HTML fragment so they are absolute.
  *
  * Handles the href of <a> and <link>, the src of <img> and <script>, and
  * url(...) references inside <style> blocks. Each URL is resolved with
  * self::absolutizeUrl().
  *
  * @param string $sBaseUrl Base URL the document was loaded from.
  * @param string $sHtml    HTML markup to rewrite.
  * @return string The rewritten markup.
  */
 public static function absolutizeHtml($sBaseUrl, $sHtml)
 {
     $oHtml = new simple_html_dom();
     $oHtml->load($sHtml);

     $aUrlAttributes = array('a' => 'href', 'img' => 'src', 'script' => 'src', 'link' => 'href');
     foreach ($aUrlAttributes as $sTagName => $sAttribute) {
         foreach ($oHtml->find($sTagName) as $oTag) {
             // Only rewrite attributes that are actually present: assigning
             // to a missing one would e.g. give every inline <script> a src.
             if (!$oTag->hasAttribute($sAttribute)) {
                 continue;
             }
             $oTag->$sAttribute = self::absolutizeUrl($sBaseUrl, $oTag->$sAttribute);
         }
     }

     // Parse url() in inline css
     foreach ($oHtml->find('style') as $oTag) {
         $oTag->innertext = preg_replace_callback('|url\\s*\\(\\s*[\'"]?([^\'"\\)]+)[\'"]?\\s*\\)|', function ($aMatches) use($sBaseUrl) {
             return 'url("' . trim(self::absolutizeUrl($sBaseUrl, $aMatches[1])) . '")';
         }, $oTag->innertext);
     }

     // Serialise first, then release the DOM.
     $sResult = $oHtml . '';
     $oHtml->clear();
     return $sResult;
 }
Пример #9
0
 /**
  * Fetch the lottery result page and extract the latest draw.
  *
  * On success $this->data is set to an array with the keys 'qihao' (draw
  * number), 'kjsj' (current timestamp) and 'code' (winning number without
  * spaces). The script is terminated with exit when the page cannot be
  * fetched or the draw has not been published yet.
  */
 private function get_data()
 {
     include_once 'simplehtmldom_1_5/simple_html_dom.php';
     $simple_html_dom = new \simple_html_dom();

     // Read through the zlib stream wrapper so a compressed response is inflated.
     $data = @file_get_contents("compress.zlib://" . self::URL);
     if (!$data) {
         $this->setLog(false, '重庆时时彩-开奖数据抓取失败');
         exit('重庆时时彩-数据抓取失败,请尽快联系网站管理员' . "\r\n");
     }

     // Convert the page to UTF-8.
     $encode = mb_detect_encoding($data, array('ASCII', 'UTF-8', 'GB2312', "GBK", 'BIG5'));
     $content = iconv($encode, 'UTF-8', $data);
     $simple_html_dom->load($content);

     $aside = $simple_html_dom->find('div[class=aside]', 0);

     // Draw number.
     $qihao = $aside->find('h3', 0)->find('em', 0)->plaintext;

     // Second row of the result table holds the winning number (cell 1) and
     // the draw status (cell 2); resolve the row once for both reads.
     $resultRow = $aside->find('div[class=mod-aside mod-aside-xssckj]', 0)->find('div[class=bd]', 0)->find('div[class=kpkjcode]', 0)->find('table', 0)->find('tr', 1);

     // Winning number.
     $code = $resultRow->find('td', 1)->plaintext;
     if ($code == '--') {
         exit('重庆时时彩-等待开奖...' . "\r\n");
     }

     // Either marker means the draw is still pending. The former `&&` could
     // never be true because one value cannot equal two different strings.
     $isKaiJiang = $resultRow->find('td', 2)->plaintext;
     if ($isKaiJiang == '--' || $isKaiJiang == '开奖中') {
         exit('重庆时时彩-等待开奖...' . "\r\n");
     }
     $simple_html_dom->clear();

     // Remove the spaces between the digits of the winning number.
     $code = str_replace(" ", '', $code);
     // Draw time (time of scraping).
     $kjsj = date('Y-m-d H:i:s');
     $this->data = ['qihao' => $qihao, 'kjsj' => $kjsj, 'code' => $code];
 }
Пример #10
0
/**
 * Scrape one EURES search-result page and insert every job not yet present
 * into the `job` table.
 *
 * For each 'table[class=JResult]' the job URL/id is taken from the title
 * link's onclick handler and the description and source from the cells that
 * follow the 'Description:' / 'Source:' headers. Progress is echoed as HTML.
 * The "Next page" link is detected, but following it is currently commented
 * out at the end of the function.
 *
 * NOTE(review): the mysql_* functions used here were removed in PHP 7, and
 * $url_job, $url_id, $url_search and $country_id are interpolated into SQL
 * without escaping - consider a prepared-statement API.
 *
 * @param string $url_search URL of the search-result page.
 * @param mixed  $country_id Value stored in job.country_id.
 */
function scraper($url_search, $country_id)
{
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";
    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);
    foreach ($dom->find('table[class=JResult]') as $result) {
        // NOTE(review): $url_job, $url_id, $description and $source_id are not
        // reset per result table, so a table lacking one of them reuses the
        // value from the previous table (or is undefined on the first one).
        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The job path is the first single-quoted argument of the onclick
            // handler; its first character is dropped before appending.
            $chars = explode("'", $job_page->onclick);
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }
        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $description = trim($data->next_sibling()->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($data->next_sibling()->plaintext);
                // Manual quote escaping for the SQL built by insert_name().
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != '&nbsp;') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }
        // Manual quote escaping for the INSERT below.
        $description = str_replace("'", "\\'", $description);
        $description = str_replace("</BR>", "", $description);
        // Insert only jobs whose URL is not stored yet.
        $sql = mysql_query("SELECT * FROM job WHERE url = '{$url_job}'");
        $cont = mysql_num_rows($sql);
        if ($cont == 0) {
            mysql_query("INSERT INTO job SET \n\t\t\t\t\turl = '{$url_job}', \n\t\t\t\t\turl_id = '{$url_id}', \n\t\t\t\t\tdescription = '{$description}', \n\t\t\t\t\tsource_id = '{$source_id}', \n\t\t\t\t\turl_search = '{$url_search}', \n\t\t\t\t\tcountry_id='{$country_id}',\n\t\t\t\t\turl_scraper_date = SYSDATE(),\t \n\t\t\t\t\turl_scraper_hour = SYSDATE()");
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }
    // Look for the pagination link to the following result page.
    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        $text = $next_page->plaintext;
        if ($text == "Next page") {
            $url_next = substr($next_page->href, 1);
            $url_next = $base_url . $url_next;
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }
    // Drop the locals before the (optional) recursion below.
    unset($html, $dom, $result, $job_page, $data, $next_page, $text, $url_id, $url_job, $description, $source, $source_id, $url_search);
    //Comment this for tests, uncomment this to get all data
    //	if ($has_next == true){
    //		sleep(1);
    //		scraper($url_next, $country_id);
    //	}
}
Пример #11
0
 /**
  * Create the table structure described by the <table> elements of
  * self::$XML.
  *
  * Every table is dropped first and then re-created from its <pk> and
  * <fields> children. The outcome of each CREATE TABLE is reported through
  * tprintOk() / tprintError().
  *
  * @param array $configs Uses the keys 'table-prefix' and 'charset'.
  */
 private static function createTables($configs)
 {
     $tables = self::$XML->find("table");
     foreach ($tables as $value) {
         $tableName = $configs["table-prefix"] . $value->name;
         self::query("DROP TABLE IF EXISTS `{$tableName}`");
         $sql = "CREATE TABLE `{$tableName}`(";

         // Primary-key column.
         $pk = $value->find("pk", 0);
         if ($pk) {
             $sql .= "`{$pk->name}` {$pk->type} NOT NULL ";
             if ($pk->ai) {
                 $sql .= "AUTO_INCREMENT ";
             }
             $sql .= "COMMENT '主键',";
         }

         // Regular columns.
         $fields = $value->find("fields", 0);
         if ($fields) {
             foreach ($fields->children() as $fd) {
                 // "0" is a valid default although it is falsy in PHP.
                 if ($fd->default || $fd->default === "0") {
                     if (in_array($fd->default, self::$DEFAULT_VALUE_KEYWORD)) {
                         // Keyword defaults are emitted unquoted.
                         $sql .= "`{$fd->name}` {$fd->type} NOT NULL DEFAULT {$fd->default} COMMENT '{$fd->comment}',";
                     } else {
                         $sql .= "`{$fd->name}` {$fd->type} NOT NULL DEFAULT '{$fd->default}' COMMENT '{$fd->comment}',";
                     }
                 } else {
                     // No default value.
                     $sql .= "`{$fd->name}` {$fd->type} NOT NULL COMMENT '{$fd->comment}',";
                 }

                 // Optional index on the column.
                 if ($fd->getAttribute("add-index") == "true") {
                     $indexType = $fd->getAttribute("index-type");
                     if ($indexType == "normal") {
                         $sql .= "KEY `{$fd->name}` (`{$fd->name}`), ";
                     } elseif ($indexType == "unique") {
                         $sql .= "UNIQUE KEY `{$fd->name}` (`{$fd->name}`),";
                     }
                 }
             }
         }

         if ($pk) {
             $sql .= "PRIMARY KEY (`{$pk->name}`)";
         } else {
             // Without a primary key the last column/index definition would
             // leave a dangling comma before ")", which is invalid SQL.
             $sql = rtrim($sql, ", ");
         }

         $sql .= ") ENGINE={$value->engine}  DEFAULT CHARSET={$configs['charset']} COMMENT='{$value->comment}' AUTO_INCREMENT=1 ;";
         if (self::query($sql) !== false) {
             tprintOk("create table '{$tableName}' successfully.");
         } else {
             tprintError("create table '{$tableName}' faild.");
             tprintError(self::$DB_CONN->error);
         }
     }
 }
 /**
  * Discover the RSS/Atom feeds advertised by a URL and echo them as JSON.
  *
  * If the URL itself is a feed (contains a <channel> or <feed> element) a
  * single entry titled 'RSS' or 'Atom' is returned. Otherwise every
  * <link> element whose type matches application...rss or application...atom
  * is listed once, with root-relative and protocol-relative hrefs resolved
  * against the page's origin.
  *
  * Output: a JSON array of {"href": ..., "title": ...} objects.
  *
  * @param string $url Page or feed URL; the scheme may be omitted.
  */
 public function find($url)
 {
     # sanitize url
     $url = urldecode(strip_tags($url));
     # Remove a trailing slash (guarded so an empty string is not indexed)
     if ($url !== '' && substr($url, -1) === '/') {
         $url = substr($url, 0, -1);
     }
     # Add a scheme only when none is present; https:// URLs are kept as-is
     if (!preg_match('#^https?://#i', $url)) {
         $url = 'http://' . $url;
     }

     # List of discovered feeds
     $results = array();
     # Try to load the content of url
     $content = @file_get_contents($url);
     if (!empty($content)) {
         $html = new simple_html_dom();
         $html->load($content, true);

         if (count($html->find('channel')) > 0) {
             $results[] = array('href' => $url, 'title' => 'RSS');
         } elseif (count($html->find('feed')) > 0) {
             $results[] = array('href' => $url, 'title' => 'Atom');
         } else {
             # Origin (scheme://host[:port]) used to resolve relative hrefs
             $parts = parse_url($url);
             if (!is_array($parts)) {
                 $parts = array();
             }
             $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
             $origin = $url;
             if (isset($parts['host'])) {
                 $origin = $scheme . '://' . $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');
             }

             # hrefs already reported; $results holds arrays, so in_array()
             # on it could never detect a duplicate href
             $seen = array();
             foreach ($html->find('link') as $link) {
                 $href = (string) $link->href;
                 $type = (string) $link->type;
                 # Save only feeds (ereg() no longer exists in PHP 7+)
                 if ($href === '' || !preg_match('/application.*(rss|atom)/', $type)) {
                     continue;
                 }
                 if (strpos($href, '//') === 0) {
                     # protocol-relative
                     $href = $scheme . ':' . $href;
                 } elseif ($href[0] === '/') {
                     # root-relative: resolve against the origin, not the page path
                     $href = $origin . $href;
                 }
                 if (isset($seen[$href])) {
                     continue;
                 }
                 $seen[$href] = true;
                 $results[] = array('href' => $href, 'title' => $link->title);
             }
         }
         $html->clear();
     }
     # Emit the feed list as JSON
     echo json_encode($results);
 }
Пример #13
0
 /**
  * Build the list of page images of a chapter.
  *
  * The page numbers come from the options of the 'pagejump' select box; the
  * image directory is taken from the first image inside '#page'.
  *
  * @param string $chapter_url URL of the chapter's first page.
  * @param string $prefix      First part of each target file name.
  * @param string $infix       Second part of each target file name.
  * @return array Map of "<prefix>-<infix>-<page>.jpg" => image URL.
  */
 public function get_images($chapter_url, $prefix, $infix)
 {
     // NOTE(review): $ifx is computed but never used; the file names below
     // use the raw $infix. Confirm whether the padded value was intended.
     $ifx = Text::create($infix)->pad(3)->to_s();

     $chapterPage = new Page($chapter_url);
     $dom = new simple_html_dom();
     $dom->load($chapterPage->content());

     $pageJump = $dom->find('select[name="pagejump"]', 0);
     $firstImage = $dom->find('#page', 0)->find('img', 0);
     $srcdir = dirname($firstImage->src);

     $pages = array();
     foreach ($pageJump->find('option') as $option) {
         $number = $option->value;
         $pages["{$prefix}-{$infix}-{$number}.jpg"] = $srcdir . '/' . $number . '.jpg';
     }
     return $pages;
 }
/**
 * Parse a job-offer page and print one JSON record per detail span.
 *
 * The record is built from fixed span positions: span 1 holds the update
 * text (characters 29-30 are read as the day number), span 5 the contract
 * type and span 6 the experience.
 *
 * @param string $html      Raw HTML of the offer page.
 * @param mixed  $reference Not used by this function.
 */
function scrappe_offre($html, $reference)
{
    $dom = new simple_html_dom();
    $dom->load($html);

    $container = $dom->find("div.tx-sqliwebServiceanpe-pi5");
    $details = $dom->find("div.tx-sqliwebServiceanpe-pi5 span.texteANPEDetail");
    $updatedText = $details[1]->plaintext;

    foreach ($details as $detail) {
        $cells = $detail->find("td");
        $day = intval($updatedText[29] . $updatedText[30]);
        $record = array(
            'actualiseJJ' => $day,
            'actualise le' => $updatedText,
            'type_contrat' => $details[5]->plaintext,
            'analyse_type_contrat' => "",
            'experiance' => $details[6]->plaintext,
        );
        print json_encode($record) . "\n";
    }
}
Пример #15
0
/**
 * Scrape the flight table for one request type and save every flight row.
 *
 * @param string $param Value of the 'type' query parameter appended to BASE_URL.
 * @param mixed  $type  Stored as 'flight_type' in every record.
 */
function scrapeHTML($param, $type)
{
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape(BASE_URL . "?type={$param}"));

    // Flight rows are the table rows with HEIGHT='25'.
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("td");

        // Columns 0-5: airline, flight number, destination, time, gate, remarks.
        $values = array();
        for ($col = 0; $col <= 5; $col++) {
            $values[] = removeSpaces($cells[$col]->plaintext);
        }
        list($airline, $flight_num, $destination, $time, $gate, $remarks) = $values;

        // The header row has the literal column title in the first cell.
        if ($airline == "Airline") {
            continue;
        }

        $flight_data = array(
            "date" => date("m.d.y"),
            "airline" => $airline,
            "flight_type" => $type,
            "flight_num" => $flight_num,
            "destination" => $destination,
            "time" => $time,
            "gate" => $gate,
            "remarks" => $remarks,
        );
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }

    $dom->clear();
}
Пример #16
0
 /**
  * Look the search term up by requesting its game page directly; the game
  * counts as found when the page loads, is not the error page and exposes a
  * watch-toggle link carrying a site-area id.
  *
  * Side effects: the search term is normalised into a URL slug, and on
  * success the game id and direct URL are stored on the instance.
  *
  * @return bool
  */
 public function search()
 {
     if (empty($this->searchTerm)) {
         return false;
     }

     $this->_title = $this->searchTerm;
     // Replace parenthesised text, hyphens, periods and underscores by a space.
     $this->searchTerm = preg_replace('#\\(.*?\\)|[-._]#i', ' ', $this->searchTerm);
     // Collapse runs of whitespace and trim the ends.
     $this->searchTerm = trim(preg_replace('#\\s{2,}#', ' ', $this->searchTerm));
     // Lower-case and use "-" as word separator, as in the game URLs.
     $this->searchTerm = preg_replace('#\\s#', '-', strtolower($this->searchTerm));

     if ($this->getUrl(self::DESURAURL . '/games/' . $this->searchTerm) === false) {
         return false;
     }
     if (preg_match('#(Games system error)#i', $this->_response)) {
         return false;
     }

     $this->_ret = $this->_html->find("a#watchtoggle", 0);
     if (!$this->_ret) {
         return false;
     }
     if (!preg_match('#siteareaid=(?<gameid>\\d+)#', $this->_ret->href, $matches)) {
         return false;
     }

     $this->_desuraGameID = $matches['gameid'];
     $this->_directURL = self::DESURAURL . '/games/' . $this->searchTerm;
     return true;
 }
Пример #17
0
 /**
  * Search the site for the search term and compare the title of the
  * selected hit with it.
  *
  * The hit's detail page is loaded as a side effect, and its permalink is
  * stored in $this->_directUrl when present.
  *
  * @return bool True if the titles are at least 90% similar.
  */
 public function search()
 {
     if (!isset($this->searchTerm)) {
         return false;
     }

     $this->_trailUrl = self::TRAILINGSEARCH . urlencode($this->searchTerm);
     if ($this->getUrl() === false) {
         return false;
     }

     $hit = $this->_html->find('div.product-info, div.title', 1);
     if (!$hit) {
         return false;
     }

     $this->_title = trim($hit->plaintext);
     // Normalise the title before comparing it with the search term.
     $title = preg_replace('/XXX/', '', $hit->plaintext);
     $title = preg_replace('/\\(.*?\\)|[-._]/i', ' ', $title);
     $title = trim($title);

     $anchor = $hit->find('a', 0);
     if (!$anchor) {
         return false;
     }

     // Follow the hit to its detail page.
     $this->_trailUrl = trim($anchor->href);
     if ($this->getUrl() === false) {
         return false;
     }

     $permalink = $this->_html->find('#link-to-this', 0);
     if ($permalink) {
         $this->_directUrl = trim($permalink->href);
     }

     similar_text(strtolower($this->searchTerm), strtolower($title), $p);
     return $p >= 90;
 }
Пример #18
0
 /**
  * Search the site for the search term and look for a cover image whose alt
  * text matches it exactly once non-word characters are removed.
  *
  * For every exact match with a numeric SKU in the image file name the
  * product page is requested; the title and URLs of the match are stored on
  * the instance. All covers are inspected, so the last match wins.
  *
  * @return bool - true if search = 100%
  */
 public function search()
 {
     $result = false;
     if (!isset($this->searchTerm)) {
         return $result;
     }

     $this->_trailUrl = self::TRAILINGSEARCH . urlencode($this->searchTerm);
     if ($this->getUrl() === false) {
         return $result;
     }

     $covers = $this->_html->find('img[rel=license]');
     if (!$covers || count($covers) <= 0) {
         return $result;
     }

     foreach ($covers as $cover) {
         if (!isset($cover->alt)) {
             continue;
         }

         $title = trim($cover->alt, '"');
         $title = preg_replace('/XXX/', '', $title);

         // Compare on word characters only.
         $comparetitle = preg_replace('/[^\\w]/', '', $title);
         $comparesearch = preg_replace('/[^\\w]/', '', $this->searchTerm);
         similar_text($comparetitle, $comparesearch, $p);
         if ($p != 100) {
             continue;
         }

         // The SKU is the numeric file name of the cover image.
         if (!preg_match('/\\/(?<sku>\\d+)\\.jpg/i', $cover->src, $matches)) {
             continue;
         }

         $this->_title = trim($title);
         $this->_trailUrl = "/dvd_view_" . (string) $matches['sku'] . ".html";
         $this->_directUrl = self::ADMURL . $this->_trailUrl;
         if ($this->getUrl() !== false) {
             $result = true;
         }
     }

     return $result;
 }
Пример #19
0
 /**
  * Parse the form's print model, collect its <ic> items and persist the
  * processed markup.
  *
  * The collected elements are kept in $this->_cache and in the "form_<ID>"
  * cache entry; the re-serialised markup (and the item counter, when it
  * differs from the stored one) is written back through FlowFormType.
  *
  * @param bool $isUpdate When true, existing data-id attributes are stripped
  *                       from the model and item numbering starts at 0.
  */
 public function parse($isUpdate = false)
 {
     Ibos::import("application.extensions.simple_html_dom", true);

     $model = $isUpdate
         ? preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel)
         : $this->printmodel;
     $max = $isUpdate ? 0 : intval($this->itemmax);

     $elements = array();
     $doc = new simple_html_dom();
     $doc->load($model, true, true, CHARSET);

     $items = $doc->find("ic");
     $config = $this->getItemConfig();
     if (!empty($items) && !empty($config)) {
         $this->refactor($items, $config, $max, $elements);
     }

     $html = $doc->save();
     $this->_cache = $elements;
     CacheUtil::set("form_" . $this->ID, $elements);

     $form = array("printmodelshort" => $html);
     if ($max != $this->itemmax) {
         $form["itemmax"] = $max;
     }

     $doc->clear();
     FlowFormType::model()->modify($this->ID, $form);
 }
Пример #20
0
 /**
  * Extract image, title and price data from a search-result page.
  *
  * Every 'li[class=s-result-item]' element that yields at least one field
  * becomes one entry of the result, keyed by the element's position in the
  * find() result. A non-empty result is also stored in $this->aResult.
  *
  * @param string $input Raw HTML of the result page.
  * @return array|null Map of item index => array with any of the keys
  *                    'image', 'title', 'price', 'numPrice'; null when
  *                    nothing was extracted.
  */
 protected function parsing($input)
 {
     include_once "inc/simple_html_dom.php";
     $html = new simple_html_dom();
     $html->load($input);
     $result = [];

     foreach ($html->find('li[class=s-result-item]') as $key => $innerData) {
         // Reset per item: otherwise the variable is undefined for a first
         // item without data and later items inherit stale fields.
         $atmp = [];

         foreach ($innerData->find('img[class=s-access-image]') as $img) {
             $atmp['image'] = $img->getAttribute('src');
         }
         foreach ($innerData->find('h2[class=s-access-title]') as $title) {
             $atmp['title'] = $title->innertext();
         }
         foreach ($innerData->find('span[class=s-price]') as $priceNode) {
             $priceText = $priceNode->innertext();
             $atmp['price'] = $priceText;
             // Drop the leading currency character and thousands separators.
             $atmp['numPrice'] = str_replace(",", '', substr($priceText, 1));
         }

         if ($atmp !== []) {
             $result[$key] = $atmp;
         }
     }

     if (!empty($result)) {
         return $this->aResult = $result;
     }
 }
Пример #21
0
 /**
  * Wrap an HTML fragment in <html><body>...</body></html>, parse it and
  * return the child nodes of the resulting <html> element.
  *
  * @param string $html HTML fragment.
  * @return array Child nodes of the <html> element (returned by reference).
  */
 private function &_getHtmlDomArray($html)
 {
     $wrapped = '<html><body>' . $html . '</body></html>';

     $parser = new simple_html_dom();
     $parser->load($wrapped);

     // A named variable is required because the method returns by reference.
     $children = $parser->find('html', 0)->children();
     return $children;
 }
Пример #22
0
/**
 * Determine the median price of a vehicle listing.
 *
 * Reads the total number of results from the paginator, loads the result
 * page that is expected to contain the median item and takes the price of
 * that row (or the rounded mean of the two middle rows when the position is
 * fractional).
 *
 * @param array  $params Query parameters for the listing; 'ak' is
 *                       overwritten with the computed offset when the median
 *                       lies beyond the first page.
 * @param string $url    Listing endpoint.
 * @return array ['n' => total result count as read from the page,
 *                'val' => median price]
 */
function read_listing($params, $url = 'http://www.auto24.ee/kasutatud/nimekiri.php')
{
    $endpoint = build_query($url, $params);
    $html = scraperWiki::scrape($endpoint);
    $dom = new simple_html_dom();
    $dom->load($html);
    $totalResultsEl = $dom->find('.paginator .current-range strong');
    $totalResults = $totalResultsEl[0]->plaintext;
    // 1-based position of the median; ends in .5 for an even result count.
    $medianItem = ($totalResults + 1) / 2;
    if ($medianItem > RESULTS_PER_PAGE) {
        // Jump to the page containing the median and make the position
        // relative to that page.
        // NOTE(review): when the median position is an exact multiple of
        // RESULTS_PER_PAGE, $medianItem becomes 0 here and $lPoint below is
        // -1; for x.5 positions on a page boundary the two middle rows sit on
        // different pages. Confirm whether this edge case matters.
        $listingOffset = floor($medianItem / RESULTS_PER_PAGE) * RESULTS_PER_PAGE;
        $params['ak'] = $listingOffset;
        $medianItem -= $listingOffset;
        $endpoint = build_query($url, $params);
        $html = scraperWiki::scrape($endpoint);
        $dom = new simple_html_dom();
        $dom->load($html);
    }
    $rows = $dom->find("[@id=usedVehiclesSearchResult] .result-row");
    // 0-based row indexes of the lower and upper middle items.
    $lPoint = floor($medianItem) - 1;
    $hPoint = ceil($medianItem) - 1;
    $a24ksi = 0;
    if ($lPoint == $hPoint) {
        // Odd count: a single middle row.
        $rowData = get_row_data($rows[$lPoint]);
        $a24ksi = $rowData['price'];
    } else {
        // Even count: rounded mean of the two middle rows.
        $lRowData = get_row_data($rows[$lPoint]);
        $hRowData = get_row_data($rows[$hPoint]);
        $a24ksi = round(($lRowData['price'] + $hRowData['price']) / 2);
    }
    return array('n' => $totalResults, 'val' => $a24ksi);
}
Пример #23
0
 /**
  * Collect the movie URLs linked from an entry.
  *
  * Candidate links are the <a> parents of images whose alt text starts with
  * "動画". If the element three siblings after such a link is a <span>, the
  * whole result is discarded and scanning stops.
  *
  * @param  simple_html_dom $html
  * @return array URLs as returned by UriManager::resolve().
  **/
 public function getMoviesUrl($html)
 {
     $query = 'div.entryBody div.topmore a img';
     $movies_els = $html->find($query);
     $movie_data = array();
     $manager = new UriManager();

     // Only images labelled as a movie link are of interest.
     foreach ($movies_els as $movies_el) {
         if (!preg_match('/^動画.+/', $movies_el->getAttribute('alt'))) {
             continue;
         }

         // The link is taken from the image's parent <a>.
         $parent_el = $movies_el->parentNode();

         // Walk up to three siblings forward; stops early (with null) when
         // the parent has fewer following siblings.
         $next_el = $parent_el;
         for ($i = 0; $i < 3; $i++) {
             $next_el = $next_el->nextSibling();
             if (is_null($next_el)) {
                 break;
             }
         }

         // Guard against the null left behind by a short sibling chain,
         // which previously caused a call on null here.
         if (!is_null($next_el) && $next_el->nodeName() == 'span') {
             $movie_data = [];
             break;
         }

         if ($parent_el->nodeName() == 'a') {
             $movie_data[] = $manager->resolve($parent_el->getAttribute('href'));
         }
     }
     return $movie_data;
 }
/**
 * Fetch an NGO's detail page and add the fields found there to its record.
 *
 * The page at $ngo["url"] is downloaded and every <p> is inspected:
 *  - a paragraph containing "Website" contributes the href of its first link
 *    as $ngo["website"];
 *  - a paragraph containing "Email" contributes the text of its first link
 *    as $ngo["email"];
 *  - each label still listed in $infosWeWant is looked up with extractInfo()
 *    and, once a truthy value is found, stored under that label and not
 *    searched for again.
 *
 * The resulting record is also dumped with print_r().
 *
 * @param array $ngo Record with at least a "url" key.
 * @return array The record, extended with whatever fields were found.
 */
function scrapeDetails($ngo)
{
    $dom = new simple_html_dom();
    $dom->load(scraperwiki::scrape($ngo["url"]));
    $infosWeWant = array('Telefon', 'Rechtsform', 'Steuerstatus', 'Weltanschauliche Ausrichtung', 'Anzahl Mitarbeiter', 'Gesamteinnahmen:', 'Davon Sammlungseinnahmen', 'Bezugsjahr:');
    foreach ($dom->find('p') as $p) {
        $text = $p->plaintext;
        $mentionsWebsite = strpos($text, "Website") !== false;
        $mentionsEmail = strpos($text, "Email") !== false;
        if ($mentionsWebsite || $mentionsEmail) {
            // A paragraph may mention the keyword without containing a link;
            // find() returns null then, so only read from a real anchor.
            $link = $p->find('a', 0);
            if (!is_null($link)) {
                if ($mentionsWebsite) {
                    $ngo["website"] = $link->href;
                }
                if ($mentionsEmail) {
                    $ngo["email"] = $link->plaintext;
                }
            }
        }
        foreach ($infosWeWant as $key => $info) {
            $res = extractInfo($p, $info);
            if ($res) {
                $ngo[$info] = $res;
                // Found: do not search for this label in later paragraphs.
                unset($infosWeWant[$key]);
            }
        }
    }
    print_r($ngo);
    return $ngo;
}
Пример #25
0
 /**
  * Get the video URLs from an entry page.
  *
  * Two sources are scanned, in this order:
  *  1. the src of every iframe matched by
  *     'div.ently_body div.ently_text div.video-container iframe';
  *  2. the href of every anchor matched by
  *     'div.ently_outline div.ently_body a' whose text contains "リンク(".
  *
  * Each candidate is passed through UriManager::resolve(). Iframe results
  * are kept unless they are exactly false; anchor results are kept only
  * when truthy.
  *
  * @param  simple_html_dom $html Parsed page to search.
  * @return array Resolved URLs, iframe sources first, then anchor links.
  **/
 public function getMoviesUrl($html)
 {
     $manager = new UriManager();
     $movie_data = array();
     // Embedded players: take the iframe source.
     foreach ($html->find('div.ently_body div.ently_text div.video-container iframe') as $frame) {
         if (!$frame->hasAttribute('src')) {
             continue;
         }
         $url = $manager->resolve($frame->getAttribute('src'));
         if ($url !== false) {
             $movie_data[] = $url;
         }
     }
     // Text links: only anchors labelled "リンク(".
     foreach ($html->find('div.ently_outline div.ently_body a') as $link) {
         // The parenthesis must be escaped: an unescaped "(" opens a group
         // that is never closed, so the pattern fails to compile and
         // preg_match() returns false for every anchor.
         if (!preg_match('/リンク\(/', $link->plaintext) || !$link->hasAttribute('href')) {
             continue;
         }
         $resolve_url = $manager->resolve($link->getAttribute('href'));
         if ($resolve_url) {
             $movie_data[] = $resolve_url;
         }
     }
     return $movie_data;
 }
/**
 * Download one page of the yellowpages.co.id letter directory and echo every
 * 'ul.directory-list' element found on it.
 *
 * When an argument is null or '', the value stored under the scraperwiki
 * variable 'last_alphabet_loaded' / 'last_page_loaded' is used instead; if
 * nothing is stored either, the letter defaults to 'a' and the page to 1.
 *
 * @param string     $last_alphabet Letter to browse.
 * @param int|string $last_page     Page number to request.
 * @return void
 */
function scrap_yp($last_alphabet = '', $last_page = '')
{
    if (is_null($last_alphabet) || $last_alphabet == '') {
        $savedAlphabet = scraperwiki::get_var('last_alphabet_loaded');
        $last_alphabet = is_null($savedAlphabet) ? 'a' : $savedAlphabet;
    }
    if (is_null($last_page) || $last_page == '') {
        $savedPage = scraperwiki::get_var('last_page_loaded');
        $last_page = is_null($savedPage) ? 1 : $savedPage;
    }
    $yp_base_url = 'http://www.yellowpages.co.id/browse/letter/' . $last_alphabet . '?page=' . $last_page;
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($yp_base_url));
    foreach ($dom->find("ul.directory-list") as $data) {
        echo $data;
    }
}
Пример #27
0
 /**
  * Get top10 type news.
  *
  * Picks the 'div.newslist' block at position $index (0-based, in document
  * order) and reports one result per <p> in it through
  * $this->workflow->result(). Titles are converted from GB18030 to UTF-8;
  * the numeric id is taken from an "articles/<digits>.htm" URL and is 0
  * when the URL has no such part. Paragraphs without a link, or whose
  * title ends up empty, are skipped.
  *
  * @param $index int Position of the news list block to read.
  * @return boolean False when no DOM is loaded or the selected block has
  *                 no 'dd.desc > img' icon; true otherwise (including when
  *                 no block exists at $index).
  */
 private function getTopTenNews($index)
 {
     if (!$this->htmlDom) {
         return false;
     }
     $i = 0;
     foreach ($this->htmlDom->find('div.newslist') as $element) {
         if ($i != $index) {
             $i++;
             continue;
         }
         // find() returns null when the block carries no icon; reading
         // ->src from it would be a fatal error.
         $iconNode = $element->find('dd.desc > img', 0);
         if (is_null($iconNode)) {
             return false;
         }
         $iconUrl = static::URL_BASE . ltrim($iconNode->src, '/');
         $iconPath = $this->getNewsIconFilePath($iconUrl, static::NEWS_ICON_DIR_TOP10);
         foreach ($element->find('p') as $e) {
             $urlNode = $e->find('a', 0);
             if (is_null($urlNode)) {
                 // Paragraph without a link: nothing to report.
                 continue;
             }
             $title = iconv('GB18030', 'UTF-8', trim($urlNode->plaintext));
             $content = '';
             $url = static::URL_BASE . ltrim($urlNode->href, '/');
             $id = 0;
             if (preg_match("/articles\\/([0-9]+).htm/", $url, $matches)) {
                 $id = $matches[1];
             }
             if ($title) {
                 $this->workflow->result($id, $url, $title, $content, $iconPath);
             }
         }
         // Only the block at $index is of interest.
         break;
     }
     return true;
 }
/**
 * Scrape one news article and store a teaser for it.
 *
 * The plain text of every paragraph matched by
 * 'div#content div.article_text p' is concatenated, each followed by a
 * " #<n># " marker carrying its 1-based position. word_teaser() is then
 * called on that text with a limit of 120, and the result is saved together
 * with the article URL through scraperwiki::save(), keyed on 'URL'.
 *
 * @param string $art_url Address of the article page.
 * @return array The saved record: array("TeaserM" => ..., "URL" => ...).
 */
function scrape_NG_news_article($art_url)
{
    require_once 'scraperwiki/simple_html_dom.php';
    $dom = new simple_html_dom();
    $dom->load(scraperWiki::scrape($art_url));
    $art_text_full = "";
    $art_paragraph_count = 0;
    foreach ($dom->find("div#content div.article_text p") as $data) {
        $art_paragraph_count++;
        // str_get_html() returns false when it refuses the input (e.g. an
        // empty string); treat that paragraph as having no text.
        $parsed = str_get_html($data);
        $tmp = $parsed ? $parsed->plaintext : '';
        $art_text_full .= $tmp . " #" . $art_paragraph_count . "# ";
    }
    $art_teaserM = word_teaser($art_text_full, 120);
    $record = array("TeaserM" => $art_teaserM, "URL" => $art_url);
    scraperwiki::save(array('URL'), $record);
    return $record;
}
/**
 * Scrape flight rows from a fixed norwegian.no page and save one record per
 * row through saveData().
 *
 * Each <tr height="25"> is expected to hold at least six <div> cells, read
 * in this order: airline, flight number, destination, time, gate, remarks.
 * Rows with fewer cells are skipped, as is the header row (the one whose
 * first cell reads "Airline"). Every record is stamped with today's date in
 * "m.d.y" format and saved with the key
 * (date, airline, flight_type, flight_num).
 *
 * @param mixed $param Not used by this function.
 * @param mixed $type  Stored as-is in each record's "flight_type" field.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $html = scraperWiki::scrape("http://www.norwegian.no/fly/lavpris/?D_City=CPH&A_City=DUB&TripType=2&D_Day=1&D_Month=201104&R_Day=1&R_Month=201104&AdultCount=1&ChildCount=0&InfantCount=0");
    $dom = new simple_html_dom();
    $dom->load($html);
    // Same date for every row of this run.
    $date = date("m.d.y");
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("div");
        // Indexing a missing cell would dereference null.
        if (count($cells) < 6) {
            continue;
        }
        $airline = removeSpaces($cells[0]->plaintext);
        $flight_num = removeSpaces($cells[1]->plaintext);
        $destination = removeSpaces($cells[2]->plaintext);
        $time = removeSpaces($cells[3]->plaintext);
        $gate = removeSpaces($cells[4]->plaintext);
        $remarks = removeSpaces($cells[5]->plaintext);
        // The header row is recognised by its first cell.
        if ($airline == "Airline") {
            continue;
        }
        $flight_data = array("date" => $date, "airline" => $airline, "flight_type" => $type, "flight_num" => $flight_num, "destination" => $destination, "time" => $time, "gate" => $gate, "remarks" => $remarks);
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }
    $dom->clear();
}
Пример #30
0
 /**
  * Searches for a 100% match.
  *
  * Loads the search page for the trimmed search term, then walks the
  * 'div.workshopItemTitle' elements in order. For the first title that
  * cleanTitles() accepts against the search term (both lower-cased) and
  * whose parent markup contains an "id=<digits>" (or "i=<digits>") part,
  * the game ID and direct URL are stored and the direct URL is loaded.
  * Candidates without an extractable ID are skipped rather than building
  * a URL from a missing or stale ID.
  *
  * Side effects visible here: $this->searchTerm is trimmed, $this->_title
  * holds the last title inspected, and $this->_greenlightGameID /
  * $this->_directURL hold the last candidate that was tried.
  *
  * @return bool True when a matching title was found and its direct URL
  *              loaded successfully; false otherwise.
  */
 public function search()
 {
     if (empty($this->searchTerm)) {
         return false;
     }
     $this->searchTerm = trim($this->searchTerm);
     $searchUrl = self::GREENLIGHTURL . '?searchtext=' . urlencode($this->searchTerm) . self::GREENLIGHTVARS;
     if ($this->getUrl($searchUrl) === false) {
         return false;
     }
     $titleNodes = $this->_html->find("div.workshopItemTitle");
     if (!$titleNodes) {
         return false;
     }
     foreach ($titleNodes as $titleNode) {
         $this->_title = trim($titleNode->plaintext);
         //Sanitize both searchTerm and title for a positive 100% match
         if ($this->cleanTitles(strtolower($this->_title), strtolower($this->searchTerm)) !== true) {
             continue;
         }
         // Without a freshly matched ID the direct URL would be built from
         // null or from a previous candidate's ID, so skip this title.
         $parentMarkup = $titleNode->parent()->outertext;
         if (!$parentMarkup || !preg_match('#id?=(?<gameid>\\d+)#', $parentMarkup, $matches)) {
             continue;
         }
         $this->_greenlightGameID = $matches['gameid'];
         $this->_directURL = self::DIRECTGAMEURL . $this->_greenlightGameID;
         if ($this->getUrl($this->_directURL) !== false) {
             return true;
         }
     }
     return false;
 }