/**
 * Scrape one listing page of a brand's models, store every model found and
 * follow the "Next page" link recursively.
 *
 * Each model row is saved to the "cell_model" table keyed by the numeric id
 * parsed from the link href, and $this->models is incremented per model.
 *
 * @param mixed  $brandId   Brand identifier stored on every model row.
 * @param string $brandName Brand name, prefixed to each model name.
 * @param string $page      URL of the listing page to scrape.
 */
function parseModelsPage($brandId, $brandName, $page)
{
    $html_content = scraperwiki::scrape($page);

    // Keep a local handle on this page's DOM: the recursive call below
    // reassigns $this->html, so releasing through the property would free the
    // wrong document and leave this one allocated.
    $dom = $this->html = str_get_html($html_content);
    if (!$dom) {
        // Nothing parseable was returned for this page.
        return;
    }

    foreach ($dom->find("div.makers a") as $el) {
        $img = $el->find('img', 0);

        $m = array();
        $m['name'] = $brandName . ' ' . $el->find('strong', 0)->innertext;
        $m['img'] = $img->src;
        $m['link'] = 'http://www.gsmarena.com/' . $el->href;
        $m['desc'] = $img->title;
        // The id is the part of the href after the first "-", minus the last
        // four characters.
        $temp = explode('-', $el->href);
        $m['id'] = (int) substr($temp[1], 0, -4);
        $m['brand_id'] = $brandId;

        scraperwiki::save_sqlite(array("id" => $m['id']), $m, "cell_model");
        $this->models++;
    }

    // Work out the next page URL before the DOM is released.
    $nextPage = null;
    $pagination = $dom->find("div.nav-pages", 0);
    if ($pagination) {
        $nextPageLink = $pagination->lastChild();
        if ($nextPageLink && $nextPageLink->title == "Next page") {
            $nextPage = 'http://www.gsmarena.com/' . $nextPageLink->href;
        }
    }

    // Free this page before descending so only one page is held at a time.
    $dom->clear();

    if ($nextPage !== null) {
        $this->parseModelsPage($brandId, $brandName, $nextPage);
    }
}
/**
 * Fetch a hypem.com item page and extract track fields from the single-quoted
 * string literals in its second <script> element.
 *
 * The first 15 quoted literals are collected in order; the result is built
 * from fixed positions in that list.
 *
 * @param string $item Item id, placed in the request URL.
 * @return array Keys: id, blog, secret, artist, song, songurl, duration.
 */
function getHypem($item)
{
    $ch = curl_init("http://hypem.com/item/{$item}?ax=1&ts=1295726809");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_COOKIESESSION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1");
    curl_setopt($ch, CURLOPT_FORBID_REUSE, true);
    // The auth cookie comes from the "hash" query parameter; send an empty
    // value rather than reading an undefined index when it is absent.
    $hash = isset($_GET["hash"]) ? $_GET["hash"] : '';
    curl_setopt($ch, CURLOPT_COOKIE, "AUTH=" . $hash . ";");
    $output = curl_exec($ch);
    curl_close($ch);

    $html = str_get_html($output);
    $scripts = $html->find('script');
    $value = $scripts[1]->innertext;

    // Normalise once, before matching: drop ampersands, then turn escaped
    // quotes (\') into an entity so they cannot terminate a quoted literal.
    // NOTE(review): the replacement for \' was an unterminated literal in the
    // previous revision; '&#039;' is the presumed intent - confirm.
    $value = str_replace("&", "", $value);
    $value = str_replace("\\'", '&#039;', $value);

    $pat = "/\\'([^\\']*?)\\'/";
    $informatie = array();
    for ($i = 0; $i < 15; $i++) {
        if (preg_match($pat, $value, $matches)) {
            // Remove the consumed literal so the next pass finds the next one.
            $value = str_replace($matches[0], '', $value);
            $informatie[] = str_replace("…", "...", $matches[1]);
        } else {
            // Keep positions stable when fewer than 15 literals exist.
            $informatie[] = '';
        }
    }

    return array(
        "id" => "http://hypem.com/item/" . $informatie[1],
        "blog" => $informatie[3],
        "secret" => $informatie[7],
        "artist" => $informatie[9],
        "song" => $informatie[10],
        "songurl" => 'http://hypem.com/serve/play/' . $informatie[1] . '/' . $informatie[7] . '.mp3',
        "duration" => $informatie[4],
    );
}
/**
 * Download a product page, merge in the data returned by get_ajax() for the
 * product's article number, and save the combined HTML to disk.
 *
 * The output file is "<$save_folder>product--<$page>.html".
 *
 * @param string $url  Product page URL (fetched with curl_get()).
 * @param int    $page Number used in the output file name.
 */
function get_product($url, $page = 0)
{
    global $save_folder;

    // Load the product page.
    $html = curl_get($url);
    $dom = str_get_html($html);

    // The article number is the integer in the <article id="post-N"> attribute.
    $article = $dom->find('article', 0);
    $art = null;
    sscanf($article->attr['id'], 'post-%d', $art);

    // Locate the script.js include. A match at offset 0 counts too, hence the
    // strict comparison. The last matching <script> wins.
    $selector = null;
    foreach ($dom->find('script') as $script) {
        if (strpos((string) $script->src, "script.js") !== false) {
            $selector = "script[src='" . $script->src . "']";
        }
    }

    // Blank the include only if one was actually found.
    if ($selector !== null) {
        $include = $dom->find($selector, 0);
        if ($include) {
            $include->outertext = '';
        }
    }

    // Ajax request for the article's data.
    $html = get_ajax($art);
    $dom2 = str_get_html($html);

    // Insert the ajax data into the placeholder div of the product page.
    $dom->find('div[id=order-variables]', 0)->innertext = $dom2;

    // Save the HTML.
    file_put_contents($save_folder . 'product--' . $page . '.html', $dom);
}
/**
 * Rewrite script tags in the response body once dispatching has finished.
 *
 * Does nothing unless the response is HTML, frontend output filters (or a
 * "pimcore_preview" request) apply, debug mode is off and the plugin is
 * enabled. When the body parses, it is passed through
 * searchForScriptSrcAndReplace() and searchForInlineScriptAndReplace() and
 * written back to the response.
 */
public function dispatchLoopShutdown()
{
    if (!Tool::isHtmlResponse($this->getResponse())) {
        return;
    }

    $filtersApply = Tool::useFrontendOutputFilters($this->getRequest())
        || $this->getRequest()->getParam("pimcore_preview");
    if (!$filtersApply) {
        return;
    }

    if (\Pimcore::inDebugMode() || !$this->enabled) {
        return;
    }

    include_once "simple_html_dom.php";

    $body = $this->getResponse()->getBody();
    $dom = str_get_html($body);

    if ($dom) {
        $dom = $this->searchForScriptSrcAndReplace($dom);
        $dom = $this->searchForInlineScriptAndReplace($dom);
        $body = $dom->save();

        // Release the parser's node graph explicitly.
        $dom->clear();
        unset($dom);
    }

    $this->getResponse()->setBody($body);
}
/**
 * Fetch a URL with cURL and parse the response with simple_html_dom.
 *
 * Sends a POST when $post is given, otherwise a GET. The previously fetched
 * effective URL is sent as Referer. Cookies are read from and written to
 * COOKIE_FILE, and the verbose cURL session is appended to CURL_LOG_FILE.
 * The parsed document replaces $this->dom (the previous one is cleared) and
 * $this->lastUrl is updated to the effective URL after redirects.
 *
 * @param string             $url  URL to request.
 * @param array|string|false $post POST fields, or false for a GET request.
 * @return mixed Result of str_get_html() for the response body.
 */
public function getDom($url, $post = false)
{
    $f = fopen(CURL_LOG_FILE, 'a+'); // cURL session log file

    $curlOptions = array(
        CURLOPT_ENCODING => 'gzip,deflate',
        CURLOPT_AUTOREFERER => 1,
        CURLOPT_CONNECTTIMEOUT => 120,
        CURLOPT_TIMEOUT => 120,
        CURLOPT_URL => $url,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 9,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => 0,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        CURLOPT_COOKIEFILE => COOKIE_FILE,
        CURLOPT_COOKIEJAR => COOKIE_FILE,
    );

    // Only route verbose output to the log when it could be opened.
    if ($f) {
        $curlOptions[CURLOPT_STDERR] = $f;
        $curlOptions[CURLOPT_VERBOSE] = true;
    }

    // Actually send the Referer; it used to be built but never handed to cURL.
    if ($this->lastUrl) {
        $curlOptions[CURLOPT_REFERER] = $this->lastUrl;
    }

    if ($post) {
        // Add POST options.
        $curlOptions[CURLOPT_POSTFIELDS] = $post;
        $curlOptions[CURLOPT_POST] = true;
    }

    $curl = curl_init();
    curl_setopt_array($curl, $curlOptions);
    $data = curl_exec($curl);
    // Remember the URL we ended up on after redirects.
    $this->lastUrl = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
    curl_close($curl);

    if ($this->dom) {
        $this->dom->clear();
        $this->dom = false;
    }
    $dom = $this->dom = str_get_html($data);

    if ($f) {
        // Arrays cannot be interpolated into a string; log them URL-encoded.
        $logged = is_array($post) ? http_build_query($post) : (string) $post;
        fwrite($f, "{$logged}\n\n");
        fwrite($f, "-----------------------------------------------------------\n\n");
        fclose($f);
    }

    return $dom;
}
/**
 * Turn cacheable widgets in rendered page markup into PHP calls.
 *
 * For every ".dm_widget" element with a non-zero numeric id (taken from its
 * "dm_widget_<id>" id attribute) that contains a ".dm_widget_cacheable" child,
 * that child's content is replaced by a call to
 * $helper->renderWidgetInner() for the current page id and widget id.
 *
 * @param string $content Page markup.
 * @return string|null Markup with embedded PHP, or null when no such widget
 *                     was found.
 */
protected function parsePageDynamicContent($content)
{
    $dom = str_get_html($content);
    $dynamicWidgets = 0;

    foreach ($dom->find('.dm_widget') as $node) {
        $id = intval(str_replace('dm_widget_', '', $node->id));
        if (!$id) {
            continue;
        }

        $cacheable = $node->find('.dm_widget_cacheable', 0);
        if (!$cacheable) {
            continue;
        }

        // Leave a placeholder token; it is expanded into PHP below.
        $dynamicWidgets++;
        $cacheable->innertext = sprintf('{#page#%s#page#}{#widget#%s#widget#}', $this->getPage()->getId(), $id);
    }

    if ($dynamicWidgets == 0) {
        // The page has no dynamic content.
        return null;
    }

    // Array-form str_replace applies the pairs in order, one after another.
    return str_replace(
        array('{#page#', '#page#}{#widget#', '#widget#}'),
        array('<?php echo $helper->renderWidgetInner(array(\'page_id\'=>', ', \'widget_id\'=>', ')); ?>'),
        $dom->innertext
    );
}
/**
 * Download a page, purify it with HTMLPurifier and read data from the first
 * element matching a selector.
 *
 * @param string $url   URL (or path) read with file_get_contents().
 * @param string $path  simple_html_dom selector.
 * @param string $parse 'tag', 'outertext', 'innertext' or 'plaintext' to get
 *                      that property of the first match; any other value
 *                      returns the array of matching nodes.
 * @return mixed The requested property, null when nothing matches or the page
 *               cannot be parsed, or the array of matches.
 */
function scrape($url, $path, $parse)
{
    $config = HTMLPurifier_Config::createDefault();
    $config->set('Core.Encoding', 'UTF-8'); // encoding of output
    $config->set('HTML.Doctype', 'XHTML 1.1'); // doctype of output
    $purifier = new HTMLPurifier($config);

    $dirty_html = file_get_contents($url);
    $clean_html = $purifier->purify($dirty_html);
    $html = str_get_html($clean_html);
    if (!$html) {
        return null;
    }

    switch ($parse) {
        case 'tag':
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            // find() without an index returns an array, which has none of
            // these properties; ask for the first match explicitly.
            $node = $html->find($path, 0);
            $ret = $node ? $node->{$parse} : null;
            break;
        default:
            // NOTE(review): these nodes are handed back after clear() below,
            // so callers presumably get detached nodes - confirm usage.
            $ret = $html->find($path);
            break;
    }

    // Clean up memory.
    $html->clear();
    unset($dirty_html, $clean_html, $html);

    return $ret;
}
/**
 * A radio field with two options renders two inputs that carry the field
 * name, each with a label whose "for" attribute and text match its option.
 */
public function testRadioInput()
{
    $name = $this->input_default_name;

    // Build the radio field, give it two options and attach it to the form.
    $radio = magic_form_field_radio::factory($name, $this->input_default_label);
    $radio->add_options(array(1 => 'Test Value 1', 2 => 'Test Value 2'));
    $this->magic_form->add_fields($radio);

    // Render the form and parse the markup.
    $markup = $this->magic_form->__toString();
    $document = str_get_html($markup);
    $form = $document->find("//form")[0];

    // Locate both radio inputs by id.
    $radioOne = $form->find("input[id={$name}-test-value-1]")[0];
    $radioTwo = $form->find("input[id={$name}-test-value-2]")[0];

    // Both inputs share the field name.
    $this->assertEquals($name, $radioOne->attr['name'], "Radio 1 Name Equals {$name}");
    $this->assertEquals($name, $radioTwo->attr['name'], "Radio 2 Name Equals {$name}");

    // Label of the first option.
    $labelOne = $form->find("label[for={$name}-test-value-1]")[0];
    $this->assertEquals("{$name}-test-value-1", $labelOne->attr['for'], "Check Label Radio 1");
    $this->assertEquals("Test Value 1", $labelOne->innertext(), "Check Label Text Radio 1");

    // Label of the second option.
    $labelTwo = $form->find("label[for={$name}-test-value-2]")[0];
    $this->assertEquals("{$name}-test-value-2", $labelTwo->attr['for'], "Check Label Radio 2");
    $this->assertEquals("Test Value 2", $labelTwo->innertext(), "Check Label Text Radio 2");
}
/**
 * Simplify the markup of a discourse's content and save it back.
 *
 * - div.pagebreak / div.columnbreak become <span>s containing only a named
 *   anchor and a "pdf-image" link (taken from the "_pdfwin2" anchor).
 * - div.paragraph becomes <p> with its class removed.
 * - div.hyphen becomes <span>.
 *
 * @param array $discourse Record with ['Discourse']['id'] and
 *                         ['Discourse']['content'].
 */
private function clean_discourse($discourse)
{
    App::import('Vendor', 'simple_html_dom');

    $html = str_get_html($discourse['Discourse']['content']);
    if (!$html) {
        // Empty or unparseable content: leave the stored record untouched.
        return;
    }

    // Page breaks and column breaks are rewritten identically.
    foreach (array('div.pagebreak', 'div.columnbreak') as $selector) {
        foreach ($html->find($selector) as $break) {
            $nameAnchor = $break->find('a[name]', 0);
            $imageAnchor = $break->find('a[target="_pdfwin2"]', 0);
            $anchor_name = $nameAnchor ? $nameAnchor->name : '';
            $pdf_image = $imageAnchor ? $imageAnchor->href : '';

            $break->innertext = '<a name="' . $anchor_name . '"></a><a class="pdf-image" href="' . $pdf_image . '"></a>';
            $break->tag = 'span';
        }
    }

    foreach ($html->find('div.paragraph') as $paragraph) {
        $paragraph->tag = "p";
        $paragraph->class = null;
    }

    foreach ($html->find('div.hyphen') as $hyphen) {
        $hyphen->tag = "span";
    }

    // Serialise before saving so the model receives a plain string, then
    // release the parser's node graph.
    $content = $html->save();
    $html->clear();

    $this->Discourse->id = $discourse['Discourse']['id'];
    $this->Discourse->saveField('content', $content);
}
/**
 * Look up a phone number on consultanumero.info and return what the result
 * page says about it.
 *
 * Non-digits are stripped first; numbers shorter than 10 digits are rejected.
 *
 * @param string $num Phone number in any formatting.
 * @return array|false Keys operadora, tipo, portabilidade ("Sim"/"Não"),
 *                     estado and cidade; false when the number is too short
 *                     or the page has no result block.
 */
function dados_telefone($num)
{
    $num = preg_replace("/[^\\d]/", "", $num);
    if (strlen($num) < 10) {
        return false;
    }

    include_once 'src/simple_html_dom.php';

    // POST the number as a urlencoded form.
    $query = http_build_query(array('tel' => $num));
    $headers = "Content-Type: application/x-www-form-urlencoded\r\n"
        . "Content-Length: " . strlen($query) . "\r\n"
        . "User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36\r\n";
    $context = stream_context_create(array(
        'http' => array('header' => $headers, 'method' => "POST", 'content' => $query),
    ));
    $result = file_get_html("http://consultanumero.info/consulta", false, $context);

    $resultado = @$result->find('div[class=resultado]', 0)->children(1)->outertext;
    if (empty($resultado)) {
        return false;
    }

    $resultado = str_get_html($resultado);

    // The operator name is read from the title attribute of an <img>.
    $img = @$result->find('div[class=a]', 0)->children(0)->outertext;
    preg_match('%<img.*?title=["\'](.*?)["\'].*?/>%i', $img, $operadora);

    // Tag-stripped text of the n-th <p>, cut with strrchr() at ' » ' and with
    // the first character of the remainder dropped.
    $campo = function ($indice) use ($resultado) {
        return substr(strrchr(strip_tags($resultado->find('p', $indice)->outertext), ' » '), 1);
    };

    $data = array();
    $data['operadora'] = $operadora[1];
    $data['tipo'] = $campo(0);
    $data['portabilidade'] = strtolower($campo(1)) == 'sim' ? "Sim" : "Não";
    $data['estado'] = str_replace(array('(', ')'), '', $campo(2));

    $cidade = explode(' » ', strip_tags($resultado->find('p', 3)->outertext));
    $data['cidade'] = $cidade[1];

    return $data;
}
/**
 * Parse a page, pass its <body> to extractText() and collect the href of
 * every link in it.
 *
 * Exceptions are logged through Applog and not rethrown; in that case nothing
 * is returned.
 *
 * @param string $pageContent Raw HTML of the page.
 * @return array|null List of href values.
 */
function processPage($pageContent)
{
    try {
        $dom = str_get_html($pageContent);

        // Repair broken HTML when no <body> can be found.
        if (!$dom->find('body', 0, true)) {
            $dom = $this->fixHtml($dom);
        }

        $body = $dom->find('body', 0, true);
        $this->extractText($body);

        $hrefs = array_map(
            function ($anchor) {
                return $anchor->href;
            },
            $body->find("a")
        );

        // Report memory usage around releasing the parser's references.
        $dom->clear();
        MemoryManagement::showUsage('before cleaning', true, 'KB');
        MemoryManagement::clean(true);
        MemoryManagement::showUsage('after cleaning', true, 'KB');

        return $hrefs;
    } catch (Exception $ex) {
        Applog::exceptionLog($ex);
    }
}
/**
 * Remove configured HTML attributes from the published output.
 *
 * Only acts while publishing ($px->is_publish_tool() is true). Every bowl is
 * parsed, each attribute named in $options->attrs is removed from all elements
 * that carry it, and the bowl is replaced with the re-serialised markup.
 * Bowls that cannot be parsed are left untouched.
 *
 * @param object $px Pickles object
 * @param object $options Options; `attrs` is the list of attribute names to remove
 * @return boolean true
 */
public static function exec($px, $options = null)
{
    require_once __DIR__ . '/simple_html_dom.php';

    if (!$px->is_publish_tool()) {
        // Only works while publishing.
        return true;
    }

    // Resolve the attribute list without touching a null $options. A given
    // options object still gets `attrs` normalised to an array, as before.
    $attrs = array();
    if (is_object($options)) {
        if (!isset($options->attrs) || !is_array($options->attrs)) {
            $options->attrs = array();
        }
        $attrs = $options->attrs;
    }

    foreach ($px->bowl()->get_keys() as $key) {
        $src = $px->bowl()->pull($key);

        // Remove the HTML attributes.
        $html = str_get_html($src, true, true, DEFAULT_TARGET_CHARSET, false, DEFAULT_BR_TEXT, DEFAULT_SPAN_TEXT);
        if (!$html) {
            // str_get_html() gave nothing usable; writing that back would
            // wipe the bowl, so keep the original content.
            continue;
        }

        foreach ($attrs as $attr) {
            foreach ($html->find('*[' . $attr . ']') as $retRow) {
                $retRow->{$attr} = null;
            }
        }

        $src = $html->outertext;
        $px->bowl()->replace($src, $key);
    }

    return true;
}
/**
 * Pre-process an HTML document, load it into a DOMDocument, initialise the
 * scoring properties from the <body> node and run extract() on that node.
 *
 * @param string $html
 */
public function exec($html)
{
    mb_language('Japanese');

    // 1. Pre-processing: empty every <script> element.
    // When a script contains a closing tag inside a string literal,
    // DOMDocument treats the script source as #text, so script contents are
    // removed up front. Doing this with a regular expression caused a
    // segmentation fault (stack overflow?), hence simple_html_dom.
    // simple_html_dom's MAX_FILE_SIZE limit was hit, so the library source was
    // edited to three times its default.
    $scriptDom = str_get_html($html);
    foreach ($scriptDom->find('script') as $scriptTag) {
        $scriptTag->innertext = '';
    }
    $cleaned = $scriptDom->outertext;

    // 2. Build the DOM.
    $document = new DomDocument("1.0", "utf-8");
    @$document->loadHTML(mb_convert_encoding($cleaned, 'HTML-ENTITIES', 'UTF-8'));
    $body = $document->getElementsByTagName('body')->item(0);
    $this->preProcessedInput = $body->textContent;

    // 3. Initialise properties.
    $this->domXPath = new DomXPath($document);
    $this->title = @$document->getElementsByTagName('title')->item(0)->textContent;

    $fullText = $this->scan($body);
    $this->textAll = $fullText;
    $this->domCountAll = $this->domCount;
    $this->pancutuationCountAll = $this->calcKutenScore($fullText) + $this->calcTotenScore($fullText);
    $this->textLengthAll = mb_strlen($fullText);
    $this->highScore = -1000000;
    $this->extracedNode = null;

    // 4. Run.
    $this->extract($body);
}
/**
 * Crawl a URL: record its title and plain text in $this->result, then crawl
 * every link found on the page.
 *
 * A URL is skipped when readUrl() rejects it or when it (with or without a
 * trailing slash) is already in $this->cacheurl. Crawling stops once
 * $this->_allcount has been counted down to zero.
 *
 * @param string $url URL to crawl (normalised through readUrl()).
 * @return false|null false when the URL was skipped or could not be parsed.
 */
function parse($url)
{
    $url = $this->readUrl($url);

    // Leftover debugging (var_dump + die) used to abort here; removed so the
    // crawler runs.
    if (!$url
        || !empty($this->cacheurl[$url])
        || !empty($this->cacheurl[preg_replace('#/$#', '', $url)])
    ) {
        return false;
    }

    $this->_allcount--;
    if ($this->_allcount <= 0) {
        return false;
    }

    $this->cacheurl[$url] = true;

    $data = str_get_html(request($url));
    if (!$data) {
        // Empty or unparseable response.
        return false;
    }

    $titleNode = $data->find('title', 0);
    $this->result[] = array(
        'url' => $url,
        'title' => $titleNode ? $titleNode->plaintext : '',
        'text' => $data->plaintext,
    );

    // Copy the hrefs out and release the DOM before recursing, so only one
    // parsed page is held per recursion level.
    $hrefs = array();
    foreach ($data->find('a') as $a) {
        $hrefs[] = $a->href;
    }
    $data->clear();
    unset($data);

    foreach ($hrefs as $href) {
        $this->parse($href);
    }
}
/**
 * Take the buffered output, substitute locale placeholders and echo it.
 *
 * When translation is active, every placeholder matched by LOCALE_TEMPLATE
 * that has an entry in the locale table is replaced. If the request carries an
 * "html_element" selector (POST first, then GET), only the inner HTML of the
 * first matching element is echoed; otherwise the whole content is echoed.
 *
 * @param string $lang Locale passed to getAppLocale().
 */
function translate($lang = 'ru')
{
    $content = ob_get_contents();
    ob_end_clean();

    if (self::$active) {
        self::$locale = self::getAppLocale($lang);
        preg_match_all(LOCALE_TEMPLATE, $content, $matches);
        if (!empty($matches[1])) {
            foreach ($matches[1] as $word) {
                if (array_key_exists($word, self::$locale)) {
                    $content = str_replace(sprintf(LOCALE_TEMPLATE_CHANGE, $word), self::$locale[$word], $content);
                }
            }
        }
    }

    $selector = Request::post('html_element');
    if (!$selector) {
        $selector = Request::get('html_element');
    }

    if ($selector) {
        Load::dependence('simple_html_dom');
        $html = str_get_html($content);
        // An empty buffer does not parse; fall through to echoing it as is.
        $elements = $html ? $html->find($selector) : array();
        if (sizeof($elements) > 0) {
            echo $elements[0]->innertext;
            return;
        }
    }

    echo $content;
}
/**
 * Walk the links in "#navi-products" on the start page and hand each category
 * to parse_pagination().
 *
 * Links containing "produkte-a-bis-z.htm" are skipped. A category page with a
 * "#product-wrapper" is paginated directly; otherwise each
 * ".category-list-entry" link is fetched and paginated.
 */
protected function parse_links()
{
    $this->get(self::URL);
    $page = str_get_html($this->getResponse());

    $navigation = $page->find('#navi-products', 0);

    foreach ($navigation->find('a') as $anchor) {
        $link = trim($anchor->getAttribute('href'));
        if (strpos($link, 'produkte-a-bis-z.htm') !== false) {
            continue;
        }

        $category = $this->get_content($link, 'body');
        $this->position = 1;

        if ($category->find('#product-wrapper')) {
            var_dump('from wroapper: ' . $link);
            $this->parse_pagination($category->find('#site', 0), $link);
            continue;
        }

        $entries = $category->find('.category-list-entry');
        if (!$entries) {
            continue;
        }

        foreach ($entries as $entry) {
            $link_entry = $entry->find('.category-name-number', 0)->find('a', 0)->getAttribute('href');
            var_dump("from entry: " . $link_entry);
            $entry_con = $this->get_content($link_entry);
            $this->parse_pagination($entry_con->find('#site', 0), $link_entry);
        }
    }
    // $this->save_data();
}
/**
 * Scrape one product page and save it to the "products_support" table.
 *
 * The category is the directory part of the product URL (relative to the
 * en-US products root) with known slugs replaced by readable names. The row
 * id is the global $total counter, which is incremented after saving.
 *
 * @param string $product_link Product page URL; an empty value is ignored.
 */
function handle_products($product_link)
{
    global $base_url_host, $base_url_scheme, $total;

    if (empty($product_link)) {
        return;
    }

    // Category path from the URL, then slug -> readable name.
    $relative = str_replace("http://www.thule.com/en-US/US/Products/", "", $product_link);
    $slugPath = dirname($relative);
    $slugs = array("Base-Racks/Feet", "Base-Racks/LoadAccessories", "Base-Racks/LoadBars", "Bike-Carriers/Accessories", "Bike-Carriers/Hitch", "Bike-Carriers/RearDoor", "Bike-Carriers/RoofCarriers", "Bike-Carriers/SpareTire", "Bike-Carriers/TruckBed", "Cargo-Carriers/Bags", "Cargo-Carriers/Baskets", "Cargo-Carriers/Boxes", "Cargo-Carriers/HitchCargo", "Luggage/DaypacksAndMessengers", "Luggage/LaptopAndTablet", "Luggage/LuggageAndDuffels", "Snow-Chains/SnowChains", "Snowsports/Accessories", "Snowsports/HitchSki", "Snowsports/SkiBoxes", "Snowsports/SkiCarriers", "Watersports/Accessories", "Watersports/WatersportCarriers");
    $labels = array("Base Racks/Feet", "Base Racks/Load Accessories", "Base Racks/Load Bars", "Bike Carriers/Accessories", "Bike Carriers/Hitch", "Bike Carriers/Rear Door", "Bike Carriers/Roof Carriers", "Bike Carriers/Spare Tire", "Bike Carriers/Truck Bed", "Cargo Carriers/Bags", "Cargo Carriers/Baskets", "Cargo Carriers/Boxes", "Cargo Carriers/Hitch Cargo", "Luggage/Daypacks And Messengers", "Luggage/Laptop And Tablet", "Luggage/Luggage And Duffels", "Snow Chains/Snow Chains", "Snowsports/Accessories", "Snowsports/Hitch Ski", "Snowsports/Ski Boxes", "Snowsports/Ski Carriers", "Watersports/Accessories", "Watersports/Watersport Carriers");
    $category = str_replace($slugs, $labels, $slugPath);

    $page = str_get_html(scraperwiki::scrape($product_link));

    // Name and description come from the overview column headings.
    $nameMarkup = trim($page->find("div[@class='column details_overview'] h2 span", 0));
    $name = !empty($nameMarkup) ? strip_tags($nameMarkup) : "";

    $descMarkup = trim($page->find("div[@class='column details_overview'] h3 span", 0));
    $desc = !empty($descMarkup) ? strip_tags($descMarkup) : "";

    // Price text with the "MSRP $" prefix and " (USD)" suffix removed.
    $priceMarkup = trim($page->find("div[@class='pricing'] span[@id='phcontent_0_ctl00_lblPriceText']", 0));
    $price = strip_tags($priceMarkup);
    $price = str_replace("MSRP \$", "", $price);
    $price = trim(str_replace(" (USD)", "", $price));

    $image = $page->find("img[@id='imgProductBomImage_0']", 0)->src;

    echo "{$name}: {$image}\n";

    // Build the row.
    $record = array(
        'id' => $total,
        'product_name' => trim($name),
        'desciption' => trim($desc),
        'price' => $price,
        'img' => $image,
        'category' => $category,
    );

    // Store it.
    scraperwiki::save_sqlite(array('id'), array($record), "products_support", 2);

    // Advance the id counter.
    $total++;
}
/**
 * Fetch a category edit page over an existing cURL handle and read the
 * category's fields from the "categories" form.
 *
 * @param resource $curl   cURL handle; its URL is set to $url here.
 * @param string   $url    Page URL, also returned as 'link'.
 * @param array    $catIds Must contain 'cPath' and 'cID'.
 * @return array Keys parent_id, category_id, category_name, category_image
 *               (null when the image src is exactly '/images/'), sort_order,
 *               link.
 */
function parseCategoryData($curl, $url, $catIds)
{
    curl_setopt($curl, CURLOPT_URL, $url);
    $page = str_get_html(curl_exec($curl));

    $form = $page->find('form[name="categories"]', 0);

    /**
     * Table cells of the form:
     * 0 -> Text notice
     * 1 -> Category name
     * 2 -> Category image
     * 3 -> Upload image
     * 4 -> Sort Order
     * 5 -> Buttons
     */
    $cells = $form->find('td');

    // For an underscore-separated cPath, the second segment is used.
    $parent = $catIds['cPath'];
    if (strpos($parent, '_') > 0) {
        $segments = explode('_', $parent);
        $parent = $segments[1];
    }

    $categoryName = $cells[1]->find('input', 0)->value;
    // Parent categories usually have no image assigned.
    $imageSrc = $cells[2]->find('img', 0)->src;
    $sortOrder = $cells[4]->find('input[name="sort_order"]', 0)->value;

    return array(
        'parent_id' => $parent,
        'category_id' => $catIds['cID'],
        'category_name' => $categoryName,
        'category_image' => $imageSrc === '/images/' ? null : $imageSrc,
        'sort_order' => $sortOrder,
        'link' => $url,
    );
}
/**
 * Parse the offers and brands, then walk every category and sub-category on
 * the start page and follow each listing's pagination to its last page.
 */
protected function parse_links()
{
    $this->parse_angelbote();
    $this->parse_marken();

    $this->get(self::URL);
    $page = str_get_html($this->getResponse());

    $categoryBox = $page->find('.kategorien', 0);

    foreach ($categoryBox->find('a') as $categoryAnchor) {
        $kats = $categoryAnchor->getAttribute('href');
        $this->position = 1;

        $group = $this->get_content($kats, '.kat_gruppe');

        foreach ($group->find('a') as $subAnchor) {
            $kats_1 = $subAnchor->getAttribute('href');
            $this->position = 1;

            $con = $this->parse_pagination($kats_1, $kats_1);

            // Follow the "next" link until the pager or the link is gone.
            while (true) {
                $pager = $con->find('.paging', 0);
                if (!$pager) {
                    break;
                }

                $rightText = $pager->find('.paginierung', 0)->find('.text_rechts', 0);
                if (!$rightText) {
                    break;
                }

                $next = $rightText->find('span', 0)->find('a', 0);
                if (!$next) {
                    break;
                }

                $con = $this->parse_pagination($next->getAttribute('href'), $kats_1);
            }
        }
    }
    // $this->save_data();
}
/**
 * Fetch the page a URL points to and parse it with simple_html_dom.
 *
 * Echoes the cURL error (if any) and a line with the fetched URL and the
 * elapsed time.
 *
 * @author hani <[email]>
 * @param string     $url    URL to fetch.
 * @param array|null $header Request header lines; a built-in browser-like set
 *                           is used when null.
 * @return mixed Result of str_get_html() for the response body, or false when
 *               the request failed.
 */
function url2html($url = '', $header = null)
{
    import("Org.Net.simple_html_dom");

    $timeout = 15;

    // Build the default request headers.
    // NOTE(review): the default set carries a hard-coded session cookie -
    // confirm it is still wanted.
    if (!isset($header)) {
        $header = array(
            "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0",
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language: zh,zh-cn;q=0.8,en-us;q=0.5,en;q=0.3",
            "Cookie: Hm_lvt_aedd3972ca50f4fd67b4d7e025fa000c=1421985654,1422084096,1422084097,1422176563; bdshare_firstime=1421560690892; PHPSESSID=hu43skm3rnkof8qdvdngqmpnq7; Hm_lpvt_aedd3972ca50f4fd67b4d7e025fa000c=1422176873; sso_back_url=%7B%220%22%3A%22index%5C%2Fintern%22%2C%22id%22%3A8395%7D",
        );
    }

    // 1. Initialise.
    $ch = curl_init();

    // 2. Set options.
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

    // 3. Execute and fetch the HTML document.
    $output = curl_exec($ch);
    if ($output === false) {
        echo "curl error: " . curl_error($ch);
    }

    $info = curl_getinfo($ch);
    echo '获取' . $info['url'] . '耗时' . $info['total_time'] . '秒';

    // 4. Release the cURL handle.
    curl_close($ch);

    if ($output === false) {
        return false;
    }

    return str_get_html($output);
}
/**
 * Fetch a mobile vivanews article and emit it through the "rafa" library.
 *
 * The title is read from span.judul. The second <p> of the content supplies
 * the name (its <strong>) and the start of the description; later paragraphs
 * are appended as plain text.
 *
 * @param mixed $id Article id appended to the read URL.
 */
public function news($id)
{
    $this->load->library('curl');
    $this->curl->referer('m.vivanews.com');
    $this->curl->userAgent('midp 2.0');
    $html = $this->curl->openGet("http://m.vivanews.com/news/read/" . $id);

    // Normalize web information.
    // NOTE(review): both replacements were no-ops as written; the entity
    // forms below are the presumed intent - confirm.
    $html = str_replace('&quot;', '"', $html);
    $html = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $html);

    $this->load->helper('dom');
    $dom = str_get_html($html);
    $content = $dom->find('div[class=content]', 0);

    $title = trim($content->find('span[class=judul]', 0)->innertext);

    // Defaults for articles with fewer than two paragraphs.
    $name = '';
    $description = '';
    $i = 1;
    foreach ($content->find('p') as $p) {
        if ($i == 2) {
            $strong = $p->find('strong', 0);
            if ($strong) {
                $name = trim($strong->innertext);
                // Drop the <strong> so it is not repeated in the description.
                $strong->outertext = '';
            }
            $description .= trim($p->innertext);
        } elseif ($i > 2) {
            $description .= trim($p->plaintext);
        }
        $i++;
    }

    $elements = array(
        array(
            'name' => normalize_html($name),
            'description' => normalize_html($description),
        ),
    );

    $this->load->library('rafa');
    $this->rafa->addHeading('vivanews');
    $this->rafa->addList('news', $elements, $title);
    $this->rafa->endRafa();
}
/**
 * POST data to a URL with cURL and parse the response with simple_html_dom.
 *
 * Cookies are kept in ./_cookies.txt. A cURL error is echoed; the response is
 * passed to str_get_html() either way.
 *
 * @param string       $url    Target URL.
 * @param array|string $data   POST fields.
 * @param array        $header HTTP header lines.
 * @return mixed Result of str_get_html() for the response.
 */
function SendRequest($url, $data, $header)
{
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_POST => 1,
        CURLOPT_SSL_CIPHER_LIST => 'SSLv3',
        CURLOPT_SSL_VERIFYPEER => FALSE, // do not verify the SSL certificate
        CURLOPT_FOLLOWLOCATION => TRUE,
        CURLOPT_COOKIESESSION => TRUE, // start a new session and overwrite cookies
        CURLOPT_COOKIEJAR => "./_cookies.txt",
        CURLOPT_COOKIEFILE => "./_cookies.txt",
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_POSTFIELDS => $data,
        CURLOPT_HTTPHEADER => $header,
    );

    $handle = curl_init();
    // Apply options one by one, in the order listed above.
    foreach ($options as $option => $setting) {
        curl_setopt($handle, $option, $setting);
    }

    $response = curl_exec($handle);
    if (curl_errno($handle)) {
        echo 'Ошибка curl: ' . curl_error($handle);
    }

    return str_get_html($response);
}
/**
 * Scrape price, market time, change and percent change for a symbol from a
 * quote page.
 *
 * @param string $url    Quote page URL (downloaded with curl_download()).
 * @param string $string Symbol suffix used in the element ids
 *                       (yfs_l10_<s>, yfs_c10_<s>, yfs_p20_<s>).
 * @return array array($price, $date, $change, $percentChange).
 */
function getQuote($url, $string)
{
    require_once 'simple_html_dom.php';

    $page = str_get_html(curl_download($url));

    // Restrict the lookups to the quote summary block.
    $summaryBlocks = $page->find('.yfi_rt_quote_summary_rt_top');
    $summary = $summaryBlocks[0];

    // Locate each figure separately.
    $priceNodes = $summary->find("[id^=yfs_l10_{$string}]");
    $changeNodes = $summary->find("[id^=yfs_c10_{$string}]");
    $percentNodes = $summary->find("[id^=yfs_p20_{$string}]");
    $timeNodes = $page->find('[id=yfs_market_time]');

    $price = $priceNodes[0]->innertext;
    $percentChange = $percentNodes[0]->innertext;

    // Drop everything up to the first '>' (the up/down image markup).
    $change = preg_replace('/^.*?>\\s*/', '', $changeNodes[0]->innertext);

    // Cut whatever follows "EST" in the market time.
    $date = preg_replace('/EST.*/', 'EST', $timeNodes[0]->innertext);

    return array($price, $date, $change, $percentChange);
}
/**
 * Render the "notes" help tab: a nested table of contents built from the
 * h4-h6 headings of the readme's remaining content, a rule, then the content.
 *
 * The method name follows the pattern do_ + page slug + _ + tab slug. The
 * first two <h3> headings are blanked, and every h4-h6 gets an id (its text
 * with spaces replaced by underscores) for the TOC links to target.
 */
public function do_aal_help_notes()
{
    include_once AmazonAutoLinks_Commons::$strPluginDirPath . '/include/library/simple_html_dom.php';

    $_oHTML = str_get_html($this->arrWPReadMe['remaining_content']);
    if (!$_oHTML) {
        // Nothing to render when the readme content is empty or unparseable.
        return;
    }

    // Blank the first two <h3> headings, each only if present.
    foreach (array(0, 1) as $_iIndex) {
        $_oH3 = $_oHTML->find('h3', $_iIndex);
        if (is_object($_oH3)) {
            $_oH3->outertext = '';
        }
    }

    $_sTOC = '';
    $_iLastLevel = 0;
    foreach ($_oHTML->find('h4,h5,h6') as $_oHTag) {
        $_sInnerTEXT = trim($_oHTag->innertext);
        $_sID = str_replace(' ', '_', $_sInnerTEXT);
        $_oHTag->id = $_sID; // add an id attribute so the TOC can jump here

        // The heading level is the digit in the tag name (h4 -> 4).
        $_iLevel = intval($_oHTag->tag[1]);
        if ($_iLevel > $_iLastLevel) {
            $_sTOC .= "<ol>";
        } else {
            // Close the lists of any deeper levels, then the previous item.
            $_sTOC .= str_repeat('</li></ol>', $_iLastLevel - $_iLevel);
            $_sTOC .= '</li>';
        }
        $_sTOC .= "<li><a href='#{$_sID}'>{$_sInnerTEXT}</a>";
        $_iLastLevel = $_iLevel;
    }
    $_sTOC .= str_repeat('</li></ol>', $_iLastLevel);

    echo $_sTOC . "<hr />" . $_oHTML->save();
}
/**
 * Submit the Simples Nacional access form and return the response page.
 *
 * If the response contains the captcha error label, an exception with a
 * user-facing message is thrown. Otherwise the response body is returned with
 * every "./" replaced by the absolute application base URL.
 *
 * @param string $cnpj            Posted as txtCNPJ.
 * @param string $cpf             Posted as txtCPFResponsavel.
 * @param string $codigo          Posted as txtCodigoAcesso.
 * @param string $captcha         Posted as txtTexto_captcha.
 * @param string $stringCookie    Sent verbatim as the Cookie header.
 * @param string $viewState       Posted as __VIEWSTATE.
 * @param string $eventValidation Posted as __EVENTVALIDATION.
 * @return string Response HTML with "./" rewritten.
 * @throws \Exception Code 99 when the page reports an error.
 */
public function consulta($cnpj, $cpf, $codigo, $captcha, $stringCookie, $viewState, $eventValidation)
{
    $requisicao = new Client(['cookies' => true]);

    $param = [
        'form_params' => [
            '__VIEWSTATE' => $viewState,
            '__EVENTVALIDATION' => $eventValidation,
            'ctl00$txtBusca' => '',
            'ctl00$ContentPlaceHolder$txtCNPJ' => $cnpj,
            'ctl00$ContentPlaceHolder$txtCPFResponsavel' => $cpf,
            'ctl00$ContentPlaceHolder$txtCodigoAcesso' => $codigo,
            'txtTexto_captcha' => $captcha,
            'hdn_client_id' => '00000000000000000000000000000000',
            'ctl00$ContentPlaceHolder$btContinuar' => 'Continuar',
        ],
        'headers' => [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate',
            'Accept-Language' => 'pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4',
            'Cache-Control' => 'max-age=0',
            'Connection' => 'keep-alive',
            'Content-type' => 'application/x-www-form-urlencoded',
            'Cookie' => $stringCookie,
            'Host' => 'www8.receita.fazenda.gov.br',
            'Origin' => 'http://www8.receita.fazenda.gov.br',
            'Referer' => 'http://www8.receita.fazenda.gov.br/SimplesNacional/controleAcesso/Autentica.aspx?id=6',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        ],
        'timeout' => 20,
        'connect_timeout' => 20,
    ];

    $resposta = $requisicao->post('http://www8.receita.fazenda.gov.br/SimplesNacional/controleAcesso/Autentica.aspx?id=6', $param);

    // Read the body once; it is needed for both the error check and the result.
    $corpo = (string) $resposta->getBody();

    require_once 'simple_html_dom.php';
    $html = str_get_html($corpo);

    // An empty body does not parse; there is then no error label to inspect.
    $erros = $html ? $html->getElementById('ctl00_ContentPlaceHolderConteudo_lblErroCaptcha') : null;
    if (is_object($erros)) {
        $mensagemErro = 'Ocorreu algum erro, tente novamente mais tarde';
        switch (trim($erros->plaintext)) {
            case 'Caracteres anti-robô inválidos. Tente novamente.':
                $mensagemErro = 'Erro ao consultar. Verifique se digitou corretamente o captcha.';
                break;
            case 'O CNPJ informado deve conter 14 dígitos.':
                $mensagemErro = 'Erro ao consultar. CNPJ deve conter 14 dígitos.';
                break;
            case 'O CNPJ digitado é inválido.':
                $mensagemErro = 'Erro ao consultar. CNPJ inválido.';
                break;
        }
        throw new \Exception($mensagemErro, 99);
    }

    // Make relative "./" references absolute.
    return str_replace('./', 'http://www8.receita.fazenda.gov.br/SimplesNacional/Aplicacoes/ATSPO/pgdasd.app/', $corpo);
}
/**
 * Scrape item numbers from a product listing and queue them in asins_table
 * for the logged-in user, up to a maximum count.
 *
 * Each product link in the listing is fetched, the first run of digits in its
 * span#itmNum is taken as the item number, and one row with provider
 * 'Overstock' is inserted per number. Scraping stops as soon as the limit is
 * reached.
 *
 * @param string $url    Listing URL (fetched with postForm()).
 * @param int    $number Maximum number of rows to insert; nothing is fetched
 *                       when it is 0 or empty.
 * @return bool true, or false when the listing page could not be parsed.
 */
function scrape_items($url, $number)
{
    // The user id goes into SQL below; force it to an integer.
    $active_user = (int) $_SESSION['user_id'];

    $max = $number ? (int) $number : 0;
    if ($max <= 0) {
        // Nothing may be inserted, so there is no point fetching any page.
        return true;
    }

    $html = str_get_html(postForm($url));
    if (!$html) {
        return false;
    }

    $j = 0;
    foreach ($html->find('ul[id=result-products]') as $bloc) {
        foreach ($bloc->find('li[class=product] a[class=pro-thumb]') as $item) {
            $htmlitem = str_get_html(postForm($item->href));
            if (!$htmlitem) {
                continue;
            }

            foreach ($htmlitem->find('span[id=itmNum]') as $node) {
                // The pattern only matches digits, so the value is safe to
                // embed in the statement.
                if (!preg_match('!\\d+!', $node->plaintext, $found)) {
                    continue;
                }
                $itemnumber = $found[0];

                $sql = "INSERT INTO asins_table(asins,UserID,processed,provider) values('" . $itemnumber . "'," . $active_user . ",0,'Overstock')";
                mysql_query($sql) or die(mysql_error());
                $j++;

                if ($j >= $max) {
                    // Limit reached: stop fetching further product pages.
                    $htmlitem->clear();
                    $html->clear();
                    return true;
                }
            }

            // Release each product page once it has been processed.
            $htmlitem->clear();
        }
    }

    $html->clear();

    return true;
}
/**
 * Post-process task text: turn <br> inside <pre> into newlines and run
 * GeSHi highlighting on <pre class="highlight" lang="..."> blocks.
 *
 * Blocks whose lang has an entry in the 'geshi_highlighting_map' config item
 * are highlighted; for the others the class is removed. The lang attribute is
 * removed in both cases.
 *
 * @param string $string HTML text.
 * @return string Processed HTML; the input unchanged when it cannot be parsed.
 */
function task_modifier_scan_text($string)
{
    $CI =& get_instance();
    $CI->config->load('geshi');
    $highlight_map = $CI->config->item('geshi_highlighting_map');

    $html = str_get_html($string, true, true, DEFAULT_TARGET_CHARSET, false);
    if (!$html) {
        // Empty (or oversized) input does not parse; return it as is.
        return $string;
    }

    foreach ($html->find('pre br') as $br) {
        $br->outertext = "\n";
    }

    foreach ($html->find('pre.highlight') as $highlighted_code) {
        $lang = $highlighted_code->lang;

        if (isset($highlight_map[$lang])) {
            // Plain source text: strip tags, decode special chars, and turn
            // non-breaking spaces (entity or raw UTF-8) into ordinary spaces.
            // NOTE(review): the replacement was a no-op as written; '&nbsp;'
            // is the presumed intent - confirm.
            $content = str_replace(
                array('&nbsp;', "\xC2\xA0"),
                ' ',
                htmlspecialchars_decode(strip_tags($highlighted_code->innertext), ENT_HTML5 | ENT_QUOTES)
            );

            $geshi = new GeSHi($content, $highlight_map[$lang]);
            $geshi->set_header_type(GESHI_HEADER_PRE_VALID);
            $geshi->enable_line_numbers(GESHI_NO_LINE_NUMBERS);
            $highlighted_code->innertext = $geshi->parse_code();
        } else {
            $highlighted_code->class = null;
        }

        $highlighted_code->lang = null;
    }

    // Serialise directly instead of echoing into an output buffer.
    return $html->save();
}
/**
 * Collect wall posts from the page given in $param['u'] into $this->items.
 *
 * The page is converted from windows-1251 to UTF-8 before parsing. For each
 * div.post_table an Item is built with the post text (plus external link and
 * video, when present), the post URI and its relative date.
 *
 * @param array $param Request parameters; 'u' is the url-encoded page URL.
 */
public function collectData(array $param)
{
    if (!isset($param['u'])) {
        // Without a URL there is nothing to parse.
        return;
    }

    $this->request = $param['u'];
    $text_html = file_get_contents(urldecode($this->request)) or $this->returnError('No results for this query.', 404);
    $text_html = iconv('windows-1251', 'utf-8', $text_html);

    $html = str_get_html($text_html);
    if (!$html) {
        return;
    }

    foreach ($html->find('div.post_table') as $post) {
        // Delete the "show full" link from the content.
        $more = $post->find('a.wall_post_more', 0);
        if (is_object($more)) {
            $more->outertext = '';
        }

        $item = new \Item();
        $item->content = strip_tags($post->find('div.wall_post_text', 0)->innertext);

        // External link in the post.
        $media = $post->find('a.page_media_link_title', 0);
        if (is_object($media)) {
            $link = $media->getAttribute('href');
            $item->content .= "\n\rExternal link: " . str_replace('/away.php?to=', '', urldecode($link));
        }

        // Video attached to the post.
        $video = $post->find('span.post_video_title_content', 0);
        if (is_object($video)) {
            $titleVideo = $video->plaintext;
            $linkToVideo = 'https://vk.com' . $post->find('a.page_post_thumb_video', 0)->getAttribute('href');
            $item->content .= "\n\r {$titleVideo}: {$linkToVideo}";
        }

        // Link to the post itself.
        $item->uri = 'https://vk.com' . $post->find('.reply_link_wrap', 0)->find('a', 0)->getAttribute('href');
        $item->date = $post->find('span.rel_date', 0)->plaintext;

        $this->items[] = $item;
    }
}
/**
 * Rewrite navigation links in the given HTML files and register editable
 * navigation texts in $this->data.
 *
 * ".auto-nav" elements with an "autocms" attribute have their text recorded
 * under a key derived from that attribute and replaced by a get() template
 * call. Their href, and that of every ".auto-nav-internal" element, is
 * rewritten to a directory-style path, and the marker class is removed. Each
 * file is overwritten with the result.
 *
 * @param array $files File paths relative to the parent directory.
 */
function buildDataFile($files)
{
    $pageSuffixes = array('index.html', 'index.htm', '.html', '.htm');

    // Rewrites the href to a directory-style path and strips the marker class,
    // dropping the class attribute entirely when nothing else is left.
    $rewriteLink = function ($node, $marker) use ($pageSuffixes) {
        $node->href = str_replace($pageSuffixes, '/', '/' . $node->href);
        $node->href = str_replace('//', '/', $node->href);
        $node->class = str_replace($marker, '', $node->class);
        if (trim($node->class) === '') {
            $node->class = null;
        }
    };

    foreach ($files as $file) {
        $path = '../' . $file;
        $dom = str_get_html(file_get_contents($path, true));

        foreach ($dom->find('.auto-nav') as $nav) {
            if (!isset($nav->autocms)) {
                continue;
            }

            // Data key: the autocms description with disallowed chars removed.
            $key = preg_replace("/[^a-z^A-Z^0-9_-]/", "", $nav->autocms);
            $this->data[$key] = array(
                'text' => $nav->innertext,
                'description' => $nav->autocms,
                'type' => 'text',
            );
            $nav->innertext = "<?=get('{$this->dataFile}', '{$key}')?>";

            $rewriteLink($nav, 'auto-nav');
        }

        foreach ($dom->find('.auto-nav-internal') as $nav) {
            $rewriteLink($nav, 'auto-nav-internal');
        }

        $handle = fopen($path, 'w');
        fwrite($handle, $dom);
        fclose($handle);
    }
}
/**
 * Fetch a course listing that needs session cookies from two earlier pages.
 *
 * Three requests are made in sequence: the main URL (cookies saved), the term
 * link (previous cookies sent, new ones saved), and finally $link (term
 * cookies sent). The last response is parsed with simple_html_dom.
 *
 * @param string $link Course listing URL.
 * @param array  $term Must contain 'termlink'.
 * @return mixed Result of str_get_html() for the listing response.
 */
function scrapeCourses($link, $term)
{
    $ckfile = tempnam("/tmp", "CURLCOOKIE");
    $ckfile2 = tempnam("/tmp", "CURLCOOKIE2");

    // One request: optionally send cookies from $cookieIn and store the
    // session's cookies in $cookieOut.
    $fetch = function ($url, $cookieIn, $cookieOut) {
        $ch = curl_init();
        if ($cookieIn !== null) {
            curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieIn);
        }
        if ($cookieOut !== null) {
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieOut);
        }
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        $body = curl_exec($ch);
        curl_close($ch);

        return $body;
    };

    $fetch($this->mainURL, null, $ckfile);
    $fetch($term['termlink'], $ckfile, $ckfile2);
    $data = $fetch($link, $ckfile2, null);

    // The cookie files are only needed for this chain; remove them so they do
    // not pile up in /tmp.
    foreach (array($ckfile, $ckfile2) as $cookieFile) {
        if ($cookieFile !== false && file_exists($cookieFile)) {
            unlink($cookieFile);
        }
    }

    return str_get_html($data);
}