/**
 * Scrape a single club page and store one record per season row.
 *
 * The club name is read from the third row of the first table; season data is
 * read from every row of the third table whose first cell is numeric (a year).
 * Each record is saved through scraperwiki::save() keyed on (club, year), and
 * the club name is appended to the global `clubs` list.
 *
 * @param string $url Address of the club page to scrape.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // NOTE(review): the search string is reproduced exactly as found in the
    // file; confirm whether a non-breaking space / &nbsp; was intended.
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));

    // Fix: `$_GLOBAL` is not a PHP superglobal, so the previous code only
    // filled a local array that was discarded on return. `$GLOBALS` is the
    // real global symbol table.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    foreach ($dom->find('table', 2)->find('tr') as $row) {
        // Only rows whose first cell is a year carry season data.
        if (is_numeric($row->find('td', 0)->plaintext)) {
            $year = trim($row->find('td', 0)->plaintext);
            $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
            if (trim($position) == 'Champion') {
                $position = 1;
            }
            $leagueLevel = trim($row->find('td', 2)->plaintext);
            $overallPosition = trim($row->find('td', 3)->plaintext);
            // Dots are stripped from the attendance figures before storing.
            $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
            $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));

            $dataset = array(
                'club' => $formatClubName,
                'year' => $year,
                'finishedPosition' => $position,
                'league' => $leagueLevel,
                'overallPosition' => $overallPosition,
                'avgAttendance' => $avgAttendance,
                'totalAttendance' => $totalAttendance,
            );
            scraperwiki::save(array('club', 'year'), $dataset);
        }
    }

    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
/**
 * Convert an HTML fragment into a Word 2007 (.docx) file.
 *
 * The file is written to `<$dir>wordtemp/<unix-timestamp>.docx`; the folder is
 * created through $this->createFolders() when it does not exist yet.
 *
 * @param string $html HTML fragment to convert (without <html>/<body> wrapper).
 * @param string $dir  Base directory; concatenated as-is with "wordtemp/".
 * @return string Path of the generated .docx file.
 */
public function save($html, $dir)
{
    import("@.ORG.htmltodocx.documentation.support_functions");
    $phpword_object = new PHPWord();
    $section = $phpword_object->createSection();

    // HTML DOM object. The fragment is nested in dummy <html><body> elements
    // so that the children of <html> can be looked up below.
    $html_dom = new simple_html_dom();
    $html_dom->load('<html><body>' . $html . '</body></html>');

    // The DOM elements that are going to be converted.
    $html_dom_array = $html_dom->find('html', 0)->children();

    // base_root / base_path for the initial state come from the helper in
    // documentation/support_functions.inc.
    $paths = htmltodocx_paths();

    // Initial converter settings.
    $initial_state = array(
        'phpword_object' => &$phpword_object,
        'base_root' => $paths['base_root'],
        'base_path' => $paths['base_path'],
        'current_style' => array('size' => '11'),
        'parents' => array(0 => 'body'),
        'list_depth' => 0,
        'context' => 'section',
        'pseudo_list' => TRUE,
        'pseudo_list_indicator_font_name' => 'Wingdings',
        'pseudo_list_indicator_font_size' => '7',
        'pseudo_list_indicator_character' => 'l ',
        'table_allowed' => TRUE,
        'treat_div_as_paragraph' => TRUE,
        'style_sheet' => htmltodocx_styles_example(),
    );

    // Convert the HTML and put it into the PHPWord object.
    htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);

    // Release the DOM (Simple HTML DOM keeps circular references).
    $html_dom->clear();
    unset($html_dom);

    // Fix: the previous code ran explode() on $h2d_file_uri before it was
    // defined and never used the result; that dead statement is removed.
    $h2d_file_uri = $dir . "wordtemp/" . time() . ".docx";
    if (!file_exists($dir . "wordtemp/")) {
        // Make sure the target folder exists before writing.
        $this->createFolders($dir . "wordtemp/");
    }

    $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
    $objWriter->save($h2d_file_uri);

    return $h2d_file_uri;
}
/**
 * Parse the form's print template, rebuild its <ic> items and persist the result.
 *
 * On update the `data-id` attributes are stripped from the template and item
 * numbering restarts at 0; otherwise the stored template and item counter are
 * used. The collected elements are cached under "form_<ID>" and the rendered
 * template (plus the item counter, when it differs) is written back through
 * FlowFormType.
 *
 * @param bool $isUpdate Whether the template is being re-parsed for an update.
 * @return void
 */
public function parse($isUpdate = false)
{
    Ibos::import("application.extensions.simple_html_dom", true);

    if (!$isUpdate) {
        $template = $this->printmodel;
        $counter = intval($this->itemmax);
    } else {
        $template = preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel);
        $counter = 0;
    }

    $collected = array();
    $dom = new simple_html_dom();
    $dom->load($template, true, true, CHARSET);

    $controls = $dom->find("ic");
    $itemConfig = $this->getItemConfig();
    if (!(empty($controls) || empty($itemConfig))) {
        $this->refactor($controls, $itemConfig, $counter, $collected);
    }

    $rendered = $dom->save();

    $this->_cache = $collected;
    CacheUtil::set("form_" . $this->ID, $collected);

    $changes["printmodelshort"] = $rendered;
    if ($counter != $this->itemmax) {
        $changes["itemmax"] = $counter;
    }

    $dom->clear();
    FlowFormType::model()->modify($this->ID, $changes);
}
/**
 * Build a simple_html_dom object from an HTML string.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to the parser constructor and load().
 * @param bool   $forceTagsClosed Passed to the parser constructor.
 * @param string $target_charset  Passed to the parser constructor.
 * @param bool   $stripRN         Passed to the parser constructor and load().
 * @param string $defaultBRText   Passed to the parser constructor.
 * @param string $defaultSpanText Passed to the parser constructor.
 * @return simple_html_dom|false  The loaded DOM, or false when $str is empty()
 *                                or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $parser = new simple_html_dom(
        null,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText
    );

    // Accept only non-empty input that fits within the size limit.
    $isLoadable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if (!$isLoadable) {
        $parser->clear();

        return false;
    }

    $parser->load($str, $lowercase, $stripRN);

    return $parser;
}
/**
 * Render an HTML document into a Word 2007 file at $file_path.
 *
 * @param string $html                    Full HTML document; the children of
 *                                        its <html> element are converted.
 * @param string $file_path               Destination path of the .docx file.
 * @param array  $file_takeout_tmp_files  Handed to the converter by reference
 *                                        as 'download_img_tmp'.
 * @return void
 */
function generate_docx($html, $file_path, &$file_takeout_tmp_files)
{
    $phpword_object = new PHPWord();
    $docSection = $phpword_object->createSection();

    $dom = new simple_html_dom();
    $dom->load($html);
    $rootChildren = $dom->find('html', 0)->children();

    $basePaths = htmltodocx_paths();

    // Converter state; the PHPWord object and the temp-file list are shared by
    // reference so the converter can modify them.
    $state = array(
        'phpword_object' => &$phpword_object,
        'base_root' => $basePaths['base_root'],
        'base_path' => $basePaths['base_path'],
        'current_style' => array('size' => '11'),
        'parents' => array(0 => 'body'),
        'list_depth' => 0,
        'context' => 'section',
        'pseudo_list' => TRUE,
        'pseudo_list_indicator_font_name' => 'Wingdings',
        'pseudo_list_indicator_font_size' => '7',
        'pseudo_list_indicator_character' => 'l ',
        'table_allowed' => TRUE,
        'treat_div_as_paragraph' => FALSE,
        'style_sheet' => htmltodocx_styles(),
        'download_img_path' => elgg_get_data_path(),
        'download_img_tmp' => &$file_takeout_tmp_files,
    );

    htmltodocx_insert_html($docSection, $rootChildren[0]->nodes, $state);

    // Free the parsed DOM before writing the document.
    $dom->clear();
    unset($dom);

    // 'Word2007' is the only writer format used here.
    $writer = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
    $writer->save($file_path);
}
/**
 * Return the given HTML with the markup of every top-level child element removed.
 *
 * The fragment is wrapped in a <crawler> element, parsed, and each direct
 * child's string form is replaced by an empty string in the wrapper's inner
 * HTML, leaving only what sits between the child elements.
 *
 * @param string $a_sHTML HTML fragment (taken by reference, not modified here).
 * @return string The fragment without its top-level child elements.
 */
private function clean_children(&$a_sHTML)
{
    $dom = new simple_html_dom();
    $dom->load('<crawler>' . $a_sHTML . '</crawler>');

    $wrapper = $dom->find('crawler', 0);
    $remaining = (string) $wrapper->innertext;

    // Each child node is used as the search string (its string conversion).
    foreach ($wrapper->children() as $childNode) {
        $remaining = str_replace($childNode, '', $remaining);
    }

    $dom->clear();
    unset($wrapper, $dom);

    return $remaining;
}
/**
 * Append a sub-menu of in-page anchors to every "?page_id=N" link in $foo.
 *
 * For each such link the referenced page is loaded with get_page(); every
 * <h1> carrying an id in its content becomes an <li> pointing at
 * "<blog url>?page_id=N#<id>". The resulting <ul class='submenu'> is appended
 * right after the link.
 *
 * @param string $foo HTML fragment to process.
 * @return string The processed markup, wrapped in <html><body>…</body></html>.
 */
function add_h_filter($foo)
{
    $source_html = $foo;
    $p = new simple_html_dom();
    // Fix: the wrapper is now closed with </body></html>; it used to be
    // "closed" by a second '<html><body>', which leaked into the output.
    $p->load('<html><body>' . $source_html . '</body></html>');

    foreach ($p->find("a") as $elm) {
        error_log($elm->href);

        $match = array();
        if (!preg_match("/\\?page_id\\=(\\d+?)\$/", $elm->href, $match)) {
            continue;
        }

        $page_id = $match[1];
        $page = get_page($page_id);
        if (!$page) {
            // Link points at a page that cannot be loaded: nothing to add.
            continue;
        }

        $p2 = new simple_html_dom();
        $p2->load('<html><body>' . $page->post_content . '</body></html>');

        // Collect (id, text) for every <h1> that carries an id.
        $submenu_array = array();
        foreach ($p2->find('h1[id]') as $idh1) {
            error_log($idh1->id);
            $submenu_array[] = array($idh1->id, $idh1->plaintext);
        }

        if (sizeof($submenu_array) !== 0) {
            $submenu = "<ul class='submenu'>\n";
            $blogurl = get_bloginfo('url');
            foreach ($submenu_array as $sub) {
                // Fix: items were closed with </h1>; they are list items.
                $submenu .= '<li><a href="' . $blogurl . '?page_id=' . $page_id . '#' . $sub[0] . '">' . $sub[1] . '</a></li>' . "\n";
            }
            $submenu .= '</ul>';
            $elm->outertext = $elm->outertext . $submenu;
        }

        $p2->clear();
        unset($p2);
    }

    $foo = $p->outertext;
    $p->clear();
    unset($p);

    return $foo;
}
/**
 * Insert a share button into the "myApiShare" table identified by $position.
 *
 * The table (class $position) is created when missing: prepended for
 * 'myApiShareTop', appended otherwise. The button becomes the first cell of
 * the table's '.myApiButtons' row, which is created when missing. Finally,
 * when the bottom table has a '.myApiCommentsCell', its colspan is set to the
 * number of <td> elements found in the bottom table's button row.
 *
 * @param string $text     HTML to modify.
 * @param string $position Class name of the target table.
 * @param string $button   HTML of the button to add.
 * @return string The modified HTML.
 */
function addToTable($text, $position, $button)
{
    $selector = '.' . $position;
    $dom = new simple_html_dom();
    $dom->load($text);

    // Step 1: make sure the target table exists.
    if (!$dom->find($selector, 0)) {
        $emptyTable = '<table class="' . $position . ' myApiShareTable"></table>';
        if ($position == 'myApiShareTop') {
            $text = $emptyTable . $text;
        } else {
            $text = $text . $emptyTable;
        }
        $dom->load($text);
    }
    $text = $dom->save();
    $dom->load($text);

    // Step 2: put the button in front of the button row, creating the row if needed.
    $buttonsRow = $dom->find($selector, 0)->find('.myApiButtons', 0);
    if ($buttonsRow) {
        $innerRow = $buttonsRow->find('table', 0)->find('tr', 0);
        $innerRow->innertext = '<td>' . $button . '</td>' . $innerRow->innertext;
    } else {
        $newRow = '<tr class="myApiButtons"><td><table><tr><td>' . $button . '</td></tr></table></td></tr>';
        $table = $dom->find($selector, 0);
        $table->innertext = $newRow . $table->innertext;
    }
    $text = $dom->save();
    $dom->load($text);

    // Step 3: stretch the comments cell across the bottom table's buttons.
    $bottomTable = $dom->find('.myApiShareBottom', 0);
    $commentsCell = $bottomTable ? $bottomTable->find('.myApiCommentsCell', 0) : null;
    $bottomButtons = $commentsCell ? $bottomTable->find('.myApiButtons', 0) : null;
    if ($bottomButtons) {
        $commentsCell->colspan = sizeof($bottomButtons->find('td'));
        $text = $dom->save();
    }

    $dom->clear();
    unset($dom);

    return $text;
}
/**
 * Turn a crawled article page into a Model_Horoscope_XungHop record.
 *
 * Extracts the post date and the article body from the page, strips comments,
 * links and author/source paragraphs, saves the story, and — when both signs
 * are known — downloads the thumbnail and every inline image into
 * assets/horoscope/xung-hop/<alias>/ and rewrites the image sources.
 *
 * @param string $_HTML_CONTENT Raw HTML of the article page.
 * @param string $_Cung1        First sign; '-' marks it as unknown.
 * @param string $_Cung2        Second sign; '-' marks it as unknown.
 * @param string $_Summary      Summary text stored with the story.
 * @param string $_SourceUri    Source address of the article.
 * @param mixed  $_LinkId       Identifier passed to UpdateRecordStatus().
 * @param string $_ImageLink    Address of the thumbnail image.
 * @return false|null false when there is nothing to process or the content
 *                    block is missing; null otherwise.
 */
public static function _Process_Recieved_Content($_HTML_CONTENT, $_Cung1, $_Cung2, $_Summary, $_SourceUri, $_LinkId, $_ImageLink)
{
    if ($_HTML_CONTENT == '') {
        self::_print_to_console('-> nothing to do');
        return false;
    }

    // Create a DOM object and load the HTML from the string.
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($_HTML_CONTENT);
    unset($_HTML_CONTENT);
    if (!$html) {
        self::_print_to_console('-> cant load DOM obj');
        return false;
    }

    $story = new Model_Horoscope_XungHop();
    // The story only counts as verified when both signs are known.
    $ktra = !($_Cung1 == '-' || $_Cung2 == '-');
    $story->cung_1 = $_Cung1;
    $story->cung_2 = $_Cung2;
    $story->alias = $_Cung1 . '_' . $_Cung2;
    if (self::CheckRecordByAlias($story->alias)) {
        // Alias already taken: fall back to the double-underscore variant.
        $story->alias = $_Cung1 . '__' . $_Cung2;
    }
    $story->tom_tat = $_Summary;
    $story->ngay_tao = date("Y-m-d");
    $story->url_nguon = $_SourceUri;
    $story->auto_get = true;

    // Post date: the text after the first comma is read as "dd/mm/yyyy hh:mm".
    // Fix: 24-hour 'H' replaces 12-hour 'h', which lost the AM/PM information.
    $date = $html->find('div[class="datetime"]', 0);
    $d = $date ? explode(',', $date->plaintext) : array();
    if (isset($d[1])) {
        $d1 = explode(' ', trim($d[1]));
        list($ngay, $thang, $nam) = explode('/', $d1[0]);
        $story->source_date = date("Y-m-d H:i:s", strtotime($nam . '-' . $thang . '-' . $ngay . ' ' . $d1[1] . ':00'));
    } else {
        $story->source_date = date("Y-m-d H:i:s");
    }

    // Article body.
    $content = $html->find('div[id="content_document"]', 0);
    if (!$content) {
        self::_print_to_console('-> content not found');
        // Fix: release the DOM on this early exit as well.
        $html->clear();
        unset($html);
        return false;
    }

    $string = $content->innertext;
    // Remove line breaks/tabs, HTML comments, link wrappers and author/source paragraphs.
    $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
    $string = preg_replace('/(<!--.+?-->)/s', '', $string);
    $string = preg_replace('@<a[^>]*>(.*)</a>@ismUx', '$1', $string);
    $string = preg_replace('/<p[ ]class="pAuthor">.*<\\/p>/ismxU', '', $string);
    $string = preg_replace('/<p[ ]class="pSource">.*<\\/p>/ismxU', '', $string);
    $story->noi_dung = $string;
    $story->kiem_tra = $ktra;
    $story->save();

    if ($story->identifier()) {
        if ($ktra) {
            // Thumbnail: download to disk, then reference the local copy.
            $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
            $img = Vendor_Crawler::get_file_from_url_by_curl($_ImageLink, $path, $story->alias . '-thumb');
            if ($img) {
                // A zero-byte file means the download failed: use the default thumbnail.
                if (filesize($img) == 0) {
                    @copy('assets/horoscope/thumb_140.jpg', $img);
                }
                $story->hinh_anh = '/' . $img;
            } else {
                $story->hinh_anh = $_ImageLink;
            }

            // Inline images: download each one and point its src at the local copy.
            $html2 = new simple_html_dom();
            $html2->load($story->noi_dung);
            $images = $html2->find('img');
            // Fix: the count is taken once. It used to be re-read while entries
            // were being unset, so trailing images were skipped.
            $imageCount = count($images);
            for ($i = 0; $i < $imageCount; $i++) {
                unset($images[$i]->onclick);
                // Fix: parenthesised. `'anh_' . $i + 1` depends on the PHP version
                // (before PHP 8 it evaluates to the number 1 for every image).
                $file_name = 'anh_' . ($i + 1);
                $get_file = Vendor_Crawler::get_file_from_url_by_curl($images[$i]->src, $path, $file_name);
                if (filesize(ltrim($get_file, '/')) == 0) {
                    // NOTE(review): this only drops the local array entry; the
                    // <img> stays in the saved markup with its original src.
                    unset($images[$i]);
                } else {
                    $images[$i]->src = '/' . $get_file;
                }
            }
            $story->noi_dung = $html2->save();
            $html2->clear();
            unset($html2);
        } else {
            $story->hinh_anh = $_ImageLink;
        }
        $story->save();

        // Insert done: update the status in the temporary link table.
        Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus($_LinkId);
        self::_print_to_console('Done: ' . $_SourceUri);
    } else {
        self::_print_to_console('Fail:' . $_SourceUri);
    }

    $html->clear();
    unset($html);
}
/**
 * Scrape the flight table for one listing type and store each flight.
 *
 * Fetches BASE_URL with "?type=<param>", walks every table row with
 * HEIGHT='25' and saves one record per flight through saveData(), keyed on
 * (date, airline, flight_type, flight_num). The date is today's date in
 * "m.d.y" format.
 *
 * @param string $param Value for the "type" query parameter.
 * @param string $type  Flight type stored with every record.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $html = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($html);

    // Iterate over table rows and get flight details.
    foreach ($dom->find("TR[@HEIGHT='25']") as $data) {
        $tds = $data->find("td");

        // Fix: rows with fewer than six cells used to dereference missing
        // cells; they cannot be flight rows, so skip them.
        if (count($tds) < 6) {
            continue;
        }

        $airline = removeSpaces($tds[0]->plaintext);

        // Skip header row. Cheesy, but effective.
        if ($airline == "Airline") {
            continue;
        }

        // Build up the record to store.
        $flight_data = array(
            "date" => date("m.d.y"),
            "airline" => $airline,
            "flight_type" => $type,
            "flight_num" => removeSpaces($tds[1]->plaintext),
            "destination" => removeSpaces($tds[2]->plaintext),
            "time" => removeSpaces($tds[3]->plaintext),
            "gate" => removeSpaces($tds[4]->plaintext),
            "remarks" => removeSpaces($tds[5]->plaintext),
        );

        // Save the record.
        saveData(array("date", "airline", "flight_type", "flight_num"), $flight_data);
    }

    $dom->clear();
}
/**
 * Parse page content and tag image links so Lightbox Plus picks them up.
 *
 * With the 'text_links' option set, every link to an image file is tagged;
 * otherwise only links that contain an <img> are. Tagging adds the configured
 * class (when 'use_class_method' is on) and a grouping attribute — `rel`, or
 * `data-<data_name>` when 'output_htmlv' is on — and then sets or clears the
 * link title according to 'no_display_title' / 'use_caption_title'.
 *
 * @param string $html_content Markup to process.
 * @param mixed  $unq_id       Suffix appended to the post ID in the group name.
 * @return string The processed markup.
 */
function lightboxPlusReplace($html_content, $unq_id)
{
    global $post;

    // Fix: always define the variable; it used to be undefined when
    // $this->lightboxOptions was empty.
    $lightboxPlusOptions = null;
    if (!empty($this->lightboxOptions)) {
        $lightboxPlusOptions = $this->getAdminOptions($this->lightboxOptionsName);
    }
    /**
     * Remove following line after a few versions or 2.6 is the prevalent version
     */
    $lightboxPlusOptions = $this->setMissingOptions($lightboxPlusOptions);

    $postGroupTitle = $post->post_title;
    $groupValue = 'lightbox[' . $post->ID . $unq_id . ']';
    // HTML5 output uses a data-* attribute, otherwise `rel`.
    $groupAttr = ($lightboxPlusOptions['output_htmlv'] == 1)
        ? 'data-' . $lightboxPlusOptions['data_name']
        : 'rel';
    $imageSelector = 'img[src*=jpg$], img[src*=gif$], img[src*=png$], img[src*=jpeg$], img[src*=bmp$]';

    $html = new simple_html_dom();
    $html->load($html_content, false, false);

    if ($lightboxPlusOptions['text_links'] == 1) {
        /**
         * Auto-lightbox every link to an image (text links and image links)
         */
        foreach ($html->find('a[href*=jpg$], a[href*=gif$], a[href*=png$], a[href*=jpeg$], a[href*=bmp$]') as $e) {
            $this->lightboxPlusTagLink($e, $lightboxPlusOptions, $groupAttr, $groupValue);

            if ($lightboxPlusOptions['no_display_title'] == 1) {
                $e->title = null;
                continue;
            }

            /**
             * No title yet: use the first child's alt text, else the post title
             */
            if (!$e->title && $e->first_child()) {
                $e->title = $e->first_child()->alt ? $e->first_child()->alt : $postGroupTitle;
            }

            /**
             * Use the caption text as title when a caption element follows
             */
            if ($lightboxPlusOptions['use_caption_title']) {
                $next = $e->next_sibling();
                $parent = $e->parent();
                $parentNext = $parent ? $parent->next_sibling() : null;
                // Fix: these were assignments (`->class = '...'`), which always
                // succeeded and overwrote the sibling's class; missing
                // siblings are guarded as well.
                if ($this->lightboxPlusHasClass($next, 'wp-caption-text')) {
                    $e->title = $next->innertext;
                } elseif ($this->lightboxPlusHasClass($parentNext, 'gallery-caption')) {
                    $e->title = $parentNext->innertext;
                }
            }
        }
    } else {
        /**
         * Only lightbox links that wrap an image; $e is the <img>, its parent the link
         */
        foreach ($html->find('a[href*=jpg$] img, a[href*=gif$] img, a[href*=png$] img, a[href*=jpeg$] img, a[href*=bmp$] img') as $e) {
            $link = $e->parent();
            $this->lightboxPlusTagLink($link, $lightboxPlusOptions, $groupAttr, $groupValue);

            if ($lightboxPlusOptions['no_display_title'] == 1) {
                $link->title = null;
                continue;
            }

            if (!$link->title) {
                $link->title = $e->title ? $e->title : $postGroupTitle;
            }

            if ($lightboxPlusOptions['use_caption_title']) {
                // NOTE(review): $e is itself an <img>, so searching it for
                // nested images presumably finds nothing and this branch
                // never fires — kept as written, confirm the intent.
                if ($e->find($imageSelector)) {
                    $next = $e->next_sibling();
                    $parentNext = $link ? $link->next_sibling() : null;
                    if ($this->lightboxPlusHasClass($next, 'wp-caption-text')) {
                        $e->title = $next->innertext;
                    } elseif ($this->lightboxPlusHasClass($parentNext, 'gallery-caption')) {
                        $e->title = $parentNext->innertext;
                    }
                }
            }
        }
    }

    $content = $html->save();
    $html->clear();
    unset($html);

    return $content;
}

/**
 * Add the lightbox class (when enabled) and the grouping attribute to a link.
 *
 * @param object $link       DOM node of the link to tag.
 * @param array  $options    Plugin options ('use_class_method', 'class_name').
 * @param string $groupAttr  Attribute carrying the group name.
 * @param string $groupValue Group name, set only when the attribute is empty.
 * @return void
 */
function lightboxPlusTagLink($link, $options, $groupAttr, $groupValue)
{
    if ($options['use_class_method'] == 1) {
        if ($link->class && $link->class != $options['class_name']) {
            $link->class .= ' ' . $options['class_name'];
        } else {
            $link->class = $options['class_name'];
        }
    }
    if (!$link->{$groupAttr}) {
        $link->{$groupAttr} = $groupValue;
    }
}

/**
 * Tell whether a DOM node exists and lists $className in its class attribute.
 *
 * @param object|null $node      DOM node, or null when there is none.
 * @param string      $className Class to look for.
 * @return bool
 */
function lightboxPlusHasClass($node, $className)
{
    if (!$node || !$node->class) {
        return false;
    }

    return in_array($className, preg_split('/\s+/', trim($node->class)), true);
}
/**
 * Edit action for a "xung - hop" story.
 *
 * Shows the edit form, and on a valid POST saves the submitted fields. When
 * the alias changed, the old asset folder is removed, the images referenced in
 * the content are downloaded again into the new folder and their sources are
 * rewritten, before redirecting to the index.
 *
 * @param mixed $story_id Identifier of the story to edit.
 * @return void
 */
public function action_sua($story_id)
{
    $this->template->title = __('Sửa bài viết: xung - hợp cung');
    $this->template->section_title = __('Sửa bài viết: xung - hợp cung');
    $viewData = array();

    $story = Model_Horoscope_XungHopBLL::getInstance()->find($story_id);
    if (!$story) {
        Request::instance()->redirect('admin/horoscope_xunghop/index');
        return;
    }

    if (Request::$method == 'POST') {
        $post = $story->validate_update($_POST);
        if (!$post->check()) {
            $_POST = $post->as_array();
            # Expose the validation errors for display
            $viewData['errors'] = $post->errors();
        } else {
            $submitted = $post->as_array();
            $previousAlias = $story->alias;

            // A changed alias means a changed asset directory, so the images
            // in the content have to be fetched again further down.
            $story->hinh_anh = $submitted['hinh_anh'];
            $story->alias = $submitted['alias'];
            $story->cung_1 = $submitted['cung_1'];
            $story->cung_2 = $submitted['cung_2'];
            $story->tom_tat = trim($submitted['tom_tat']);
            $story->noi_dung = $submitted['noi_dung'];
            $story->kiem_tra = true;
            $story->save();

            if ($previousAlias != $submitted['alias']) {
                // Remove the old folder (if it existed).
                @rmdir('assets/horoscope/xung-hop/' . $previousAlias . '/');

                require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
                $contentDom = new simple_html_dom();
                $contentDom->load($story->noi_dung);

                $imageNodes = $contentDom->find('img');
                if ($imageNodes) {
                    $assetDir = 'assets/horoscope/xung-hop/' . $story->alias . '/';
                    $number = 0;
                    foreach ($imageNodes as $imageNode) {
                        $number++;
                        unset($imageNode->onclick);
                        $localFile = Vendor_Crawler::get_file_from_url_by_curl($imageNode->src, $assetDir, 'anh_' . $number);
                        // Only point at the local copy when the download is non-empty.
                        if (filesize($localFile) != 0) {
                            $imageNode->src = '/' . $localFile;
                        }
                    }
                }

                $story->noi_dung = $contentDom->save();
                $contentDom->clear();
                unset($contentDom);
                $story->save();
            }

            Request::instance()->redirect('admin/horoscope_xunghop/index');
        }
    }

    $viewData['story'] = $story->toArray();
    $this->template->content = View::factory('horoscope/admin/xung-hop/sua', $viewData);
}
/**
 * Collect the title and content of a posted URL and echo them as JSON.
 *
 * Order of handling:
 *  1. If uVideoUpload() recognises a video (any result other than '10'/'11'),
 *     the page title plus the video result are returned and the script exits.
 *  2. If the host is listed in the MUSIC_WEBSITES config, the album/song id and
 *     token are extracted from the page and the quote snippet is fetched.
 *  3. Otherwise the page is treated as an article: with no stored rule for the
 *     host, <title> and <body> are used; with a stored rule, the configured
 *     selectors are applied.
 * The result is cleaned with clearhtml() and converted to UTF-8 unless the
 * host is listed in NO_NEED_ICONV.
 *
 * NOTE(review): the selector rules are interpolated into eval()'d code. If the
 * rule table can hold untrusted data this is a code-injection vector; chaining
 * ->find() calls in a loop would avoid eval — confirm where the rules come from.
 * NOTE(review): several paths leave $htm uncleared, and $out[1] is read without
 * checking that the preceding preg_match() succeeded.
 *
 * @return void Output is echoed as JSON.
 */
public function collect()
{
    $url = trim($this->_post('url'));
    // Result to return
    $res = array('title' => '', 'content' => '');
    // Check whether the page contains a video
    $video = $this->uVideoUpload($url);
    if ($video != '10' && $video != '11') {
        // Fetch the title
        $htm = file_get_html($url);
        $title = $htm->find('title', 0)->plaintext;
        $htm->clear();
        $res['title'] = $title;
        $res['content'] = $video;
        echo json_encode($res);
        exit;
    }
    // No video: handle the page as an article
    $collect = D('collect');
    $domin = '';
    $match = "/http:\\/\\/([^\\/]*).*/i";
    if (!substr_count($url, "http")) {
        $url = "http://" . $url;
    }
    preg_match($match, $url, $out);
    $domin = $out[1];
    if (!empty($domin)) {
        // Check whether this is a music website
        $music_websites = C('MUSIC_WEBSITES');
        if (in_array($domin, $music_websites)) {
            $htm = file_get_html($url);
            // The DOM object is used as the regex subject (string conversion).
            $p = preg_match('/var\\s*?_xiamitoken\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $htm, $out);
            $token = $out[1];
            // Example of the markup matched below: onclick="playalbum(682938274, '', '<album title>', '');
            $xid = '';
            if (preg_match('/playalbum\\((\\d+),\\s*?\'*?\',\\s*?\'(.*?)\',\\s*?\'*?\'\\)/i', $htm, $out)) {
                // xid
                $xid = $out[1];
                // title
                $title = $out[2];
            } else {
                if (preg_match('/\\/album\\/(\\d{1,})/', $htm, $out)) {
                    $xid = $out[1];
                    $title = $htm->find('div#title', 0)->plaintext;
                } else {
                    if (preg_match('/var\\s*?cid\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $htm, $out)) {
                        # Example of the script matched here: var cid = '22454617';
                        $xid = $out[1];
                        $title = $htm->find('title', 0)->plaintext;
                    }
                }
            }
            if ($xid) {
                // Example: http://www.xiami.com/ajax/getquote/type/2/id/682938274?_xiamitoken=0802020a13ba3df687e7ca4ef45cf1a8
                $zurl = "http://www.xiami.com/ajax/getquote/type/2/id/{$xid}?_xiamitoken={$token}";
                $htm = file_get_html($zurl);
                $content = $htm->find('textarea.tarea', 1)->innertext;
                $res['title'] = trim($title);
                $res['content'] = $content;
                // Release the memory held by the DOM
                $htm->clear();
            } else {
                $res['title'] = '';
                $res['content'] = '没有找到音乐';
            }
            echo json_encode($res);
            exit;
        }
        // Look for an existing record for this domain in the database
        $c = $collect->where('domain="' . $domin . '"')->find();
        if (!$c) {
            // No record: add one, then use the page <title> as title and the <body> as content
            $collect->data(array('alias' => $domin, 'domain' => $domin, 'match' => '123'))->add();
            // Find the body
            $htm = file_get_html($url);
            $title = $htm->find('title', 0)->plaintext;
            $content = $htm->find('body', 0)->innertext;
            // (original note: the title takes roughly the first 10 characters of the body — the code uses the page title)
            $res['title'] = $title;
            $res['content'] = $content;
        } else {
            // A matching rule was found
            // Sina blog URLs are special-cased: drop the trailing ?tj=...
            if ($domin == 'blog.sina.com.cn') {
                $url = preg_replace('/\\?tj=.*/i', '', $url);
            }
            $htm = file_get_html($url);
            // Get the title
            $matchlist = $this->collect_match->get_matchlist_by_collect_type($c['id'], self::TYPE_TITLE);
            if (!empty($matchlist)) {
                // Build a chain of ->find() calls as source text and evaluate it.
                $exec = '$htm';
                foreach ($matchlist as $match) {
                    $exec .= "->find( '{$match['match']}', {$match['pos']} )";
                }
                $exec = $exec . '->plaintext;';
                eval("\$str = {$exec};");
                $res['title'] = $str;
            } else {
                $title = $htm->find('title', 0)->plaintext;
                $res['title'] = $title;
            }
            // Get the content
            $matchlist = $this->collect_match->get_matchlist_by_collect_type($c['id'], self::TYPE_CONTENT);
            if (!empty($matchlist)) {
                $exec = '$htm';
                foreach ($matchlist as $match) {
                    $exec .= "->find( '{$match['match']}', {$match['pos']} )";
                }
                $exec = $exec . '->innertext;';
                eval("\$str = {$exec};");
                $res['content'] = $str;
            } else {
                $content = $htm->find('body', 0)->innertext;
                $res['content'] = $content;
            }
            // Special image handling for certain sites
            if (in_array($domin, array('history.people.com.cn'))) {
                // Prefix every src with the host.
                $res['content'] = preg_replace("/src=\"(.*?)\"/i", 'src="http://' . $domin . "\$1" . '"', $res['content']);
            } else {
                if ($domin == 'blog.sina.com.cn') {
                    // Sina images: move the real_src attribute into src
                    // Create a new DOM instance
                    $new_html = new simple_html_dom();
                    $new_html->load($res['content']);
                    $imgs = $new_html->find('img');
                    foreach ($imgs as &$img) {
                        $img->src = $img->real_src;
                        $img->real_src = null;
                    }
                    $res['content'] = $new_html->innertext;
                    $new_html->clear();
                }
            }
        }
        // Release the memory held by the DOM
        $htm->clear();
    } else {
        $res['title'] = '';
        $res['content'] = '';
        echo json_encode($res);
        exit;
    }
    // Strip HTML tags from the collected content
    $res['content'] = $this->clearhtml($res['content']);
    $res['title'] = trim($this->clearhtml($res['title']));
    // Character-set conversion
    $no_need_iconv = C('NO_NEED_ICONV');
    if (!in_array($domin, $no_need_iconv)) {
        if ($domin == 'history.sina.com.cn') {
            // GBK encoding, handled specially
            $res['title'] = iconv("GBK", "UTF-8//IGNORE", $res['title']);
            $res['content'] = iconv("GBK", "UTF-8//IGNORE", $res['content']);
        } else {
            // GB2312 encoding
            $res['title'] = iconv("GB2312", "UTF-8//IGNORE", $res['title']);
            $res['content'] = iconv("GB2312", "UTF-8//IGNORE", $res['content']);
        }
    }
    echo json_encode($res);
}
/**
 * Convert the prepared book HTML into the binary contents of a .docx file.
 *
 * Builds a front matter (cover, authors, series, annotation, translator
 * credits), rewrites images and footnote references in $text, applies a series
 * of regex clean-ups for the HTML-to-docx converter, renders the result with
 * PhpWord into a temporary file and returns that file's bytes.
 *
 * Side effect: $this->annotation is rewritten in place with paragraph/bold/
 * italic markup.
 *
 * @param string $text Book body as HTML.
 * @return string Raw bytes of the generated Word2007 document.
 */
protected function convertImpl($text)
{
    // Builds an <img> tag for an image record. Width and height are
    // unimportant: actual resizing is done elsewhere, but the aspect ratio
    // must be kept.
    $imgTag = function ($image) {
        return "<img src=\"" . $image['thumbnail'] . "\" width=\"" . $image['convert_width'] . "\" height=\"" . $image['convert_height'] . "\" />";
    };

    $descr['book_title'] = $this->nameru;

    // Authors and illustrators, one <h1> per comma-separated name.
    $descr['author'] = "";
    foreach ([$this->author, $this->illustrator] as $aut) {
        if ($aut) {
            foreach (explode(',', $aut) as $au) {
                $descr['author'] .= "<h1>" . $this->escapexml(trim($au)) . "</h1>";
            }
        }
    }

    $descr['annotation'] = '';
    if ($this->annotation) {
        // Wiki-style markup: newline -> paragraph, '''x''' -> bold, ''x'' -> italic.
        $this->annotation = preg_replace('@\\n@', '</p><p>', $this->annotation);
        $this->annotation = preg_replace("@'''(.*?)'''@", '<b>\\1</b>', $this->annotation);
        $this->annotation = preg_replace("@''(.*?)''@", '<i>\\1</i>', $this->annotation);
        $this->annotation = preg_replace('@<p></p>@', '<br/>', $this->annotation);
        $descr['annotation'] = "<h2>Аннотация</h2><p>{$this->annotation}</p>";
    }

    $descr['coverpage'] = '';
    if ($this->covers) {
        $cover = $this->covers[0];
        $descr['coverpage'] = $imgTag($this->images[$cover]);
        $descr['coverpage_n'] = $cover;
    }

    if ($this->translators) {
        foreach ($this->translators as $translator) {
            if (!array_key_exists('translator', $descr)) {
                $descr['translator'] = '';
            }
            $descr['translator'] .= "<p name=\"translator\">" . $this->escapexml($translator) . "</p>";
        }
    }

    // Fix: always defined. It is interpolated into the document below and
    // used to be an undefined index for books without a series title.
    $descr['sequence'] = '';
    if ($this->seriestitle) {
        $descr['sequence'] = "<h1>" . $this->escapexml($this->seriestitle) . ($this->seriesnum ? " {$this->seriesnum}" : '') . " </h1>";
    }

    $descr['date2'] = date('j F Y, H:i', $this->touched);
    $descr['id'] = 'RuRa_' . str_replace('/', '_', $this->nameurl);
    if ($this->isbn) {
        $descr['isbn'] = ";isbn:{$this->isbn}";
    }

    // One credit line per activity; identical for all three credit variants.
    $workerLines = '';
    foreach ($this->workers as $activity => $workers) {
        $workerLines .= '<p>' . $activity . ': <b>' . implode('</b>, <b>', $workers) . "</b></p>\n";
    }
    $version = date('d.m.Y', $this->touched);

    if ($this->command == 'RuRa-team') {
        $credit = "<h2>Реквизиты переводчиков</h2>\n \t\t\t\t <p>Над переводом работала команда <b>RuRa-team</b></p>\n";
        $credit .= $workerLines;
        $credit .= '<p>Самый свежий перевод всегда можно найти на сайте нашего проекта:</p>'
            . ' <p><a href="http://ruranobe.ru">http://ruranobe.ru</a></p>'
            . ' <p>Чтобы оставаться в курсе всех новостей, вступайте в нашу группу в Контакте:</p>'
            . ' <p><a href="http://vk.com/ru.ranobe">http://vk.com/ru.ranobe</a></p>'
            . ' <p>Для желающих отблагодарить переводчика материально имеются webmoney-кошельки команды:</p>'
            . ' <p><b>R125820793397</b></p>'
            . ' <p><b>U911921912420</b></p>'
            . ' <p><b>Z608138208963</b></p>'
            . ' <p>QIWI-кошелек:</p>'
            . ' <p><b>+79116857099</b></p>'
            . ' <p>Яндекс-деньги:</p>'
            . ' <p><b>410012692832515</b></p>'
            . ' <p>PayPal:</p>'
            . ' <p><b>paypal@ruranobe.ru</b></p>'
            . ' <p>А так же счет для перевода с кредитных карт:</p>'
            . ' <p><b>4890 4941 5384 9302</b></p>'
            . ' <p>Версия от ' . $version . '</p>'
            . ' <p></p> <p></p> <p></p>'
            . ' <p><b>Любое распространение перевода за пределами нашего сайта запрещено. Если вы скачали файл на другом сайте - вы поддержали воров</b></p>'
            . ' <p></p> <p></p> <p></p>';
    } elseif (strpos($this->command, 'RuRa-team') !== false) {
        $credit = "<h2>Реквизиты переводчиков</h2>\n\t\t\t\t\t\t <p>Над релизом работали {$this->command}</p>\n";
        $credit .= $workerLines;
        $credit .= '<p>Самый свежий перевод всегда можно найти на сайте нашего проекта:</p>'
            . ' <p><a l:href="http://ruranobe.ru">http://ruranobe.ru</a></p>'
            . ' <p>Чтобы оставаться в курсе всех новостей, вступайте в нашу группу в Контакте:</p>'
            . ' <p><a l:href="http://vk.com/ru.ranobe">http://vk.com/ru.ranobe</a></p>'
            . ' <p>Версия от ' . $version . '</p>'
            . ' <p><b>Любое коммерческое использование данного текста или его фрагментов запрещено</b></p>';
    } else {
        $credit = "<h2>Реквизиты переводчиков</h2>";
        if ($this->command) {
            $credit .= "<p>Перевод команды {$this->command}</p>";
        }
        $credit .= $workerLines;
        $credit .= '<p>Версия от ' . $version . '</p>'
            . ' <p><b>Любое коммерческое использование данного текста или его фрагментов запрещено</b></p>';
    }

    if ($this->height == 0) {
        // Height 0: strip every image (with its wrapping paragraph).
        $text = preg_replace('/(<p[^>]*>)?<img[^>]*>(<\\/p>)?/u', '', $text);
    } else {
        // Additional covers are put in front of the text.
        for ($i = 1; $i < count($this->covers); ++$i) {
            $text = $imgTag($this->images[$this->covers[$i]]) . $text;
        }
        // Replace resource images by their thumbnails; negative ids are dropped.
        $text = preg_replace_callback('/(<a[^>]*>)?<img[^>]*data-resource-id="(-?\\d*)"[^>]*>(<\\/a>)?/u', function ($match) use ($imgTag) {
            if ($match[2] < 0) {
                return '';
            }

            return $imgTag($this->images[$match[2]]);
        }, $text);
    }

    // Footnotes are stored as "id,;,text,;,id,;,text…".
    $footnotes = array();
    $footnotes_temp = explode(',;,', $this->footnotes);
    for ($i = 0; $i < sizeof($footnotes_temp); $i++) {
        if (is_numeric($footnotes_temp[$i])) {
            // Fix: a trailing id without text no longer reads an undefined offset.
            $footnotes[$footnotes_temp[$i]] = isset($footnotes_temp[$i + 1]) ? $footnotes_temp[$i + 1] : '';
            $i++;
        }
    }

    $text = trim($text);
    $epubText = "<html>\n\t<body>\n\t\t{$descr['coverpage']}\n\t\t{$descr['author']}\n\t\t{$descr['sequence']}\n\t {$descr['annotation']}\n\t\t{$credit}\n\t\t{$text}\n\t</body>\n\t</html>";

    // Replace footnote references by <footnote> elements; unknown or empty
    // footnotes leave the original reference untouched.
    $epubText = preg_replace_callback('@(<span[^>]*><a href="#cite_note-(\\d*)"[^>]*>.{0,15}</span>)@', function ($match) use (&$footnotes) {
        // Fix: unknown ids no longer read an undefined index.
        if (!isset($footnotes[$match[2]])) {
            return $match[1];
        }
        $footnote = preg_replace('@</p>\\s*<p[^>]*>@', '<br/>', $footnotes[$match[2]]);
        if ($footnote) {
            return '<footnote>' . $footnote . '</footnote>';
        }

        return $match[1];
    }, $epubText);

    $epubText = preg_replace('@section@', "div", $epubText);
    /* Delete extra <br/> tag before images */
    $epubText = preg_replace('@<div>(.){0,20}<br\\/>(.){0,20}<img src@', '<div><img src', $epubText);
    /* Eliminate carriage return before <h1> (each div starts with one in h2d_htmlconverter.php) */
    $epubText = preg_replace('@\\s*<div>(.{0,40})(<h1>.*?<\\/h1>)@', '\\1\\2<div>', $epubText);
    $epubText = preg_replace('@pb@', "br", $epubText);
    // PHPWord doesn't support tags nested in a link element: unnest images from them.
    $epubText = preg_replace('@<a[^>]*>(<img[^>]*>)<\\/a>@', "\\1", $epubText);
    // Delete extra page breaks related to images.
    $epubText = preg_replace('@<div[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/div>@', "\\1\\2\\3", $epubText);
    $epubText = preg_replace('@<p[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/p>@', "\\1\\2\\3", $epubText);
    /* Swap h2 and img tags if img follows h2 (it looked bad in docx). */
    $epubText = preg_replace('@(<h2>.{0,100}<\\/h2>)(<img[^>]*>)@', '\\2\\1', $epubText);
    /* After the swap the img often has to be lifted into the previous <div> or <p> */
    $epubText = preg_replace('@<\\/div>(<img[^>]*>)<h2@', '\\1</div><h2', $epubText);
    $epubText = preg_replace('@<\\/p>(<img[^>]*>)<h2@', '\\1</p><h2', $epubText);

    $phpword_object = new \PhpOffice\PhpWord\PhpWord();
    \PhpOffice\PhpWord\Settings::setCompatibility(false);

    $html_dom = new \simple_html_dom();
    $html_dom->load($epubText);
    $html_dom_array = $html_dom->find('html', 0)->children();
    $paths = htmltodocx_paths();
    $initial_state = [
        'phpword_object' => &$phpword_object,
        'base_root' => $paths['base_root'],
        'base_path' => $paths['base_path'],
        'current_style' => ['size' => '11'],
        'parents' => [0 => 'body'],
        'list_depth' => 0,
        'context' => 'section',
        'pseudo_list' => true,
        'pseudo_list_indicator_font_name' => 'Wingdings',
        'pseudo_list_indicator_font_size' => '7',
        'pseudo_list_indicator_character' => 'l ',
        'table_allowed' => true,
        'treat_div_as_paragraph' => true,
        'structure_headings' => true,
        'structure_document' => true,
        'style_sheet' => htmltodocx_styles_example(),
    ];
    htmltodocx_insert_html($phpword_object, $html_dom_array[0]->nodes, $initial_state);
    $html_dom->clear();
    unset($html_dom);

    // Render into a temporary file, read it back and remove it.
    $h2d_file_uri = tempnam(sys_get_temp_dir(), 'htd');
    $objWriter = \PhpOffice\PhpWord\IOFactory::createWriter($phpword_object, 'Word2007');
    $objWriter->save($h2d_file_uri);
    $bin = file_get_contents($h2d_file_uri);
    unlink($h2d_file_uri);

    return $bin;
}
/**
 * Scrape one planning search-results page into a list of applications.
 *
 * Every result row whose markup contains one of the known status keywords is
 * treated as an application. For each one the details page and the site
 * location page are fetched. Applications without site location details (or
 * whose location table cannot be read) are left out of the result.
 *
 * @param string $html Markup of a search-results page.
 * @return array Map of application reference => record with the keys url,
 *               appref, date, applicant, address, details, signif, size,
 *               category, protected, latitude and longitude.
 */
function process_page($html) {
    global $authority_code;
    global $nearby_api_key;

    $rowSelector = "table[class='AppDetailsTable'] tr";
    $statuses = array('FINALISED', 'CONDITIONAL', 'APPEALED', 'WITHDRAWN', 'NEW<', 'APPROVED', 'REFUSED');
    $business = array("retail", "Hotel", "Ltd", "Limited", " shop");

    // Plain text of cell $cellIdx in row $rowIdx, or null when the page does
    // not have that row/cell (instead of a fatal call on null).
    $cellText = function ($rows, $rowIdx, $cellIdx) {
        if (!isset($rows[$rowIdx])) {
            return null;
        }
        $cell = $rows[$rowIdx]->children($cellIdx);
        return $cell ? $cell->plaintext : null;
    };

    $dom = new simple_html_dom();
    $dom->load($html);
    $apps = array();

    foreach ($dom->find($rowSelector) as $row) {
        // The raw row markup is searched (not individual cells) so that a
        // keyword is found wherever it appears in the row.
        $rowHtml = (string) $row;
        $isApplication = false;
        foreach ($statuses as $status) {
            if (stripos($rowHtml, $status) !== false) {
                $isApplication = true;
                break;
            }
        }
        if (!$isApplication) {
            continue;
        }

        $refText = $row->children[0]->plaintext;
        $appref = $authority_code . substr($refText, 0, 2) . "/" . substr($refText, 2);
        $rawappref = trim($refText);
        $url = "http://planning.corkcity.ie/InternetEnquiry/rpt_ViewApplicDetails.asp?validFileNum=1&app_num_file=" . $rawappref;

        // Source date is dd/mm/yyyy; store it as yyyy-mm-dd.
        $rawdate = substr($row->children[4]->plaintext, 0, 10);
        $date = substr($rawdate, -4) . "-" . substr($rawdate, 3, 2) . "-" . substr($rawdate, 0, 2);
        $applicant = trim($row->children[5]->plaintext);
        $address = str_replace("<br>", ",", str_replace("<BR>", ",", $row->children[6]->innertext));

        // Part one: details page. It is fetched once and used for the
        // description as well as the significance/size cells, then cleared so
        // the parser's circular references do not accumulate per row.
        $fullappdom = new simple_html_dom();
        $fullappdom->load(scraperwiki::scrape($url));
        $detailRows = $fullappdom->find($rowSelector);
        $fullappdetails = (string) $cellText($detailRows, 15, 1);
        $signifdetail = (string) $cellText($detailRows, 23, 1);
        $sizedetail = (string) $cellText($detailRows, 23, 4);
        unset($detailRows);
        $fullappdom->clear();
        unset($fullappdom);

        $protected = strpos($fullappdetails, "Protected Structure") !== false ? "Protected Structure" : "";

        // A business keyword anywhere in the row wins over the "dwelling" test.
        $category = "";
        foreach ($business as $businessword) {
            if (strpos($rowHtml, $businessword) !== false) {
                $category = "Business";
                break;
            }
        }
        if ($category === "" && strpos($fullappdetails, "dwelling") !== false) {
            $category = "residential";
        }

        // Part two: location of the application.
        $lochtml = scraperwiki::scrape('http://planning.corkcity.ie/InternetEnquiry/rpt_ViewSiteLocDetails.asp?page_num=0&file_number=' . $rawappref);
        if (stristr($lochtml, "No Site Location Details Found")) {
            continue;
        }

        $locdom = new simple_html_dom();
        $locdom->load($lochtml);
        $locRows = $locdom->find($rowSelector);
        $northingText = $cellText($locRows, 1, 1);
        $eastingText = $cellText($locRows, 1, 4);
        unset($locRows);
        $locdom->clear();
        unset($locdom);
        unset($lochtml);

        if ($northingText === null || $eastingText === null) {
            // No usable coordinates: treat like "no site location details".
            continue;
        }
        $locnorthing = round(floatval($northingText));
        $loceasting = round(floatval($eastingText));

        // Part three: convert easting/northing to WGS84.
        $c = new ConversionsLatLong();
        $res = $c->irish_to_wgs84($loceasting, $locnorthing);
        $lat = $res[0];
        $long = $res[1];

        $apps["{$appref}"] = array(
            'url' => $url,
            'appref' => $appref,
            'date' => $date,
            'applicant' => $applicant,
            'address' => $address,
            'details' => $fullappdetails,
            'signif' => $signifdetail,
            'size' => $sizedetail,
            'category' => $category,
            'protected' => $protected,
            'latitude' => $lat,
            'longitude' => $long,
        );
    }

    $dom->clear();
    unset($dom);
    return $apps;
}
// Three passes: load a fresh DOM from $filename, clear and release it, then
// report memory via dump_memory().
$i = 0;
while ($i < 3) {
    $dom = file_get_dom($filename);
    //stat_dom($dom);
    $dom->clear();
    unset($dom);
    dump_memory();
    flush();
    ++$i;
}
echo 'final: ', number_format(memory_get_usage(), 0, '.', ','), '<br>';
flush();

// [one object]: a single parser instance reloaded and cleared three times.
echo '<br><br>[one object]<br>init memory: ', number_format(memory_get_usage(), 0, '.', ','), '<br>';
echo '------------------------------------------<br>';
flush();
$dom = new simple_html_dom();
$i = 0;
while ($i < 3) {
    $dom->load_file($filename);
    $dom->clear();
    dump_memory();
    ++$i;
}
unset($dom);
echo 'final: ', number_format(memory_get_usage(), 0, '.', ','), '<br>';
flush();

// [multi objects without clear memory]: each pass overwrites $dom with a new
// object and never calls clear() on the previous one.
echo '<br><br>[multi objects without clear memory]<br>init memory: ', number_format(memory_get_usage(), 0, '.', ','), '<br>';
echo '------------------------------------------<br>';
flush();
$i = 0;
while ($i < 3) {
    $dom = file_get_dom($filename);
    dump_memory();
    ++$i;
}
echo 'final: ', number_format(memory_get_usage(), 0, '.', ','), '<br>';
flush();
/**
 * Parse the page returned by requestURL() into a list of order records.
 *
 * @return array|string|mixed The raw response when it is empty or shorter
 *                            than 100 bytes (an empty response means the
 *                            cookie has expired), the string 'no_order' when
 *                            no trade-number cells are found, otherwise a
 *                            list of arrays with the keys time, title, trade,
 *                            name and amount.
 */
public function parse() {
    require_once dirname(__FILE__) . '/simple_html_dom.php';

    $data = $this->requestURL();
    // The length must be compared, not the string itself: the original
    // strlen($data < 100) measured a boolean.
    if (empty($data) || strlen($data) < 100) {
        return $data;
    }

    $html = new simple_html_dom();
    $html->load($data);
    $ymd = $html->find('.time-d');
    $his = $html->find('.time-h');
    $title = $html->find('.consume-title a');
    $trade = $html->find('td.tradeNo p');
    $name = $html->find('p.name');
    $amount = $html->find('td.amount span');

    if (!$trade) {
        // Release the parser before leaving, same as on the normal path.
        $html->clear();
        return 'no_order';
    }

    $info = array();
    foreach ($ymd as $key => $value) {
        // The six node lists are matched by position; skip a row for which
        // any of them has no entry rather than dereferencing a missing node.
        if (!isset($his[$key], $title[$key], $trade[$key], $name[$key], $amount[$key])) {
            continue;
        }

        // Keep only the numeric part of the order number.
        $tradeNo = array();
        preg_match('/\\d+/', $trade[$key]->innertext, $tradeNo);

        // Extra checks could be added here, e.g. looking the order up in a
        // database to see whether it has already been notified.
        $info[] = array(
            'time' => trim($value->innertext) . ' ' . trim($his[$key]->innertext),
            'title' => trim($title[$key]->innertext),
            'trade' => isset($tradeNo[0]) ? trim($tradeNo[0]) : '',
            'name' => trim($name[$key]->innertext),
            'amount' => trim(str_replace('+', '', $amount[$key]->innertext)),
        );
    }

    $html->clear();
    return $info;
}
/**
 * Download an article page and extract its title, content, description and
 * thumbnail.
 *
 * Image sources inside the content are stripped of their query string and
 * rewritten to a path under ../uploads/<Y/m/d>/<folder_name>/ (the directory
 * is created when missing; the image itself is not downloaded here).
 *
 * @param array  $arrUrl         Needs 'url'; may carry 'thumbnailUrl'.
 * @param array  $arrClass       Selectors under 'title', 'content' and
 *                               optionally 'description'.
 * @param mixed  $arrImgExpert   Not used by this function.
 * @param string $folder_name    Sub-folder used for the rewritten image paths.
 * @param array  $arrPregReplace Regex patterns replaced by a space in the content.
 * @param array  $arrStrReplace  Plain strings replaced by a space in the content.
 * @param string $domain         Not used by this function.
 * @param string $classMore      Not used by this function.
 * @return array Empty when no URL is given; only 'url' when the title cannot
 *               be found; otherwise url, title, content, description and
 *               thumbnailUrl.
 */
function getDetailArticle($arrUrl, $arrClass, $arrImgExpert, $folder_name, $arrPregReplace, $arrStrReplace, $domain = '', $classMore = '') {
    $detailArr = array();
    $url = $arrUrl['url'];
    if (!$url) {
        return $detailArr;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    $result = curl_exec($ch);
    curl_close($ch);

    $detailArr['url'] = $url;
    if ($result === false) {
        // Nothing to parse; same outcome as a page without a title.
        return $detailArr;
    }

    $html = new simple_html_dom();
    $html->load($result);

    $titleNode = $html->find($arrClass['title'], 0);
    if (!$titleNode) {
        // Release the parser on the early exit as well.
        $html->clear();
        unset($html);
        return $detailArr;
    }
    $detailArr['title'] = strip_tags(preg_replace('#<span (.*?)</span>#', ' ', $titleNode->innertext()));

    // Both start empty so the later reads are defined even when the page has
    // no content node or no image.
    $urlHinh = '';
    $contentHtml = '';

    $content = $html->find($arrClass['content'], 0);
    if ($content) {
        foreach ($content->find('img') as $img) {
            // Drop the query string from the image source.
            $remove = strstr($img->src, '?');
            if ($remove !== false) {
                $img->src = str_replace($remove, "", $img->src);
            }
            $tenfile = basename($img->src);

            // NOTE(review): a cleaned name ($name) is derived from the file
            // name, but the stored path below uses the original base name
            // ($tenfile). Kept as in the original; confirm which is intended.
            $arrPartImage = explode('.', $tenfile);
            $imgExt = array_pop($arrPartImage);
            $imgs = preg_replace('/(.*)(_\\d+x\\d+)/', '$1', implode('.', $arrPartImage));
            $imgs = $this->changeTitle($imgs);
            $name = "{$imgs}.{$imgExt}";

            $uploadDir = "../uploads/" . date('Y/m/d') . "/" . $folder_name . "/";
            if (!is_dir($uploadDir)) {
                mkdir($uploadDir, 0777, true);
            }
            $img->src = $uploadDir . $tenfile;

            // The first image becomes the fallback thumbnail.
            if ($urlHinh == '') {
                $urlHinh = $img->src;
            }
            $img->class = "aligncenter";
        }
        $contentHtml = $content->innertext();
    }

    if (!empty($arrPregReplace)) {
        foreach ($arrPregReplace as $preg) {
            $contentHtml = preg_replace($preg, ' ', $contentHtml);
        }
    }
    if (!empty($arrStrReplace)) {
        foreach ($arrStrReplace as $strre) {
            $contentHtml = str_replace($strre, ' ', $contentHtml);
        }
    }
    $detailArr['content'] = $contentHtml;

    if (!isset($arrClass['description']) || $arrClass['description'] == '') {
        // No description selector: use the first 255 characters of the text.
        $detailArr['description'] = $this->string_limit(strip_tags($contentHtml), 255);
    } else {
        $descNode = $html->find($arrClass['description'], 0);
        $detailArr['description'] = $descNode ? $descNode->innertext() : '';
    }

    if (!isset($arrUrl['thumbnailUrl']) || $arrUrl['thumbnailUrl'] == '') {
        $detailArr['thumbnailUrl'] = $urlHinh;
    } else {
        $detailArr['thumbnailUrl'] = $arrUrl['thumbnailUrl'];
    }

    $html->clear();
    unset($html);
    return $detailArr;
}
if (isset($_GET['zkzh'])) { $src = 'http://www.chsi.com.cn/cet/query'; $id = $_GET['zkzh']; $name = $_GET['xm']; $ch = curl_init(); curl_setopt_array($ch, array(CURLOPT_URL => $src . '?zkzh=' . $id . '&xm=' . $name, CURLOPT_RETURNTRANSFER => true, CURLOPT_POST => false, CURLOPT_REFERER => 'http://www.chsi.com.cn/cet/')); $content = curl_exec($ch); if (curl_errno($ch) == 0) { $html = new simple_html_dom(); $html->load($content); $table = $html->find('table[class=cetTable]', 0); if (!$table) { $str = "请确认姓名或准考证号是否正确!"; } else { $text = $table->outertext; $html->clear(); $table->clear(); unset($html); $str = $text; $str = str_replace('<table border="0" align="center" cellpadding="0" cellspacing="6" class="cetTable"> <tr> <th>', "", $str); $str = str_replace('</th> <td>', "", $str); $str = str_replace('</td> </tr> <tr> <th>', "\n", $str); $str = str_replace('<strong><span style="color: #F00;">', "", $str); $str = str_replace('</span>', "", $str); $str = str_replace(' ', "", $str); $str = str_replace('<span class="color01">', "\n", $str); $str = str_replace('</strong></td> </tr> </table>', "\n\n查询数据来源于学信网\nOURStudio提供技术支持.", $str); } echo $str; } else { echo curl_error($ch);
/**
 * Apply per-site fixes to image sources in scraped article markup.
 *
 * @param string $url     Not used by this method.
 * @param string $domain  Host the content came from; selects the fix.
 * @param string $content Article markup.
 * @return string The markup, rewritten for the known domains and returned
 *                unchanged for any other domain.
 */
private function process_special_content($url, $domain, $content) {
    switch ($domain) {
        case 'history.people.com.cn':
        case 'www.jfdaily.com':
            // Complete image paths by prefixing every src with the host.
            return preg_replace("/src=\"(.*?)\"/i", 'src="http://' . $domain . "\$1" . '"', $content);

        case 'blog.sina.com.cn':
            // Sina images: copy real_src into src and drop real_src.
            $sinaDom = new simple_html_dom();
            $sinaDom->load($content);
            foreach ($sinaDom->find('img') as $image) {
                $image->src = $image->real_src;
                $image->real_src = null;
            }
            $rewritten = $sinaDom->innertext;
            $sinaDom->clear();
            return $rewritten;

        case 'www.nowamagic.net':
            // Turn "../../" relative sources into absolute library URLs.
            $base = 'http://www.nowamagic.net/librarys/';
            return preg_replace("/src=\"\\.\\.\\/\\.\\.\\/(.*?)\"/i", 'src="' . $base . "\$1" . '"', $content);
    }

    return $content;
}
} } $subcat2->clear(); unset($subcat2); } } $subcat1->clear(); unset($subcat1); } } $category->clear(); unset($category); } } } $cat_url->clear(); unset($cat_url); echo "<p>Всего спарсили url " . $cat_counter . " категорий </p>"; fclose($cat_handle); /* if (($handle = fopen("price.csv", "r")) !== FALSE) { while (($data = fgetcsv($handle, 1000, ";")) !== FALSE) { $articuls[] = $data[2]; } foreach ($articuls as $articul){ } fclose($handle); } */
/**
 * Look a product up on PriceGrabber by UPC and collect its seller offers.
 *
 * @param string|int $upc Product code used as the search keyword.
 * @return array List of offers, each with the keys name, desc, manu, url,
 *               productImage, basePrice, shippingPrice, price and logo.
 *               Empty when the product description or the pricing table is
 *               not present on the page.
 */
public function fetchDataFromPriceGrabberApi($upc) {
    ini_set('max_execution_time', 999999);
    ini_set('memory_limit', '400M');
    require_once dirname(BASEPATH) . '/system/application/libraries/simple_html_dom.php';

    $debug = false;
    // The keyword is URL-encoded so that a code containing reserved
    // characters cannot break or extend the query string.
    $url = 'http://www.pricegrabber.com/search_request.php?form_keyword=' . urlencode((string) $upc) . '&some_id=&id_type=&requestParams=Tjs%3D&vendorIds=YTowOnt9&st=query&sv=findit_top&kw_suggest=0&topcat_menu=6&zip_code=54001';
    if ($debug) {
        echo '<br />' . $url . '<br />';
    }

    $html = new simple_html_dom();
    $html->load_file($url);

    $descriptionBox = $html->find('div[class=product_description]', 0);
    if (!$descriptionBox) {
        $html->clear();
        return array();
    }
    $heading = $descriptionBox->find('h1', 0);
    $name = $heading ? $heading->plaintext : '';

    $desc = '';
    $descNode = $html->find('p[id=product_details_description]', 0);
    if ($descNode) {
        $desc = $descNode->plaintext;
    }

    $product_image = '';
    $imageBox = $html->find('div[class=product_img]', 0);
    $imageNode = $imageBox ? $imageBox->find('img', 0) : null;
    if ($imageNode) {
        $product_image = $imageNode->getAttribute('src');
    }

    $price_table = $html->find('table[class=pricing_tbl]', 0);
    if (!$price_table) {
        $html->clear();
        return array();
    }
    if ($debug) {
        echo $price_table;
    }

    $items = array();
    foreach ($price_table->find('tr') as $tr) {
        // Section header rows carry "section" in their class.
        if (strpos((string) $tr->getAttribute('class'), 'section') !== false) {
            continue;
        }

        $cells = $tr->find('td');
        if (!isset($cells[0])) {
            // Row without data cells: nothing to read.
            continue;
        }
        if ('noseller' == $cells[0]->getAttribute('class')) {
            // Marker row: no further sellers follow.
            break;
        }

        $bottomPriceNode = isset($cells[1]) ? $cells[1]->find('div[class=deftip]', 0) : null;
        $seller_link = isset($cells[4]) ? $cells[4]->find('a', 0) : null;
        if (!$bottomPriceNode || !isset($cells[2]) || !$seller_link) {
            // Not shaped like an offer row; skip it instead of failing on a
            // missing cell.
            continue;
        }

        $bottom_price = $this->getPriceForPriceGrabber($bottomPriceNode->plaintext);
        $price = $this->getPriceForPriceGrabber($cells[2]->plaintext);
        // Shipping is the difference between the bottom-line and base price.
        $shopping_price = $bottom_price - $price;

        $href = $seller_link->getAttribute('href');
        $img = $seller_link->find('img', 0);
        if ($img) {
            $menu = $img->getAttribute('alt');
            $logo = $img->getAttribute('src');
        } else {
            $menu = $seller_link->plaintext;
            $logo = '';
        }

        $data = array(
            'name' => $name,
            'desc' => $desc,
            'manu' => $menu,
            'url' => $href,
            'productImage' => $product_image,
            'basePrice' => trim($price),
            'shippingPrice' => trim($shopping_price),
            'price' => trim($price),
            'logo' => $logo,
        );
        $items[] = $data;

        if ($debug) {
            echo '<br />';
            echo '--------------------------';
            echo '<br />';
            echo '<pre>';
            print_r($data);
            echo '</pre>';
            echo '<br />';
            echo 'Name: ' . $name;
            echo '<br />';
            echo 'Desc: ' . $desc;
            echo '<br />';
            echo 'Bottom Price: ' . $bottom_price;
            echo '<br />';
            echo 'Price: ' . $price;
            echo '<br />';
            echo 'Shipping Price: ' . $shopping_price;
            echo '<br />';
            echo 'Seller Name: ' . $menu;
            echo '<br />';
            echo 'Seller Image: ' . $logo;
            echo '<br />';
            echo 'Store Link: ' . $href;
            echo '<br />';
            echo '--------------------------';
            echo '<br />';
        }
    }

    $html->clear();
    return $items;
}
/**
 * Release the parsed documents and the stored response.
 *
 * Each parser is cleared only when it is actually set, so destroying an
 * object whose documents were never loaded does not fail with a call on null.
 */
public function __destruct() {
    if (isset($this->_html)) {
        $this->_html->clear();
    }
    if (isset($this->_editHtml)) {
        $this->_editHtml->clear();
    }
    unset($this->_response);
}
/**
 * Fetch the article list of a public account from the Sogou endpoint
 * (pages 1 and 2), download every article in parallel and store each one
 * through $this->weixin->_create().
 *
 * Reads 'openid' and 'wxid' from $_REQUEST. Echoes a message and exits when
 * the openid is missing or the account reports no articles; otherwise echoes
 * one "save success" line per stored article.
 *
 * @return void
 */
public function getXml() {
    $openid = isset($_REQUEST['openid']) ? trim($_REQUEST['openid']) : '';
    $wxid = isset($_REQUEST['wxid']) ? trim($_REQUEST['wxid']) : '';
    if (!$openid) {
        echo "no openid";
        exit;
    }

    $listUrl = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=" . urlencode($openid);

    // Fetch a list page as unescaped text and release the parser object that
    // file_get_html() creates.
    $fetchList = function ($pageUrl) {
        $dom = file_get_html($pageUrl);
        if (!$dom) {
            return '';
        }
        $text = stripslashes((string) $dom);
        $dom->clear();
        return $text;
    };

    // Remove one leading "<![CDATA[" and one trailing "]]>". ltrim()/rtrim()
    // take a character set, not a prefix, so they also stripped real leading
    // or trailing characters such as 'A', 'C', 'D', 'T' or '>'.
    $stripCdata = function ($text) {
        return preg_replace('/^<!\[CDATA\[|\]\]>$/', '', $text);
    };

    $json = $fetchList($listUrl);
    $matches = array();
    preg_match('/\\"totalItems\\"\\:(\\d+)/', $json, $matches);
    $itemCount = isset($matches[1]) ? $matches[1] : 0;
    if (intval($itemCount) == 0) {
        echo "该公众号没有发布文章";
        exit;
    }

    // Collect url/summary pairs from page 1 (already fetched) and page 2.
    $tmp = array();
    for ($page = 1; $page <= 2; $page++) {
        if ($page > 1) {
            $json = $fetchList($listUrl . '&page=' . $page);
        }
        preg_match_all("/<url>(.*?)<\\/url>/i", $json, $links, PREG_PATTERN_ORDER);
        preg_match_all("/<content>(.*?)<\\/content>/i", $json, $contents, PREG_PATTERN_ORDER);
        foreach ($links[1] as $idx => $rawLink) {
            $tmp[] = array(
                'url' => $stripCdata($rawLink),
                'content' => $stripCdata(isset($contents[1][$idx]) ? $contents[1][$idx] : ''),
            );
        }
    }

    // One handle per article, keyed by the article index so that the result
    // can be read back from the matching handle.
    $mh = curl_multi_init();
    $conn = array();
    foreach ($tmp as $m => $item) {
        $conn[$m] = curl_init($item['url']);
        curl_setopt($conn[$m], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
        curl_setopt($conn[$m], CURLOPT_HEADER, 0);
        curl_setopt($conn[$m], CURLOPT_CONNECTTIMEOUT, 60);
        // Return the fetched page as a string instead of printing it.
        curl_setopt($conn[$m], CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($mh, $conn[$m]);
    }

    $active = 0;
    do {
        $status = curl_multi_exec($mh, $active);
        if ($active) {
            // Wait for socket activity instead of spinning.
            curl_multi_select($mh, 1.0);
        }
    } while ($active && $status == CURLM_OK);

    foreach ($tmp as $m => $item) {
        $page = (string) curl_multi_getcontent($conn[$m]);
        $a = new simple_html_dom();
        $a->load($page);
        $titleNode = $a->find('h1#activity-name', 0);
        $bodyNode = $a->find('div#page-content', 0);
        $title = $titleNode ? $titleNode->outertext : '';
        $content = $bodyNode ? $bodyNode->outertext : '';
        $a->clear();

        $data = array(
            'wxh' => $wxid,
            'openId' => $openid,
            'link' => $item['url'],
            'title' => $title,
            'summary' => $item['content'],
            'content' => $content,
        );
        $this->weixin->_create($data);
        echo "save success" . $m . '<br>';
    }

    foreach ($conn as $handle) {
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
    }
    curl_multi_close($mh);
    unset($tmp);
}
$html_dom->load('<html><body>' . $html . '</body></html>'); // Note, we needed to nest the html in a couple of dummy elements. // Create the dom array of elements which we are going to work on: $html_dom_array = $html_dom->find('html', 0)->children(); // We need this for setting base_root and base_path in the initial_state array // (below). We are using a function here (derived from Drupal) to create these // paths automatically - you may want to do something different in your // implementation. This function is in the included file // documentation/support_functions.inc. $paths = htmltodocx_paths(); // Provide some initial settings: $initial_state = array('phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => array('size' => '11'), 'parents' => array(0 => 'body'), 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => TRUE, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => TRUE, 'treat_div_as_paragraph' => TRUE, 'style_sheet' => htmltodocx_styles_example()); // Convert the HTML and put it into the PHPWord object htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state); // Clear the HTML dom object: $html_dom->clear(); unset($html_dom); // Save File $h2d_file_uri = tempnam('', 'htd'); $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007'); $objWriter->save($h2d_file_uri); // Download the file: header('Content-Description: File Transfer'); header('Content-Type: application/octet-stream'); header('Content-Disposition: attachment; filename=example.docx'); header('Content-Transfer-Encoding: binary'); header('Expires: 0'); header('Cache-Control: must-revalidate, post-check=0, pre-check=0'); header('Pragma: public'); header('Content-Length: ' . filesize($h2d_file_uri)); ob_clean();
/**
 * Scrape one planning search-results page into a list of applications.
 *
 * Every result row whose markup contains one of the known status keywords is
 * treated as an application. For each one the details page and the site
 * location page are fetched. Applications without site location details (or
 * whose location table cannot be read) are left out of the result.
 *
 * @param string $html Markup of a search-results page.
 * @return array Map of application reference => record with the keys url,
 *               appref, date, applicant, address, details, latitude and
 *               longitude.
 */
function process_page($html) {
    global $authority_code;
    global $nearby_api_key;

    $rowSelector = "table[class='AppDetailsTable'] tr";
    $statuses = array('FINALISED', 'CONDITIONAL', 'APPEALED', 'WITHDRAWN', 'NEW<', 'APPROVED', 'REFUSED');

    // Plain text of cell $cellIdx in row $rowIdx, or null when the page does
    // not have that row/cell (instead of a fatal call on null).
    $cellText = function ($rows, $rowIdx, $cellIdx) {
        if (!isset($rows[$rowIdx])) {
            return null;
        }
        $cell = $rows[$rowIdx]->children($cellIdx);
        return $cell ? $cell->plaintext : null;
    };

    $dom = new simple_html_dom();
    $dom->load($html);
    $apps = array();

    foreach ($dom->find($rowSelector) as $row) {
        // The raw row markup is searched (not individual cells) so that a
        // keyword is found wherever it appears in the row.
        $rowHtml = (string) $row;
        $isApplication = false;
        foreach ($statuses as $status) {
            if (stripos($rowHtml, $status) !== false) {
                $isApplication = true;
                break;
            }
        }
        if (!$isApplication) {
            continue;
        }

        $refText = $row->children[0]->plaintext;
        $appref = $authority_code . substr($refText, 0, 2) . "/" . substr($refText, 2);
        $rawappref = trim($refText);
        $url = "http://planning.corkcity.ie/InternetEnquiry/rpt_ViewApplicDetails.asp?validFileNum=1&app_num_file=" . $rawappref;

        // Source date is dd/mm/yyyy; store it as yyyy-mm-dd.
        $rawdate = substr($row->children[4]->plaintext, 0, 10);
        $date = substr($rawdate, -4) . "-" . substr($rawdate, 3, 2) . "-" . substr($rawdate, 0, 2);
        $applicant = trim($row->children[5]->plaintext);
        $address = str_replace("<br>", ",", str_replace("<BR>", ",", $row->children[6]->innertext));

        // Part one: full description of the plan. The parser is cleared once
        // the text has been read so it does not accumulate per row.
        $fullappdom = new simple_html_dom();
        $fullappdom->load(scraperwiki::scrape($url));
        $detailRows = $fullappdom->find($rowSelector);
        $fullappdetails = (string) $cellText($detailRows, 15, 1);
        unset($detailRows);
        $fullappdom->clear();
        unset($fullappdom);

        // Part two: location of the application.
        $lochtml = scraperwiki::scrape('http://planning.corkcity.ie/InternetEnquiry/rpt_ViewSiteLocDetails.asp?page_num=0&file_number=' . $rawappref);
        if (stristr($lochtml, "No Site Location Details Found")) {
            continue;
        }

        $locdom = new simple_html_dom();
        $locdom->load($lochtml);
        $locRows = $locdom->find($rowSelector);
        $northingText = $cellText($locRows, 1, 1);
        $eastingText = $cellText($locRows, 1, 4);
        unset($locRows);
        $locdom->clear();
        unset($locdom);
        unset($lochtml);

        if ($northingText === null || $eastingText === null) {
            // No usable coordinates: treat like "no site location details".
            continue;
        }
        $locnorthing = round(floatval($northingText));
        $loceasting = round(floatval($eastingText));

        // Part three: convert easting/northing to WGS84.
        $c = new ConversionsLatLong();
        $res = $c->irish_to_wgs84($loceasting, $locnorthing);
        $lat = $res[0];
        $long = $res[1];

        $apps["{$appref}"] = array(
            'url' => $url,
            'appref' => $appref,
            'date' => $date,
            'applicant' => $applicant,
            'address' => $address,
            'details' => $fullappdetails,
            'latitude' => $lat,
            'longitude' => $long,
        );
    }

    $dom->clear();
    unset($dom);
    return $apps;
}
$idCidPoli = str_replace('\'', '', $idCidPoli); $idCidPoli = str_replace(');', '', $idCidPoli); //guarda a posição do espaço que separa o id do prefeito ou vereador do id do municipio $espaco = strripos($idCidPoli, ' '); //guarda o id do prefeito ou do vereador que é 11 ou 13 $codigoCargo = substr($idCidPoli, 0, $espaco); //guarda o id da cidade $codigoMunicipio = substr($idCidPoli, $espaco + 1); //modifica a url do ajax que é exibida na tela $urlAjaxPrefeitoVereador = "http://divulgacand2012.tse.jus.br/divulgacand2012/pesquisarCandidato.action?siglaUFSelecionada=" . $siglaUF . "&codigoMunicipio=" . $codigoMunicipio . "&codigoCargo=" . $codigoCargo . "&codigoSituacao=0"; $htmlCidade = new simple_html_dom(); //carrega o html que possui todos prefeitos ou vereadores da cidade $htmlCidade->load_file($urlAjaxPrefeitoVereador); //pega os input com o id e a ultima atualização do politico $candidato = $htmlCidade->find('tr[class="odd gradeX"] input'); $htmlCidade->clear(); unset($htmlCidade); //array para guardar ids dos candidatos e ids da ultima atualização do candidato $array = array("sqCandidato", "dtUltimaAtualizacao"); $i = 0; $j = 0; foreach ($candidato as $elemento) { if (strcmp($elemento->name, "sqCandidato") == 0) { $array["sqCandidato"][$i] = $elemento->value; $i++; } else { $array["dtUltimaAtualizacao"][$j] = $elemento->value; $j++; } } if ($i != $j) {
/**
 * Log in with the given credentials and (in the code past the early return)
 * walk to a resume page.
 *
 * As written, the function POSTs the login form with a cookie jar, prepares
 * a second cURL handle for $domain and then returns "success" without
 * executing it. Every statement after that return is unreachable.
 *
 * NOTE(review): the "******" literals look like redacted values (login URL
 * and the username part of the POST body); the POST-body line is not valid
 * PHP in this form - restore the original values before use.
 *
 * @param string $username Account name (see the note on the POST body).
 * @param string $password Account password, appended to the POST body.
 * @return string "success" on the only reachable path.
 */
function job51($username, $password) {
    // Configure the URL and login information
    $login = "******";
    $post_file = "username="******"&userpwd=" . $password;
    $cookie_file = tempnam('./temp', 'cookie');
    // Initialise curl
    $ch = curl_init($login);
    // Initialise a CURL object
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Do not print the result to the browser
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    // Pass the string holding all the data of the HTTP "POST" operation
    curl_setopt($ch, CURLOPT_POSTFIELDS, $post_file);
    // Save the returned cookie information in the cookie file
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
    $loginInfo = curl_exec($ch);
    // Execute
    curl_close($ch);
    // Close
    // Now in the logged-in state
    $domain = "http://www.51job.com";
    $url = "http://my.zhaopin.com/myzhaopin/resume_list.asp";
    //http://my.51job.com/sc/applyjob/preview_resume.php?ReSumeID=330146538&AccountID=100707500
    $ch = curl_init($domain);
    // Prevent redirect problems
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/4");
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Do not print the result to the browser
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
    // Early return: the handle above is neither executed nor closed, and
    // nothing below this line runs.
    return "success";
    $contents = curl_exec($ch);
    $html = new simple_html_dom();
    $html->load($contents);
    $resumePage = $html->find('.orange');
    // NOTE(review): $resume is not assigned before this point, so this test
    // would always take the else branch - presumably $resumePage was meant.
    if (isset($resume)) {
        return "登录失败";
    } else {
        return $resumePage;
    }
    $resumeLink = $resumePage[0]->href;
    $html->clear();
    curl_close($ch);
    // Close
    $ch = curl_init($resumeLink);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/4");
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Do not print the result to the browser
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
    $contents = curl_exec($ch);
    //icon18 iconSee
    $html->load($contents);
    $resume = $html->find('a[onfocus]');
    $url = $resume[22]->href;
    $html->clear();
    curl_close($ch);
    // Close
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/4");
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // Do not print the result to the browser
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
    $contents = curl_exec($ch);
    //icon18 iconSee
    return $contents;
}
/**
 * Parse the main results table (third table of the page) and save one record
 * per school, enriched with contact data from the school's detail page.
 *
 * The first child of the table is skipped as a header. For every other
 * non-empty row the text is split into fields, the detail page for the
 * school code (second field) is scraped for telephone, fax, e-mail, web
 * address and study courses, and the record is saved through
 * scraperwiki::save() when its name is longer than one character.
 *
 * @param string $html Markup of a results page.
 * @return void
 */
function create_dataset2($html) {
    $i = 0;
    $dom = new simple_html_dom();
    $dom->load($html);

    // Check that the table really exists before walking it.
    $table = $dom->find('table', 2);
    if (isset($table)) {
        foreach ($table->children() as $data) {
            echo "parsing info tabella principale";
            $isHeader = ($i == 0);
            $i = $i + 1;

            $res = trim($data->plaintext);
            if ($isHeader || strlen($res) == 0) {
                continue;
            }

            // NOTE(review): the search literal was corrupted in the source
            // (three bare quotes, which is not valid PHP); presumably it was
            // the apostrophe entity - confirm.
            $res = str_replace(array('&#039;', '&#39;'), "'", $res);

            // Split the row into fields; explode() replaces the removed
            // split() and behaves the same for a literal one-character
            // delimiter. Padding keeps the twelve field reads below defined.
            // NOTE(review): the delimiter is reproduced as it appears in the
            // source; it may originally have been another whitespace
            // character or entity - confirm.
            $array_result = array_pad(explode(' ', $res), 12, '');

            // Detail page for this school code.
            $codMPI = trim($array_result[1]);
            $url_MPI = "http://www.trampi.istruzione.it/ricScu/dettaglio.do?cod=" . $codMPI;
            $dom_mpi = new simple_html_dom();
            $dom_mpi->load(scraperwiki::scrape($url_MPI));

            $tel = "";
            $fax = "";
            $email = "";
            $web = "";
            $indS = "";
            foreach ($dom_mpi->find('table[cellspacing=1] tr') as $data_mpi) {
                // Each detail row reads "label: value".
                $values = explode(':', $data_mpi->plaintext . "\n");
                if (strlen($values[0]) == 0 || count($values) < 2) {
                    continue;
                }
                $label = $values[0];

                if (stripos($label, 'tel') !== false) {
                    $tel = trim($values[1]);
                } elseif (stripos($label, 'fax') !== false) {
                    $fax = trim($values[1]);
                } elseif (stripos($label, 'e-mail') !== false) {
                    $email = trim($values[1]);
                } elseif (stripos($label, 'web') !== false) {
                    // The address itself contains ':' (after the scheme), so
                    // the pieces after the label are glued back together.
                    // The original used each() with "$key = 2", an
                    // assignment that made the test always true.
                    if (count($values) > 2) {
                        $web = implode(':', array_slice($values, 1));
                    }
                } elseif (stripos($label, 'studio') !== false) {
                    $indS = str_replace('</td>', '', $values[1]);
                    $indS = str_replace('</tr>', '', $indS);
                    $indS = str_replace(array("\r", "\t", "\n"), '', $indS);
                    $indS = trim($indS);
                }
            }
            $dom_mpi->clear();
            unset($dom_mpi);

            $dataset = array(
                'denominazione' => trim(html_entity_decode($array_result[0])),
                'codiceMPI' => trim($array_result[1]),
                'tipologia' => trim(html_entity_decode($array_result[2])),
                'tipologiaIIgrado' => trim(html_entity_decode($array_result[3])),
                'descrizione' => trim(html_entity_decode($array_result[4])),
                'indirizzo' => trim(html_entity_decode($array_result[5])),
                'località' => trim(html_entity_decode($array_result[6])),
                'cap' => trim($array_result[7]),
                'comune' => trim(html_entity_decode($array_result[8])),
                'provincia' => trim(html_entity_decode($array_result[9])),
                'regione' => trim(html_entity_decode($array_result[10])),
                'codIstitutoComprensivo' => trim(html_entity_decode($array_result[11])),
                'telefono' => $tel,
                'fax' => $fax,
                'email' => $email,
                'web' => $web,
                'IndirizziStudio' => trim(html_entity_decode($indS)),
            );

            if (strlen($dataset['denominazione']) > 1) {
                scraperwiki::save(array('denominazione', 'codiceMPI'), $dataset);
            }
            unset($dataset, $array_result);
        }
    }

    // Release the parser in every case, otherwise memory piles up across
    // calls.
    $dom->clear();
    unset($dom);
}
/**
 * Build a DOM object from an HTML string.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to the parser constructor and load().
 * @param bool   $forceTagsClosed Passed to the parser constructor.
 * @param string $target_charset  Passed to the parser constructor.
 * @param bool   $stripRN         Passed to load().
 * @param string $defaultBRText   Passed to the parser constructor.
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               as judged by empty().
 */
public static function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT) {
    $parser = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $defaultBRText);

    if (!empty($str)) {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }

    // Nothing to parse: release the freshly created object and signal failure.
    $parser->clear();
    return false;
}