clear() public method

..
public clear ( )
/**
 * Scrape a single club page and persist one record per season.
 *
 * The club name is read from the third row of the first table on the page.
 * Season rows are read from the third table; only rows whose first cell is
 * numeric (the year) are stored, keyed on club + year, via scraperwiki::save().
 * The formatted club name is also appended to the global `clubs` list.
 *
 * @param string $url Address of the club page to scrape.
 * @return void
 */
function clubURL($url)
{
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);
    // NOTE(review): the search string below is a plain space as far as this
    // file shows; it may originally have been a non-breaking space - confirm.
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));
    // Inside a function a variable named $_GLOBAL is just an ordinary local,
    // so the name has to go through the $GLOBALS superglobal to be visible
    // outside this call.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";
    foreach ($dom->find('table', 2)->find('tr') as $row) {
        // Header and spacer rows do not start with a numeric year; skip them.
        if (is_numeric($row->find('td', 0)->plaintext)) {
            $year = trim($row->find('td', 0)->plaintext);
            $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
            // The winning season is labelled with text instead of a rank.
            if (trim($position) == 'Champion') {
                $position = 1;
            }
            $leagueLevel = trim($row->find('td', 2)->plaintext);
            $overallPosition = trim($row->find('td', 3)->plaintext);
            // Dots are stripped from the attendance figures before storing.
            $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
            $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));
            $dataset = array('club' => $formatClubName, 'year' => $year, 'finishedPosition' => $position, 'league' => $leagueLevel, 'overallPosition' => $overallPosition, 'avgAttendance' => $avgAttendance, 'totalAttendance' => $totalAttendance);
            scraperwiki::save(array('club', 'year'), $dataset);
        }
    }
    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
Ejemplo n.º 2
0
 /**
  * Convert an HTML fragment to a Word 2007 document and write it to disk.
  *
  * The fragment is wrapped in dummy <html><body> elements, parsed with
  * simple_html_dom, fed through htmltodocx_insert_html() into a PHPWord
  * section and saved under "<$dir>wordtemp/<unix time>.docx". The wordtemp
  * folder is created through $this->createFolders() when it does not exist.
  *
  * @param string $html HTML fragment to convert.
  * @param string $dir  Base directory; concatenated directly with "wordtemp/".
  * @return string Path of the generated .docx file.
  */
 public function save($html, $dir)
 {
     import("@.ORG.htmltodocx.documentation.support_functions");
     $phpword_object = new PHPWord();
     $section = $phpword_object->createSection();
     // HTML Dom object:
     $html_dom = new simple_html_dom();
     // Note, we need to nest the html in a couple of dummy elements.
     $html_dom->load('<html><body>' . $html . '</body></html>');
     // Create the dom array of elements which we are going to work on:
     $html_dom_array = $html_dom->find('html', 0)->children();
     // We need this for setting base_root and base_path in the initial_state array
     // (below). We are using a function here (derived from Drupal) to create these
     // paths automatically - you may want to do something different in your
     // implementation. This function is in the included file
     // documentation/support_functions.inc.
     $paths = htmltodocx_paths();
     // Provide some initial settings:
     $initial_state = array(
         'phpword_object' => &$phpword_object,
         'base_root' => $paths['base_root'],
         'base_path' => $paths['base_path'],
         'current_style' => array('size' => '11'),
         'parents' => array(0 => 'body'),
         'list_depth' => 0,
         'context' => 'section',
         'pseudo_list' => TRUE,
         'pseudo_list_indicator_font_name' => 'Wingdings',
         'pseudo_list_indicator_font_size' => '7',
         'pseudo_list_indicator_character' => 'l ',
         'table_allowed' => TRUE,
         'treat_div_as_paragraph' => TRUE,
         'style_sheet' => htmltodocx_styles_example(),
     );
     // Convert the HTML and put it into the PHPWord object
     htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
     // Clear the HTML dom object:
     $html_dom->clear();
     unset($html_dom);
     // Save File
     $target_dir = $dir . "wordtemp/";
     $h2d_file_uri = $target_dir . time() . ".docx";
     // Check whether the target folder exists; create it when it does not.
     if (!file_exists($target_dir)) {
         $this->createFolders($target_dir);
     }
     $objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
     $objWriter->save($h2d_file_uri);
     return $h2d_file_uri;
 }
Ejemplo n.º 3
0
 /**
  * Parse the form's print template, rebuild its item elements and persist the result.
  *
  * Loads $this->printmodel into simple_html_dom, hands every <ic> element to
  * $this->refactor() together with the item configuration, then stores the
  * collected elements in $this->_cache and CacheUtil, and writes the
  * re-serialised markup (plus the item counter when it changed) back through
  * FlowFormType.
  *
  * @param bool $isUpdate When true, data-id attributes are stripped from the
  *                       template and the item counter restarts at 0.
  * @return void
  */
 public function parse($isUpdate = false)
 {
     Ibos::import("application.extensions.simple_html_dom", true);
     // On update the existing data-id attributes are dropped and numbering starts over.
     $template = $isUpdate
         ? preg_replace("/\\s+data-id\\s?=\\s?\"?\\d+\"?/i", "", $this->printmodel)
         : $this->printmodel;
     $itemMax = $isUpdate ? 0 : intval($this->itemmax);
     $collected = array();
     $dom = new simple_html_dom();
     $dom->load($template, true, true, CHARSET);
     $icNodes = $dom->find("ic");
     $itemConfig = $this->getItemConfig();
     $hasWork = !empty($icNodes) && !empty($itemConfig);
     if ($hasWork) {
         $this->refactor($icNodes, $itemConfig, $itemMax, $collected);
     }
     $markup = $dom->save();
     $this->_cache = $collected;
     CacheUtil::set("form_" . $this->ID, $collected);
     $changes = array("printmodelshort" => $markup);
     // Only persist the counter when it differs from the stored value.
     if ($itemMax != $this->itemmax) {
         $changes["itemmax"] = $itemMax;
     }
     // Release the parser's internal references before saving.
     $dom->clear();
     FlowFormType::model()->modify($this->ID, $changes);
 }
/**
 * Build a simple_html_dom object from an HTML string.
 *
 * @param string $str             Markup to parse.
 * @param bool   $lowercase       Passed to the constructor and to load().
 * @param bool   $forceTagsClosed Passed to the constructor.
 * @param string $target_charset  Passed to the constructor.
 * @param bool   $stripRN         Passed to the constructor and to load().
 * @param string $defaultBRText   Passed to the constructor.
 * @param string $defaultSpanText Passed to the constructor.
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE bytes.
 */
function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT) {
    $parser = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    $isParsable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($isParsable) {
        $parser->load($str, $lowercase, $stripRN);
        return $parser;
    }
    // Nothing usable to parse: release the freshly built object and signal failure.
    $parser->clear();
    return false;
}
Ejemplo n.º 5
0
/**
 * Render an HTML document into a Word 2007 file at the given path.
 *
 * @param string $html                   Markup to convert; the children of its
 *                                       first <html> element are processed.
 * @param string $file_path              Destination path of the .docx file.
 * @param mixed  $file_takeout_tmp_files Passed by reference into the converter
 *                                       state as 'download_img_tmp'.
 * @return void
 */
function generate_docx($html, $file_path, &$file_takeout_tmp_files)
{
    $word = new PHPWord();
    $wordSection = $word->createSection();

    $dom = new simple_html_dom();
    $dom->load($html);
    $topLevel = $dom->find('html', 0)->children();

    $sitePaths = htmltodocx_paths();
    // Converter state consumed by htmltodocx_insert_html().
    $state = array(
        'phpword_object' => &$word,
        'base_root' => $sitePaths['base_root'],
        'base_path' => $sitePaths['base_path'],
        'current_style' => array('size' => '11'),
        'parents' => array(0 => 'body'),
        'list_depth' => 0,
        'context' => 'section',
        'pseudo_list' => TRUE,
        'pseudo_list_indicator_font_name' => 'Wingdings',
        'pseudo_list_indicator_font_size' => '7',
        'pseudo_list_indicator_character' => 'l ',
        'table_allowed' => TRUE,
        'treat_div_as_paragraph' => FALSE,
        'style_sheet' => htmltodocx_styles(),
        'download_img_path' => elgg_get_data_path(),
        'download_img_tmp' => &$file_takeout_tmp_files,
    );
    htmltodocx_insert_html($wordSection, $topLevel[0]->nodes, $state);

    // The parsed tree is no longer needed once its content is in the PHPWord object.
    $dom->clear();
    unset($dom);

    // Word2007 is the only option :-(
    $writer = PHPWord_IOFactory::createWriter($word, 'Word2007');
    $writer->save($file_path);
}
Ejemplo n.º 6
0
 /**
  * Return the fragment with the markup of each top-level child element removed.
  *
  * The fragment is wrapped in a <crawler> element so it has a single root,
  * then the string form of every direct child element is removed from the
  * root's inner HTML with str_replace().
  *
  * @param string $a_sHTML HTML fragment (taken by reference, not modified here).
  * @return string What is left of the fragment after the removals.
  */
 private function clean_children(&$a_sHTML)
 {
     $dom = new simple_html_dom();
     $dom->load('<crawler>' . $a_sHTML . '</crawler>');
     $root = $dom->find('crawler', 0);
     $remaining = (string) $root->innertext;
     // Removing text from the string does not touch the DOM, so the child list is stable.
     foreach ($root->children() as $child) {
         $remaining = str_replace($child, '', $remaining);
     }
     $dom->clear();
     unset($root, $dom);
     return $remaining;
 }
Ejemplo n.º 7
0
 /**
  * Append a sub-menu of heading anchors after every link that targets a page by id.
  *
  * For each <a> whose href ends in "?page_id=<digits>", the referenced page is
  * fetched with get_page(), its <h1> elements that carry an id are collected,
  * and a <ul class='submenu'> linking to those anchors is inserted directly
  * after the link.
  *
  * The markup is wrapped in <html><body>...</body></html> for parsing and the
  * returned string still includes that wrapper, as it did before.
  *
  * @param string $foo Markup to process.
  * @return string The processed markup.
  */
 function add_h_filter($foo)
 {
     $p = new simple_html_dom();
     // The wrapper is now closed properly instead of repeating the opening tags.
     $p->load('<html><body>' . $foo . '</body></html>');
     foreach ($p->find("a") as $elm) {
         error_log($elm->href);
         if (!preg_match("/\\?page_id\\=(\\d+?)\$/", $elm->href, $match)) {
             continue;
         }
         $page_id = $match[1];
         $page = get_page($page_id);
         // Skip links to pages that cannot be loaded instead of reading a property of null.
         if (!$page) {
             continue;
         }
         $p2 = new simple_html_dom();
         $p2->load('<html><body>' . $page->post_content . '</body></html>');
         $submenu_array = array();
         foreach ($p2->find('h1[id]') as $idh1) {
             error_log($idh1->id);
             $submenu_array[] = array($idh1->id, $idh1->plaintext);
         }
         if (sizeof($submenu_array) !== 0) {
             $submenu = "<ul class='submenu'>\n";
             $blogurl = get_bloginfo('url');
             foreach ($submenu_array as $sub) {
                 // Each item is closed with </li>; it used to be closed with a stray </h1>.
                 $submenu .= '<li><a href="' . $blogurl . '?page_id=' . $page_id . '#' . $sub[0] . '">' . $sub[1] . '</a></li>' . "\n";
             }
             $submenu .= '</ul>';
             $elm->outertext = $elm->outertext . $submenu;
         }
         // Release the per-page tree before moving on to the next link.
         $p2->clear();
         unset($p2);
     }
     $foo = $p->outertext;
     $p->clear();
     unset($p);
     return $foo;
 }
Ejemplo n.º 8
0
	/**
	 * Insert a share button into the button row of the given share table.
	 *
	 * Creates the table (class "<position> myApiShareTable") when the markup has
	 * none - before the text for 'myApiShareTop', after it otherwise - then
	 * prepends the button as a new cell to the nested table inside the
	 * ".myApiButtons" row, creating that row when needed. Finally, when the
	 * bottom table has a comments cell and a button row, the cell's colspan is
	 * set to the number of <td> elements found in that row.
	 *
	 * @param string $text     Markup to modify.
	 * @param string $position CSS class identifying the target table.
	 * @param string $button   Markup of the button to insert.
	 * @return string The modified markup.
	 */
	function addToTable($text,$position,$button){
		$tableSelector = '.'.$position;
		$dom = new simple_html_dom();
		$dom->load($text);

		// Make sure the container table exists before trying to fill it.
		if(!$dom->find($tableSelector,0)){
			$emptyTable = '<table class="'.$position.' myApiShareTable"></table>';
			if($position == 'myApiShareTop'){
				$text = $emptyTable.$text;
			}else{
				$text = $text.$emptyTable;
			}
			$dom->load($text);
		}

		// Serialise and re-parse so the following lookups run on a fresh tree.
		$text = $dom->save();
		$dom->load($text);

		$container = $dom->find($tableSelector,0);
		$buttonsRow = $container->find('.myApiButtons',0);
		if($buttonsRow){
			// Row already present: put the new button in front of the existing cells.
			$innerRow = $buttonsRow->find('table',0)->find('tr',0);
			$innerRow->innertext = '<td>'.$button.'</td>'.$innerRow->innertext;
		}else{
			$newRow = '<tr class="myApiButtons"><td><table><tr><td>'.$button.'</td></tr></table></td></tr>';
			$container->innertext = $newRow.$container->innertext;
		}

		$text = $dom->save();
		$dom->load($text);

		// Keep the comments cell as wide as the button row of the bottom table.
		$bottomTable = $dom->find('.myApiShareBottom',0);
		$commentsCell = $bottomTable ? $bottomTable->find('.myApiCommentsCell',0) : null;
		$bottomButtons = $commentsCell ? $bottomTable->find('.myApiButtons',0) : null;
		if($bottomButtons){
			$commentsCell->colspan = sizeof($bottomButtons->find('td'));
			$text = $dom->save();
		}

		$dom->clear();
		unset($dom);
		return $text;
	}
Ejemplo n.º 9
0
 /**
  * Parse a fetched article page and store it as a Model_Horoscope_XungHop record.
  *
  * Reads the post date from div.datetime and the body from div#content_document,
  * strips comments, links and author/source paragraphs from the body, saves the
  * story, and - unless one of the signs is the '-' placeholder - downloads the
  * thumbnail and every inline image into assets/horoscope/xung-hop/<alias>/ and
  * rewrites the image sources to the local copies. On success the source link is
  * flagged through Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus().
  *
  * @param string $_HTML_CONTENT Raw HTML of the article page.
  * @param string $_Cung1        First sign; '-' marks it as unknown.
  * @param string $_Cung2        Second sign; '-' marks it as unknown.
  * @param string $_Summary      Summary text stored as tom_tat.
  * @param string $_SourceUri    Address the page was fetched from.
  * @param mixed  $_LinkId       Id passed to UpdateRecordStatus() on success.
  * @param string $_ImageLink    Address of the thumbnail image.
  * @return false|null false when there is nothing to parse or the content
  *                    block is missing; otherwise nothing is returned.
  */
 public static function _Process_Recieved_Content($_HTML_CONTENT, $_Cung1, $_Cung2, $_Summary, $_SourceUri, $_LinkId, $_ImageLink)
 {
     if ($_HTML_CONTENT == '') {
         self::_print_to_console('-> nothing to do');
         return false;
     }
     // Create a DOM object and load the HTML from the string.
     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
     $html = new simple_html_dom();
     $html->load($_HTML_CONTENT);
     unset($_HTML_CONTENT);

     $story = new Model_Horoscope_XungHop();
     // False when either sign is the '-' placeholder; stored as kiem_tra and
     // used below to decide whether images are downloaded.
     $ktra = !($_Cung1 == '-' || $_Cung2 == '-');
     $story->cung_1 = $_Cung1;
     $story->cung_2 = $_Cung2;
     $story->alias = $_Cung1 . '_' . $_Cung2;
     if (self::CheckRecordByAlias($story->alias)) {
         $story->alias = $_Cung1 . '__' . $_Cung2;
     }
     $story->tom_tat = $_Summary;
     $story->ngay_tao = date("Y-m-d");
     $story->url_nguon = $_SourceUri;
     $story->auto_get = true;

     // Post date: default to now, override when the page carries a parsable date.
     // "H" (24-hour clock) is used because "h" would store afternoon times as morning.
     $story->source_date = date("Y-m-d H:i:s");
     $date = $html->find('div[class="datetime"]', 0);
     if ($date) {
         $d = explode(',', $date->plaintext);
         if (isset($d[1])) {
             // The part after the comma is read as "dd/mm/yyyy hh:mm".
             $d1 = explode(' ', trim($d[1]));
             list($ngay, $thang, $nam) = explode('/', $d1[0]);
             $story->source_date = date("Y-m-d H:i:s", strtotime($nam . '-' . $thang . '-' . $ngay . ' ' . $d1[1] . ':00'));
         }
     }

     // Article body.
     $content = $html->find('div[id="content_document"]', 0);
     if (!$content) {
         self::_print_to_console('-> content not found');
         // Release the parsed tree on this exit path as well.
         $html->clear();
         unset($html);
         return false;
     }
     $string = $content->innertext;
     // Remove line breaks and tabs.
     $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
     // Drop HTML comments, unwrap links, and remove author/source paragraphs.
     $string = preg_replace('/(<!--.+?-->)/s', '', $string);
     $string = preg_replace('@<a[^>]*>(.*)</a>@ismUx', '$1', $string);
     $string = preg_replace('/<p[ ]class="pAuthor">.*<\\/p>/ismxU', '', $string);
     $string = preg_replace('/<p[ ]class="pSource">.*<\\/p>/ismxU', '', $string);
     $story->noi_dung = $string;
     $story->kiem_tra = $ktra;
     $story->save();

     if ($story->identifier()) {
         if ($ktra) {
             // Get the thumbnail, save it to disk, then point the record at it.
             $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
             $img = Vendor_Crawler::get_file_from_url_by_curl($_ImageLink, $path, $story->alias . '-thumb');
             if ($img) {
                 // A zero-byte file means the download failed: fall back to the default thumb.
                 if (filesize($img) == 0) {
                     @copy('assets/horoscope/thumb_140.jpg', $img);
                 }
                 $story->hinh_anh = '/' . $img;
             } else {
                 $story->hinh_anh = $_ImageLink;
             }

             // Download every inline image and rewrite its src to the local copy.
             $html2 = new simple_html_dom();
             $html2->load($story->noi_dung);
             foreach ($html2->find('img') as $i => $image) {
                 unset($image->onclick);
                 // Parenthesised on purpose: before PHP 8 "'anh_' . $i + 1"
                 // was evaluated as ('anh_' . $i) + 1 and produced 1.
                 $file_name = 'anh_' . ($i + 1);
                 $get_file = Vendor_Crawler::get_file_from_url_by_curl($image->src, $path, $file_name);
                 $local_file = ltrim((string) $get_file, '/');
                 if ($local_file === '' || !is_file($local_file) || filesize($local_file) == 0) {
                     // Download failed: leave the image as it is. The earlier
                     // unset() on the result array never changed the document
                     // either, but it shortened the loop and skipped later images.
                     // NOTE(review): decide whether failed images should be removed.
                     continue;
                 }
                 $image->src = '/' . $get_file;
             }
             $story->noi_dung = $html2->save();
             $html2->clear();
             unset($html2);
         } else {
             $story->hinh_anh = $_ImageLink;
         }
         $story->save();
         // Insert done => update the status in the temporary link table.
         Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus($_LinkId);
         self::_print_to_console('Done: ' . $_SourceUri);
     } else {
         self::_print_to_console('Fail:' . $_SourceUri);
     }
     $html->clear();
     unset($html);
 }
Ejemplo n.º 10
0
/**
 * Scrape one flight board page and save a record per flight row.
 *
 * @param string $param Value of the "type" query parameter appended to BASE_URL.
 * @param string $type  Flight type stored with every record.
 * @return void
 */
function scrapeHTML($param, $type)
{
    $page = scraperWiki::scrape(BASE_URL . "?type={$param}");
    $dom = new simple_html_dom();
    $dom->load($page);
    // Only table rows with HEIGHT='25' are considered.
    foreach ($dom->find("TR[@HEIGHT='25']") as $row) {
        $cells = $row->find("td");
        // Cell order: airline, flight number, destination, time, gate, remarks.
        $airline = removeSpaces($cells[0]->plaintext);
        $flightNumber = removeSpaces($cells[1]->plaintext);
        $destination = removeSpaces($cells[2]->plaintext);
        $time = removeSpaces($cells[3]->plaintext);
        $gate = removeSpaces($cells[4]->plaintext);
        $remarks = removeSpaces($cells[5]->plaintext);
        // The header row has the same height; only real flights are stored.
        if ($airline != "Airline") {
            $record = array(
                "date" => date("m.d.y"),
                "airline" => $airline,
                "flight_type" => $type,
                "flight_num" => $flightNumber,
                "destination" => $destination,
                "time" => $time,
                "gate" => $gate,
                "remarks" => $remarks,
            );
            saveData(array("date", "airline", "flight_type", "flight_num"), $record);
        }
    }
    $dom->clear();
}
Ejemplo n.º 11
0
 /**
  * Parse page content and tag image links so Lightbox Plus can bind to them.
  *
  * Depending on the 'text_links' option either every link to an image file is
  * processed, or only links that contain an <img>. For each affected link:
  *  - the configured class is added when 'use_class_method' is on;
  *  - the group attribute ("data-<data_name>" when 'output_htmlv' is on,
  *    otherwise "rel") is set to "lightbox[<post id><unq_id>]" unless the
  *    link already has one;
  *  - the title is cleared when 'no_display_title' is on, otherwise a missing
  *    title is filled from the image or the post title, and - when
  *    'use_caption_title' is on - from an adjacent caption element.
  *
  * @param string $html_content Markup to process.
  * @param mixed  $unq_id       Suffix appended to the post ID in the group name.
  * @return string The processed markup.
  */
 function lightboxPlusReplace($html_content, $unq_id)
 {
     global $post;
     // Defined up front so setMissingOptions() never receives an undefined variable.
     $lightboxPlusOptions = null;
     if (!empty($this->lightboxOptions)) {
         $lightboxPlusOptions = $this->getAdminOptions($this->lightboxOptionsName);
     }
     /**
      * Remove following line after a few versions or 2.6 is the prevalent version
      */
     $lightboxPlusOptions = $this->setMissingOptions($lightboxPlusOptions);
     $postGroupID = $post->ID;
     $postGroupTitle = $post->post_title;
     $groupValue = 'lightbox[' . $postGroupID . $unq_id . ']';
     $imageSelector = 'img[src*=jpg$], img[src*=gif$], img[src*=png$], img[src*=jpeg$], img[src*=bmp$]';

     $html = new simple_html_dom();
     $html->load($html_content, false, false);

     // With text links enabled the matched element is the link itself;
     // otherwise it is the <img> and the link is its parent.
     $matchLinks = $lightboxPlusOptions['text_links'] == 1;
     $selector = $matchLinks
         ? 'a[href*=jpg$], a[href*=gif$], a[href*=png$], a[href*=jpeg$], a[href*=bmp$]'
         : 'a[href*=jpg$] img, a[href*=gif$] img, a[href*=png$] img, a[href*=jpeg$] img, a[href*=bmp$] img';

     foreach ($html->find($selector) as $e) {
         $link = $matchLinks ? $e : $e->parent();

         /**
          * Generate HTML5 yes/no - decides which attribute carries the group name
          */
         if ($lightboxPlusOptions['output_htmlv'] == 1) {
             $groupAttr = 'data-' . $lightboxPlusOptions['data_name'];
         } else {
             $groupAttr = 'rel';
         }
         /**
          * Use Class Method is selected - yes/no
          */
         if ($lightboxPlusOptions['use_class_method'] == 1) {
             if ($link->class && $link->class != $lightboxPlusOptions['class_name']) {
                 $link->class .= ' ' . $lightboxPlusOptions['class_name'];
             } else {
                 $link->class = $lightboxPlusOptions['class_name'];
             }
         }
         // Never overwrite a group the author set by hand.
         if (!$link->{$groupAttr}) {
             $link->{$groupAttr} = $groupValue;
         }

         /**
          * Do Not Display Title is select - yes/no
          */
         if ($lightboxPlusOptions['no_display_title'] == 1) {
             $link->title = null;
             continue;
         }

         /**
          * If title doesn't exist then get a title:
          * image alt/title first, post title as the fallback
          */
         if ($matchLinks) {
             $firstChild = $e->first_child();
             if (!$e->title && $firstChild) {
                 if ($firstChild->alt) {
                     $e->title = $firstChild->alt;
                 } else {
                     $e->title = $postGroupTitle;
                 }
             }
         } elseif (!$link->title) {
             if ($e->title) {
                 $link->title = $e->title;
             } else {
                 $link->title = $postGroupTitle;
             }
         }

         /**
          * If use caption for title try to get the text from the caption - this could be wrong
          */
         if (!$lightboxPlusOptions['use_caption_title']) {
             continue;
         }
         // In image mode the lookup is additionally gated on the element
         // containing a matching <img>, exactly as before.
         if (!$matchLinks && !$e->find($imageSelector)) {
             continue;
         }
         // These checks used "=" and therefore overwrote the neighbour's class
         // and always succeeded. They now compare, tolerate missing neighbours,
         // and accept the caption class among several classes.
         $caption = null;
         $sibling = $e->next_sibling();
         if ($sibling && strpos(' ' . $sibling->class . ' ', ' wp-caption-text ') !== false) {
             $caption = $sibling;
         } else {
             $parent = $e->parent();
             $parentSibling = $parent ? $parent->next_sibling() : null;
             if ($parentSibling && strpos(' ' . $parentSibling->class . ' ', ' gallery-caption ') !== false) {
                 $caption = $parentSibling;
             }
         }
         if ($caption) {
             $e->title = $caption->innertext;
         }
     }

     $content = $html->save();
     $html->clear();
     unset($html);
     return $content;
 }
Ejemplo n.º 12
0
 /**
  * Edit action for a story.
  *
  * Redirects to the index when the story does not exist. On a valid POST the
  * story fields are updated and saved; when the alias changed, the old image
  * folder is removed, the inline images are downloaded again into the folder
  * of the new alias and their sources rewritten, then the list is shown.
  * On an invalid POST the validation errors are passed to the edit view.
  *
  * @param mixed $story_id Identifier of the story to edit.
  * @return void
  */
 public function action_sua($story_id)
 {
     $this->template->title = __('Sửa bài viết: xung - hợp cung');
     $this->template->section_title = __('Sửa bài viết: xung - hợp cung');
     $story = Model_Horoscope_XungHopBLL::getInstance()->find($story_id);
     if (!$story) {
         Request::instance()->redirect('admin/horoscope_xunghop/index');
         return;
     }

     $data = array();
     if (Request::$method == 'POST') {
         $post = $story->validate_update($_POST);
         if ($post->check()) {
             // Begin save.
             $values = $post->as_array();
             $previousAlias = $story->alias;
             $story->hinh_anh = $values['hinh_anh'];
             $story->alias = $values['alias'];
             $story->cung_1 = $values['cung_1'];
             $story->cung_2 = $values['cung_2'];
             $story->tom_tat = trim($values['tom_tat']);
             $story->noi_dung = $values['noi_dung'];
             $story->kiem_tra = true;
             $story->save();

             // Alias changed => image folder changed => inline images must be fetched again.
             if ($previousAlias != $values['alias']) {
                 // Remove the old folder (if it existed when updating).
                 @rmdir('assets/horoscope/xung-hop/' . $previousAlias . '/');
                 require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
                 $dom = new simple_html_dom();
                 $dom->load($story->noi_dung);
                 $counter = 0;
                 foreach ($dom->find('img') as $image) {
                     $counter++;
                     unset($image->onclick);
                     $folder = 'assets/horoscope/xung-hop/' . $story->alias . '/';
                     $downloaded = Vendor_Crawler::get_file_from_url_by_curl($image->src, $folder, 'anh_' . $counter);
                     // NOTE(review): a zero-size download leaves the image element
                     // untouched; confirm whether it was meant to be removed.
                     if (filesize($downloaded) != 0) {
                         $image->src = '/' . $downloaded;
                     }
                 }
                 $story->noi_dung = $dom->save();
                 $dom->clear();
                 unset($dom);
                 $story->save();
             }
             Request::instance()->redirect('admin/horoscope_xunghop/index');
         } else {
             $_POST = $post->as_array();
             // Expose the validation errors to the view.
             $data['errors'] = $post->errors();
         }
     }
     $data['story'] = $story->toArray();
     $this->template->content = View::factory('horoscope/admin/xung-hop/sua', $data);
 }
Ejemplo n.º 13
0
 /**
  * Collect the title and content of a remote page and echo them as JSON.
  *
  * The target address is read from the posted `url` field. Handling order:
  *  1. if uVideoUpload() returns anything other than '10'/'11', its result is
  *     sent back as the content together with the page <title>;
  *  2. if the host is listed in the MUSIC_WEBSITES config, the quote markup
  *     is fetched from xiami's getquote endpoint;
  *  3. otherwise the page is treated as an article, using the selector rules
  *     stored for the domain when there are any, else <title> and <body>.
  *
  * Cases 1 and 2, and the case where no host can be extracted, end the
  * request with exit right after echoing the JSON payload.
  *
  * @return void
  */
 public function collect()
 {
     $url = trim($this->_post('url'));
     // Response payload
     $res = array('title' => '', 'content' => '');
     // Walks a chain of find() steps (each with 'match' and 'pos') starting at
     // $root and returns $property of the final node, or '' when any step
     // yields nothing. This replaces the former eval() of a code string built
     // from database values, which executed stored text as PHP.
     $resolve = function ($root, $steps, $property) {
         $node = $root;
         foreach ($steps as $step) {
             if (!$node) {
                 return '';
             }
             $node = $node->find($step['match'], (int) $step['pos']);
         }
         return $node ? $node->{$property} : '';
     };
     $titleFallback = array(array('match' => 'title', 'pos' => 0));
     $bodyFallback = array(array('match' => 'body', 'pos' => 0));
     // Check whether the page contains a video
     $video = $this->uVideoUpload($url);
     if ($video != '10' && $video != '11') {
         // Fetch the page title
         $htm = file_get_html($url);
         $res['title'] = $resolve($htm, $titleFallback, 'plaintext');
         if ($htm) {
             $htm->clear();
         }
         $res['content'] = $video;
         echo json_encode($res);
         exit;
     }
     // No video: handle the page as an article
     $collect = D('collect');
     $domin = '';
     // Prepend http:// only when the URL has no scheme, and accept https too;
     // previously an https URL failed the match and left $out[1] undefined.
     if (!preg_match('#^https?://#i', $url)) {
         $url = "http://" . $url;
     }
     if (preg_match('#^https?://([^/]*)#i', $url, $out)) {
         $domin = $out[1];
     }
     if (!empty($domin)) {
         // Is this one of the configured music sites?
         $music_websites = (array) C('MUSIC_WEBSITES');
         if (in_array($domin, $music_websites)) {
             $htm = file_get_html($url);
             // Serialise the markup once for the regex probes below
             // ('' when the fetch failed).
             $page = (string) $htm;
             $token = preg_match('/var\\s*?_xiamitoken\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $page, $out) ? $out[1] : '';
             // e.g. onclick="playalbum(682938274, '', '时间的歌', '');
             $xid = '';
             $title = '';
             if (preg_match('/playalbum\\((\\d+),\\s*?\'*?\',\\s*?\'(.*?)\',\\s*?\'*?\'\\)/i', $page, $out)) {
                 $xid = $out[1];
                 $title = $out[2];
             } elseif (preg_match('/\\/album\\/(\\d{1,})/', $page, $out)) {
                 $xid = $out[1];
                 $title = $resolve($htm, array(array('match' => 'div#title', 'pos' => 0)), 'plaintext');
             } elseif (preg_match('/var\\s*?cid\\s*?=\\s*?[\'\\"](.*?)[\'\\"]/i', $page, $out)) {
                 // e.g. var cid = '22454617';
                 $xid = $out[1];
                 $title = $resolve($htm, $titleFallback, 'plaintext');
             }
             // Release the first document before fetching the second one.
             if ($htm) {
                 $htm->clear();
             }
             if ($xid) {
                 // e.g. http://www.xiami.com/ajax/getquote/type/2/id/682938274?_xiamitoken=0802020a13ba3df687e7ca4ef45cf1a8
                 $zurl = "http://www.xiami.com/ajax/getquote/type/2/id/{$xid}?_xiamitoken={$token}";
                 $quote = file_get_html($zurl);
                 $res['title'] = trim($title);
                 $res['content'] = $resolve($quote, array(array('match' => 'textarea.tarea', 'pos' => 1)), 'innertext');
                 // Free the parser memory
                 if ($quote) {
                     $quote->clear();
                 }
             } else {
                 $res['title'] = '';
                 $res['content'] = '没有找到音乐';
             }
             echo json_encode($res);
             exit;
         }
         // Look for an existing record for this domain. An array condition is
         // used so the model escapes the value; the host comes from user input.
         $c = $collect->where(array('domain' => $domin))->find();
         if (!$c) {
             // No record yet: register the domain and fall back to the page
             // <title> for the title and the whole <body> for the content.
             $collect->data(array('alias' => $domin, 'domain' => $domin, 'match' => '123'))->add();
             $htm = file_get_html($url);
             $res['title'] = $resolve($htm, $titleFallback, 'plaintext');
             $res['content'] = $resolve($htm, $bodyFallback, 'innertext');
         } else {
             // A rule set exists for this domain.
             // Sina blog URLs: strip the trailing ?tj=... parameter
             if ($domin == 'blog.sina.com.cn') {
                 $url = preg_replace('/\\?tj=.*/i', '', $url);
             }
             $htm = file_get_html($url);
             // Title
             $matchlist = $this->collect_match->get_matchlist_by_collect_type($c['id'], self::TYPE_TITLE);
             $res['title'] = $resolve($htm, !empty($matchlist) ? $matchlist : $titleFallback, 'plaintext');
             // Content
             $matchlist = $this->collect_match->get_matchlist_by_collect_type($c['id'], self::TYPE_CONTENT);
             $res['content'] = $resolve($htm, !empty($matchlist) ? $matchlist : $bodyFallback, 'innertext');
             // Site-specific image handling
             if ($domin == 'history.people.com.cn') {
                 $res['content'] = preg_replace("/src=\"(.*?)\"/i", 'src="http://' . $domin . "\$1" . '"', $res['content']);
             } elseif ($domin == 'blog.sina.com.cn') {
                 // Sina keeps the real image address in real_src; move it
                 // into src using a separate DOM instance.
                 $new_html = new simple_html_dom();
                 $new_html->load($res['content']);
                 foreach ($new_html->find('img') as $img) {
                     // Leave images without real_src untouched so they keep
                     // their original src.
                     if ($img->real_src) {
                         $img->src = $img->real_src;
                         $img->real_src = null;
                     }
                 }
                 $res['content'] = $new_html->innertext;
                 $new_html->clear();
             }
         }
         // Free the parser memory
         if ($htm) {
             $htm->clear();
         }
     } else {
         $res['title'] = '';
         $res['content'] = '';
         echo json_encode($res);
         exit;
     }
     // Clean HTML tags from the collected text
     $res['content'] = $this->clearhtml($res['content']);
     $res['title'] = trim($this->clearhtml($res['title']));
     // Character-set conversion
     $no_need_iconv = (array) C('NO_NEED_ICONV');
     if (!in_array($domin, $no_need_iconv)) {
         // history.sina.com.cn is converted from GBK, every other host from GB2312
         $charset = $domin == 'history.sina.com.cn' ? "GBK" : "GB2312";
         $res['title'] = iconv($charset, "UTF-8//IGNORE", $res['title']);
         $res['content'] = iconv($charset, "UTF-8//IGNORE", $res['content']);
     }
     echo json_encode($res);
 }
Ejemplo n.º 14
0
    /**
     * Build a Word2007 (docx) document from the book's HTML body.
     *
     * Prepends cover, author, series, annotation and credit markup to $text,
     * inlines footnotes, rewrites the markup with a series of regular
     * expressions, feeds the result through htmltodocx into PhpWord and
     * returns the bytes of the written file.
     *
     * Note: $this->annotation is rewritten in place as part of the conversion.
     *
     * @param string $text HTML body of the book.
     *
     * @return string|false Contents of the generated file as returned by file_get_contents().
     */
    protected function convertImpl($text)
    {
        // Every key interpolated into the document below is initialised here;
        // 'sequence' used to be undefined for books without a series title.
        $descr = ['book_title' => $this->nameru, 'author' => '', 'annotation' => '', 'coverpage' => '', 'sequence' => ''];
        /* Width and height are unimportant. Actual resizing is done not in this class. We must save aspect ratio though. */
        $imgTag = function ($image) {
            return "<img src=\"" . $image['thumbnail'] . "\" width=\"" . $image['convert_width'] . "\" height=\"" . $image['convert_height'] . "\" />";
        };
        foreach ([$this->author, $this->illustrator] as $aut) {
            if ($aut) {
                foreach (explode(',', $aut) as $au) {
                    $descr['author'] .= "<h1>" . $this->escapexml(trim($au)) . "</h1>";
                }
            }
        }
        if ($this->annotation) {
            $this->annotation = preg_replace('@\\n@', '</p><p>', $this->annotation);
            $this->annotation = preg_replace("@'''(.*?)'''@", '<b>\\1</b>', $this->annotation);
            $this->annotation = preg_replace("@''(.*?)''@", '<i>\\1</i>', $this->annotation);
            $this->annotation = preg_replace('@<p></p>@', '<br/>', $this->annotation);
            $descr['annotation'] = "<h2>Аннотация</h2><p>{$this->annotation}</p>";
        }
        if ($this->covers) {
            $cover = $this->covers[0];
            $descr['coverpage'] = $imgTag($this->images[$cover]);
            $descr['coverpage_n'] = $cover;
        }
        // NOTE(review): coverpage_n, translator, date2, id and isbn are filled
        // in but not read anywhere else in this method.
        if ($this->translators) {
            foreach ($this->translators as $translator) {
                if (!array_key_exists('translator', $descr)) {
                    $descr['translator'] = '';
                }
                $descr['translator'] .= "<p name=\"translator\">" . $this->escapexml($translator) . "</p>";
            }
        }
        if ($this->seriestitle) {
            $descr['sequence'] = "<h1>" . $this->escapexml($this->seriestitle) . ($this->seriesnum ? " {$this->seriesnum}" : '') . " </h1>";
        }
        $descr['date2'] = date('j F Y, H:i', $this->touched);
        $descr['id'] = 'RuRa_' . str_replace('/', '_', $this->nameurl);
        if ($this->isbn) {
            $descr['isbn'] = ";isbn:{$this->isbn}";
        }
        // One "<activity>: <names>" paragraph per worker group; shared by all
        // three credit variants below.
        $workerLines = '';
        if ($this->workers) {
            foreach ($this->workers as $activity => $workers) {
                $workerLines .= '<p>' . $activity . ': <b>' . implode('</b>, <b>', $workers) . "</b></p>\n";
            }
        }
        $version = date('d.m.Y', $this->touched);
        if ($this->command == 'RuRa-team') {
            $credit = "<h2>Реквизиты переводчиков</h2>\n \t\t\t\t         <p>Над переводом работала команда <b>RuRa-team</b></p>\n";
            $credit .= $workerLines;
            $credit .= '<p>Самый свежий перевод всегда можно найти на сайте нашего проекта:</p>
				          <p><a href="http://ruranobe.ru">http://ruranobe.ru</a></p>
 				          <p>Чтобы оставаться в курсе всех новостей, вступайте в нашу группу в Контакте:</p>
				          <p><a href="http://vk.com/ru.ranobe">http://vk.com/ru.ranobe</a></p>
						  <p>Для желающих отблагодарить переводчика материально имеются webmoney-кошельки команды:</p>
						  <p><b>R125820793397</b></p>
						  <p><b>U911921912420</b></p>
						  <p><b>Z608138208963</b></p>
						  <p>QIWI-кошелек:</p>
						  <p><b>+79116857099</b></p>
						  <p>Яндекс-деньги:</p>
						  <p><b>410012692832515</b></p>
                          <p>PayPal:</p>
                          <p><b>paypal@ruranobe.ru</b></p>
						  <p>А так же счет для перевода с кредитных карт:</p>
						  <p><b>4890 4941 5384 9302</b></p>
						  <p>Версия от ' . $version . '</p>
						  <p></p>
						  <p></p>
						  <p></p>
						  <p><b>Любое распространение перевода за пределами нашего сайта запрещено. Если вы скачали файл на другом сайте - вы поддержали воров</b></p>
						  <p></p>
						  <p></p>
						  <p></p>';
        } elseif (strpos($this->command, 'RuRa-team') !== false) {
            $credit = "<h2>Реквизиты переводчиков</h2>\n\t\t\t\t\t\t <p>Над релизом работали {$this->command}</p>\n";
            $credit .= $workerLines;
            $credit .= '<p>Самый свежий перевод всегда можно найти на сайте нашего проекта:</p>
						  <p><a l:href="http://ruranobe.ru">http://ruranobe.ru</a></p>
						  <p>Чтобы оставаться в курсе всех новостей, вступайте в нашу группу в Контакте:</p>
						  <p><a l:href="http://vk.com/ru.ranobe">http://vk.com/ru.ranobe</a></p>
						  <p>Версия от ' . $version . '</p>
						  <p><b>Любое коммерческое использование данного текста или его фрагментов запрещено</b></p>';
        } else {
            $credit = "<h2>Реквизиты переводчиков</h2>";
            if ($this->command) {
                $credit .= "<p>Перевод команды {$this->command}</p>";
            }
            $credit .= $workerLines;
            $credit .= '<p>Версия от ' . $version . '</p>
						  <p><b>Любое коммерческое использование данного текста или его фрагментов запрещено</b></p>';
        }
        if ($this->height == 0) {
            // Height 0: strip every image (and a wrapping <p>) from the text.
            $text = preg_replace('/(<p[^>]*>)?<img[^>]*>(<\\/p>)?/u', '', $text);
        } else {
            // Covers after the first one are prepended to the text. The count
            // is guarded so an empty/unset cover list no longer reaches count().
            $coverCount = $this->covers ? count($this->covers) : 0;
            for ($i = 1; $i < $coverCount; ++$i) {
                $text = $imgTag($this->images[$this->covers[$i]]) . $text;
            }
            // Replace resource placeholders with plain <img> tags; negative
            // resource ids are dropped.
            $text = preg_replace_callback('/(<a[^>]*>)?<img[^>]*data-resource-id="(-?\\d*)"[^>]*>(<\\/a>)?/u', function ($match) use ($imgTag) {
                if ($match[2] < 0) {
                    return '';
                }
                return $imgTag($this->images[$match[2]]);
            }, $text);
        }
        // $this->footnotes is a ",;,"-separated list in which a numeric id is
        // followed by the footnote body. Stopping one element early keeps a
        // trailing id without a body from reading past the end of the array.
        $footnotes = [];
        $footnotes_temp = explode(',;,', (string) $this->footnotes);
        $footnoteParts = count($footnotes_temp);
        for ($i = 0; $i + 1 < $footnoteParts; $i++) {
            if (is_numeric($footnotes_temp[$i])) {
                $footnotes[$footnotes_temp[$i]] = $footnotes_temp[$i + 1];
                $i++;
            }
        }
        $text = trim($text);
        $epubText = "<html>\n\t<body>\n\t\t{$descr['coverpage']}\n\t\t{$descr['author']}\n\t\t{$descr['sequence']}\n\t    {$descr['annotation']}\n\t\t{$credit}\n\t\t{$text}\n\t</body>\n\t</html>";
        // Inline each footnote where its reference appears; references without
        // a (non-empty) footnote are left untouched.
        $epubText = preg_replace_callback('@(<span[^>]*><a href="#cite_note-(\\d*)"[^>]*>.{0,15}</span>)@', function ($match) use ($footnotes) {
            if (!isset($footnotes[$match[2]])) {
                return $match[1];
            }
            $footnote = preg_replace('@</p>\\s*<p[^>]*>@', '<br/>', $footnotes[$match[2]]);
            if ($footnote) {
                return '<footnote>' . $footnote . '</footnote>';
            }
            return $match[1];
        }, $epubText);
        // NOTE(review): this and the "pb" replacement below match the bare
        // substring anywhere in the document, including inside visible text.
        $epubText = preg_replace('@section@', "div", $epubText);
        /* Delete extra <br/> tag before images */
        $epubText = preg_replace('@<div>(.){0,20}<br\\/>(.){0,20}<img src@', '<div><img src', $epubText);
        /* Eliminate caret return before <h1> (Each div starts with caret return in h2d_htmlconverter.php) */
        $epubText = preg_replace('@\\s*<div>(.{0,40})(<h1>.*?<\\/h1>)@', '\\1\\2<div>', $epubText);
        $epubText = preg_replace('@pb@', "br", $epubText);
        // PHPWord doesn't support tags nested in link element. Unnest images from them
        $epubText = preg_replace('@<a[^>]*>(<img[^>]*>)<\\/a>@', "\\1", $epubText);
        // Delete extra page breaks related to images.
        $epubText = preg_replace('@<div[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/div>@', "\\1\\2\\3", $epubText);
        $epubText = preg_replace('@<p[^>]*>(.){0,20}(<img[^>]*>)(.){0,20}<\\/p>@', "\\1\\2\\3", $epubText);
        /* Swap h2 and img tags if img follows h2. (It gave a bad look in docx). */
        $epubText = preg_replace('@(<h2>.{0,100}<\\/h2>)(<img[^>]*>)@', '\\2\\1', $epubText);
        /* After the swap the img tag often has to be lifted into the preceding <div> or <p> */
        $epubText = preg_replace('@<\\/div>(<img[^>]*>)<h2@', '\\1</div><h2', $epubText);
        $epubText = preg_replace('@<\\/p>(<img[^>]*>)<h2@', '\\1</p><h2', $epubText);
        $phpword_object = new \PhpOffice\PhpWord\PhpWord();
        \PhpOffice\PhpWord\Settings::setCompatibility(false);
        $html_dom = new \simple_html_dom();
        $html_dom->load($epubText);
        $html_dom_array = $html_dom->find('html', 0)->children();
        $paths = htmltodocx_paths();
        $initial_state = ['phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => ['size' => '11'], 'parents' => [0 => 'body'], 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => true, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => true, 'treat_div_as_paragraph' => true, 'structure_headings' => true, 'structure_document' => true, 'style_sheet' => htmltodocx_styles_example()];
        htmltodocx_insert_html($phpword_object, $html_dom_array[0]->nodes, $initial_state);
        // Release the parser's memory explicitly.
        $html_dom->clear();
        unset($html_dom);
        // Write to a temporary file, read the bytes back, then remove the file.
        $h2d_file_uri = tempnam(sys_get_temp_dir(), 'htd');
        $objWriter = \PhpOffice\PhpWord\IOFactory::createWriter($phpword_object, 'Word2007');
        $objWriter->save($h2d_file_uri);
        $bin = file_get_contents($h2d_file_uri);
        unlink($h2d_file_uri);
        return $bin;
    }
/**
 * Extract planning applications from a search-results page.
 *
 * Every row of the AppDetailsTable whose markup contains one of the known
 * status words is turned into an application record. For each such row the
 * application's detail page and site-location page are fetched as well. A
 * record is only stored when the site-location page has location details.
 *
 * Reads the global $authority_code, which is used as the reference prefix.
 *
 * @param string $html Markup of the results page.
 *
 * @return array Application records keyed by application reference.
 */
function process_page($html)
{
    global $authority_code;
    $dom = new simple_html_dom();
    $dom->load($html);
    $apps = array();
    $statuses = array('FINALISED', 'CONDITIONAL', 'APPEALED', 'WITHDRAWN', 'NEW<', 'APPROVED', 'REFUSED');
    # mobile,council,gov etc
    $business = array("retail", "Hotel", "Ltd", "Limited", " shop");
    foreach ($dom->find("table[class='AppDetailsTable'] tr") as $row) {
        # The status test runs on the row's markup rather than on individual
        # cells (kept from the original, which avoided the DOM here on
        # purpose). The row is stringified once instead of once per test.
        $rowhtml = (string) $row;
        $is_application = false;
        foreach ($statuses as $status) {
            if (stripos($rowhtml, $status) !== false) {
                $is_application = true;
                break;
            }
        }
        if (!$is_application) {
            continue;
        }
        $refcell = $row->children[0]->plaintext;
        $appref = $authority_code . substr($refcell, 0, 2) . "/" . substr($refcell, 2);
        $rawappref = trim($refcell);
        $url = "http://planning.corkcity.ie/InternetEnquiry/rpt_ViewApplicDetails.asp?validFileNum=1&app_num_file=" . $rawappref;
        # The date cell is read as dd?mm?yyyy and rearranged to yyyy-mm-dd
        $rawdate = substr($row->children[4]->plaintext, 0, 10);
        $date = substr($rawdate, -4) . "-" . substr($rawdate, 3, 2) . "-" . substr($rawdate, 0, 2);
        $applicant = trim($row->children[5]->plaintext);
        $address = str_replace("<br>", ",", str_replace("<BR>", ",", $row->children[6]->innertext));
        # Additional information, part one: full description, significance and
        # size. All three come from the same detail page, so it is fetched and
        # parsed once (it used to be scraped twice) and then released.
        $detaildom = new simple_html_dom();
        $detaildom->load(scraperwiki::scrape($url));
        $fullappdetails = $detaildom->find("table[class='AppDetailsTable'] tr", 15)->children(1)->plaintext;
        $sizerow = $detaildom->find("table[class='AppDetailsTable'] tr", 23);
        $signifdetail = $sizerow->children(1)->plaintext;
        $sizedetail = $sizerow->children(4)->plaintext;
        $detaildom->clear();
        unset($detaildom, $sizerow);
        $protected = strpos($fullappdetails, "Protected Structure") !== false ? "Protected Structure" : "";
        # Category: "Business" when the row mentions a business word,
        # otherwise "residential" when the description mentions a dwelling.
        $category = "";
        foreach ($business as $businessword) {
            if (strpos($rowhtml, $businessword) !== false) {
                $category = "Business";
                break;
            }
        }
        if ($category === "" && strpos($fullappdetails, "dwelling") !== false) {
            $category = "residential";
        }
        # Part two, location of application
        $lochtml = scraperwiki::scrape('http://planning.corkcity.ie/InternetEnquiry/rpt_ViewSiteLocDetails.asp?page_num=0&file_number=' . $rawappref);
        if (!stristr($lochtml, "No Site Location Details Found")) {
            $locdom = new simple_html_dom();
            $locdom->load($lochtml);
            $locrow = $locdom->find("table[class='AppDetailsTable'] tr", 1);
            $locnorthing = round(floatval($locrow->children(1)->plaintext));
            $loceasting = round(floatval($locrow->children(4)->plaintext));
            # Part three, convert E&N to WGS84 using geograph class
            $c = new ConversionsLatLong();
            $res = $c->irish_to_wgs84($loceasting, $locnorthing);
            $lat = $res[0];
            $long = $res[1];
            $locdom->clear();
            unset($locdom, $locrow);
            $apps[$appref] = array('url' => $url, 'appref' => $appref, 'date' => $date, 'applicant' => $applicant, 'address' => $address, 'details' => $fullappdetails, 'signif' => $signifdetail, 'size' => $sizedetail, 'category' => $category, 'protected' => $protected, 'latitude' => $lat, 'longitude' => $long);
        }
        unset($lochtml);
    }
    $dom->clear();
    unset($dom);
    return $apps;
}
Ejemplo n.º 16
0
// Run 1: a new DOM object is created for every iteration and is explicitly
// cleared and unset before the memory figure is printed.
for ($i = 0; $i < 3; ++$i) {
    $dom = file_get_dom($filename);
    //stat_dom($dom);
    $dom->clear();
    unset($dom);
    dump_memory();
    flush();
}
echo 'final: ' . number_format(memory_get_usage(), 0, '.', ',') . '<br>';
flush();
// Run 2 ("one object"): a single DOM object is reused; the file is loaded
// into it and cleared again on every iteration, and the object is only unset
// after the loop.
echo '<br><br>[one object]<br>init memory: ' . number_format(memory_get_usage(), 0, '.', ',') . '<br>';
echo '------------------------------------------<br>';
flush();
$dom = new simple_html_dom();
for ($i = 0; $i < 3; ++$i) {
    $dom->load_file($filename);
    $dom->clear();
    dump_memory();
}
unset($dom);
echo 'final: ' . number_format(memory_get_usage(), 0, '.', ',') . '<br>';
flush();
// Run 3 ("multi objects without clear memory"): a new DOM object per
// iteration with no clear()/unset, for comparison with the runs above.
// $dom is simply overwritten each time.
echo '<br><br>[multi objects without clear memory]<br>init memory: ' . number_format(memory_get_usage(), 0, '.', ',') . '<br>';
echo '------------------------------------------<br>';
flush();
for ($i = 0; $i < 3; ++$i) {
    $dom = file_get_dom($filename);
    dump_memory();
}
echo 'final: ' . number_format(memory_get_usage(), 0, '.', ',') . '<br>';
flush();
Ejemplo n.º 17
0
 /**
  * Parse the response returned by requestURL().
  *
  * @return mixed The raw response when it is empty or shorter than 100 bytes;
  *               the string 'no_order' when the page has no trade-number
  *               cells; otherwise a list of arrays with the keys
  *               time, title, trade, name and amount.
  */
 public function parse()
 {
     require_once dirname(__FILE__) . '/simple_html_dom.php';
     $data = $this->requestURL();
     // The length test used to read strlen($data < 100), which measured the
     // result of the comparison instead of the response itself.
     if (empty($data) || strlen($data) < 100) {
         return $data;
     }
     // Per the original note: an empty page means the cookie has expired.
     $html = new simple_html_dom();
     $html->load($data);
     $ymd = $html->find('.time-d');
     $his = $html->find('.time-h');
     $title = $html->find('.consume-title a');
     $trade = $html->find('td.tradeNo p');
     $name = $html->find('p.name');
     $amount = $html->find('td.amount span');
     if (!$trade) {
         // Release the parser before the early return as well.
         $html->clear();
         return 'no_order';
     }
     $info = array();
     foreach ($ymd as $key => $value) {
         // The node lists are matched up by position; skip a position that
         // is missing from any of them instead of reading an undefined index.
         if (!isset($his[$key], $title[$key], $trade[$key], $name[$key], $amount[$key])) {
             continue;
         }
         // Keep only the numeric part of the order number
         $tradeNo = array();
         preg_match('/\\d+/', $trade[$key]->innertext, $tradeNo);
         // Further checks could be added here, e.g. looking the order up in a
         // database to see whether it has already been notified successfully.
         $info[] = array('time' => trim($value->innertext) . ' ' . trim($his[$key]->innertext), 'title' => trim($title[$key]->innertext), 'trade' => isset($tradeNo[0]) ? trim($tradeNo[0]) : '', 'name' => trim($name[$key]->innertext), 'amount' => trim(str_replace('+', '', $amount[$key]->innertext)));
     }
     $html->clear();
     return $info;
 }
Ejemplo n.º 18
0
 /**
  * Download an article page and extract its title, content, description and
  * thumbnail.
  *
  * Image sources inside the content are rewritten to a dated path below
  * ../uploads/ (the directory is created when missing).
  *
  * @param array  $arrUrl         Must contain 'url'; 'thumbnailUrl' overrides the detected thumbnail when non-empty.
  * @param array  $arrClass       Selectors under the keys 'title', 'content' and 'description'.
  * @param mixed  $arrImgExpert   Not used by this method.
  * @param string $folder_name    Sub-folder appended to the dated upload path.
  * @param array  $arrPregReplace Regex patterns whose matches are replaced by a space in the content.
  * @param array  $arrStrReplace  Literal strings replaced by a space in the content.
  * @param string $domain         Not used by this method.
  * @param string $classMore      Not used by this method.
  *
  * @return array Empty when no URL is given; only 'url' when the page cannot
  *               be fetched or has no title node; otherwise url, title,
  *               content, description and thumbnailUrl.
  */
 function getDetailArticle($arrUrl, $arrClass, $arrImgExpert, $folder_name, $arrPregReplace, $arrStrReplace, $domain = '', $classMore = '')
 {
     $detailArr = array();
     $url = $arrUrl['url'];
     if (!$url) {
         return $detailArr;
     }
     $ch = curl_init();
     curl_setopt($ch, CURLOPT_URL, $url);
     curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
     $result = curl_exec($ch);
     curl_close($ch);
     $detailArr['url'] = $url;
     if ($result === false || $result === '') {
         // Nothing to parse; same result as a page without a title node.
         return $detailArr;
     }
     // Create a DOM object and load the HTML from the downloaded string
     $html = new simple_html_dom();
     $html->load($result);
     $titleNode = $html->find($arrClass['title'], 0);
     if (!$titleNode) {
         // Release the parser on this early return too.
         $html->clear();
         unset($html);
         return $detailArr;
     }
     $detailArr['title'] = strip_tags(preg_replace('#<span (.*?)</span>#', ' ', $titleNode->innertext()));
     // Both were previously read without ever being initialised when the page
     // had no images or no content node.
     $urlHinh = '';
     $contentHtml = '';
     $content = $html->find($arrClass['content'], 0);
     if ($content) {
         // Evaluate the dated folder once so every image of the article ends
         // up in the same directory.
         $uploadDir = "../uploads/" . date('Y/m/d') . "/" . $folder_name . "/";
         foreach ($content->find('img') as $img) {
             // Drop the query string from the image address
             $src = (string) $img->src;
             $queryAt = strpos($src, '?');
             if ($queryAt !== false) {
                 $src = substr($src, 0, $queryAt);
             }
             $tenfile = basename($src);
             $arrPartImage = explode('.', $tenfile);
             // Get image extension
             $imgExt = array_pop($arrPartImage);
             // Get image name without extension and without a _WxH suffix
             $imgs = preg_replace('/(.*)(_\\d+x\\d+)/', '$1', implode('.', $arrPartImage));
             $imgs = $this->changeTitle($imgs);
             // NOTE(review): $name is built but the path below uses the
             // original basename - confirm which one is intended.
             $name = "{$imgs}.{$imgExt}";
             if (!is_dir($uploadDir)) {
                 mkdir($uploadDir, 0777, true);
             }
             $img->src = $uploadDir . $tenfile;
             // The first image becomes the default thumbnail
             if ($urlHinh == '') {
                 $urlHinh = $img->src;
             }
             $img->class = "aligncenter";
         }
         $contentHtml = $content->innertext();
     }
     if (!empty($arrPregReplace)) {
         foreach ($arrPregReplace as $preg) {
             $contentHtml = preg_replace($preg, ' ', $contentHtml);
         }
     }
     if (!empty($arrStrReplace)) {
         foreach ($arrStrReplace as $strre) {
             $contentHtml = str_replace($strre, ' ', $contentHtml);
         }
     }
     $detailArr['content'] = $contentHtml;
     if (!isset($arrClass['description']) || $arrClass['description'] == '') {
         // No description selector: use the first 255 characters of the text
         $tmp = strip_tags($contentHtml);
         $detailArr['description'] = $this->string_limit($tmp, 255);
     } else {
         $descriptionNode = $html->find($arrClass['description'], 0);
         $detailArr['description'] = $descriptionNode ? $descriptionNode->innertext() : '';
     }
     if (!isset($arrUrl['thumbnailUrl']) || $arrUrl['thumbnailUrl'] == '') {
         $detailArr['thumbnailUrl'] = $urlHinh;
     } else {
         $detailArr['thumbnailUrl'] = $arrUrl['thumbnailUrl'];
     }
     $html->clear();
     unset($html);
     return $detailArr;
 }
Ejemplo n.º 19
0
if (isset($_GET['zkzh'])) {
    $src = 'http://www.chsi.com.cn/cet/query';
    $id = $_GET['zkzh'];
    $name = $_GET['xm'];
    $ch = curl_init();
    curl_setopt_array($ch, array(CURLOPT_URL => $src . '?zkzh=' . $id . '&xm=' . $name, CURLOPT_RETURNTRANSFER => true, CURLOPT_POST => false, CURLOPT_REFERER => 'http://www.chsi.com.cn/cet/'));
    $content = curl_exec($ch);
    if (curl_errno($ch) == 0) {
        $html = new simple_html_dom();
        $html->load($content);
        $table = $html->find('table[class=cetTable]', 0);
        if (!$table) {
            $str = "请确认姓名或准考证号是否正确!";
        } else {
            $text = $table->outertext;
            $html->clear();
            $table->clear();
            unset($html);
            $str = $text;
            $str = str_replace('<table border="0" align="center" cellpadding="0" cellspacing="6" class="cetTable">  	<tr>  		<th>', "", $str);
            $str = str_replace('</th>  		<td>', "", $str);
            $str = str_replace('</td>  	</tr>  	<tr>  		<th>', "\n", $str);
            $str = str_replace('<strong><span style="color: #F00;">', "", $str);
            $str = str_replace('</span>', "", $str);
            $str = str_replace('&nbsp;&nbsp;', "", $str);
            $str = str_replace('<span class="color01">', "\n", $str);
            $str = str_replace('</strong></td>  	</tr>  </table>', "\n\n查询数据来源于学信网\nOURStudio提供技术支持.", $str);
        }
        echo $str;
    } else {
        echo curl_error($ch);
Ejemplo n.º 20
0
 /**
  * Apply site-specific fixes to the image sources of collected markup.
  *
  * @param string $url     Page address (not used by this method).
  * @param string $domain  Host the content was collected from.
  * @param string $content Collected HTML.
  *
  * @return string The adjusted HTML; unchanged for hosts without a rule.
  */
 private function process_special_content($url, $domain, $content)
 {
     // Hosts whose src attributes just get "http://<host>" prepended
     if ($domain == 'history.people.com.cn' || $domain == 'www.jfdaily.com') {
         return preg_replace("/src=\"(.*?)\"/i", 'src="http://' . $domain . "\$1" . '"', $content);
     }
     if ($domain == 'www.nowamagic.net') {
         // src values starting with ../../ are rebased onto the library root
         $base = 'http://www.nowamagic.net/librarys/';
         return preg_replace("/src=\"\\.\\.\\/\\.\\.\\/(.*?)\"/i", 'src="' . $base . "\$1" . '"', $content);
     }
     if ($domain == 'blog.sina.com.cn') {
         // Sina keeps the real image address in real_src; move it into src.
         $new_html = new simple_html_dom();
         $new_html->load($content);
         // Nodes are objects, so no by-reference iteration is needed (the
         // former "as &$img" left a dangling reference behind).
         foreach ($new_html->find('img') as $img) {
             // Images without real_src keep their original src instead of
             // having it overwritten with an empty value.
             if ($img->real_src) {
                 $img->src = $img->real_src;
                 $img->real_src = null;
             }
         }
         $content = $new_html->innertext;
         $new_html->clear();
     }
     return $content;
 }
Ejemplo n.º 21
0
                                }
                            }
                            $subcat2->clear();
                            unset($subcat2);
                        }
                    }
                    $subcat1->clear();
                    unset($subcat1);
                }
            }
            $category->clear();
            unset($category);
        }
    }
}
$cat_url->clear();
unset($cat_url);
echo "<p>Всего спарсили url " . $cat_counter . " категорий </p>";
fclose($cat_handle);
/* if (($handle = fopen("price.csv", "r")) !== FALSE) {
    while (($data = fgetcsv($handle, 1000, ";")) !== FALSE) {
       
        $articuls[] = $data[2];		 
    }
	foreach ($articuls as $articul){
	
	
    }
	fclose($handle);
}
 */
Ejemplo n.º 22
0
 /**
  * Look a product up on pricegrabber.com by UPC and return its seller offers.
  *
  * Raises max_execution_time and memory_limit for the request, loads the
  * search page and reads the product name, description, image and every row
  * of the pricing table up to the first "noseller" row.
  *
  * @param string $upc Product code used as the search keyword.
  *
  * @return array One array per offer with the keys name, desc, manu, url,
  *               productImage, basePrice, shippingPrice, price and logo;
  *               empty when the product block or pricing table is missing.
  */
 public function fetchDataFromPriceGrabberApi($upc)
 {
     ini_set('max_execution_time', 999999);
     ini_set('memory_limit', '400M');
     require_once dirname(BASEPATH) . '/system/application/libraries/simple_html_dom.php';
     $debug = false;
     // The keyword is URL-encoded so characters such as "&" or spaces cannot
     // break or extend the query string.
     $url = 'http://www.pricegrabber.com/search_request.php?form_keyword=' . urlencode($upc) . '&some_id=&id_type=&requestParams=Tjs%3D&vendorIds=YTowOnt9&st=query&sv=findit_top&kw_suggest=0&topcat_menu=6&zip_code=54001';
     if ($debug) {
         echo '<br />' . $url . '<br />';
     }
     $html = new simple_html_dom();
     $html->load_file($url);
     $description = $html->find('div[class=product_description]', 0);
     $heading = $description ? $description->find('h1', 0) : null;
     $price_table = $html->find('table[class=pricing_tbl]', 0);
     if (!$heading || !$price_table) {
         $html->clear();
         return array();
     }
     $name = $heading->plaintext;
     $desc = '';
     $details = $html->find('p[id=product_details_description]', 0);
     if ($details) {
         $desc = $details->plaintext;
     }
     $product_image = '';
     $image_box = $html->find('div[class=product_img]', 0);
     $image_node = $image_box ? $image_box->find('img', 0) : null;
     if ($image_node) {
         $product_image = $image_node->getAttribute('src');
     }
     $items = array();
     if ($debug) {
         echo $price_table;
     }
     foreach ($price_table->find('tr') as $tr) {
         // Skip section rows
         if (strpos((string) $tr->getAttribute('class'), 'section') !== false) {
             continue;
         }
         $first_cell = $tr->find('td', 0);
         if (!$first_cell) {
             // Row without data cells: nothing to read
             continue;
         }
         if ('noseller' == $first_cell->getAttribute('class')) {
             break;
         }
         $total_cell = $tr->find('td', 1);
         $total_node = $total_cell ? $total_cell->find('div[class=deftip]', 0) : null;
         $price_cell = $tr->find('td', 2);
         $seller_cell = $tr->find('td', 4);
         $seller_link = $seller_cell ? $seller_cell->find('a', 0) : null;
         if (!$total_node || !$price_cell || !$seller_link) {
             // Row without the expected layout; skip it rather than
             // dereferencing a missing node.
             continue;
         }
         $bottom_price = $this->getPriceForPriceGrabber($total_node->plaintext);
         $price = $this->getPriceForPriceGrabber($price_cell->plaintext);
         // Reported as the shipping price: the deftip figure minus the price
         $shopping_price = $bottom_price - $price;
         $href = $seller_link->getAttribute('href');
         $img = $seller_link->find('img', 0);
         if ($img) {
             $menu = $img->getAttribute('alt');
             $logo = $img->getAttribute('src');
         } else {
             $menu = $seller_link->plaintext;
             $logo = '';
         }
         $data = array('name' => $name, 'desc' => $desc, 'manu' => $menu, 'url' => $href, 'productImage' => $product_image, 'basePrice' => trim($price), 'shippingPrice' => trim($shopping_price), 'price' => trim($price), 'logo' => $logo);
         $items[] = $data;
         if ($debug) {
             echo '<br />';
             echo '--------------------------';
             echo '<br />';
             echo '<pre>';
             print_r($data);
             echo '</pre>';
             echo '<br />';
             echo 'Name: ' . $name;
             echo '<br />';
             echo 'Desc: ' . $desc;
             echo '<br />';
             echo 'Bottom Price: ' . $bottom_price;
             echo '<br />';
             echo 'Price: ' . $price;
             echo '<br />';
             echo 'Shipping Price: ' . $shopping_price;
             echo '<br />';
             echo 'Seller Name: ' . $menu;
             echo '<br />';
             echo 'Seller Image: ' . $logo;
             echo '<br />';
             echo 'Store Link: ' . $href;
             echo '<br />';
             echo '--------------------------';
             echo '<br />';
         }
     }
     $html->clear();
     return $items;
 }
Ejemplo n.º 23
0
 /**
  * Destructor: frees what this object is holding on to.
  *
  * Calls clear() on the _html and _editHtml members (presumably
  * simple_html_dom documents - confirm against the constructor) and then
  * unsets the _response member.
  */
 public function __destruct()
 {
     foreach (array($this->_html, $this->_editHtml) as $document) {
         $document->clear();
     }
     unset($this->_response);
 }
Ejemplo n.º 24
0
 /**
  * Fetch the article list of a WeChat public account from Sogou and store each article.
  *
  * Reads `openid` and `wxid` from $_REQUEST, downloads the first two pages of the
  * account's article feed, fetches every listed article in parallel with
  * curl_multi, extracts the title and body from each page and hands one record
  * per article to $this->weixin->_create(). Progress is echoed. The script exits
  * early when no openid was supplied or when the feed reports zero articles.
  *
  * @return void
  */
 public function getXml()
 {
     $openid = isset($_REQUEST['openid']) ? trim($_REQUEST['openid']) : '';
     $wxid = isset($_REQUEST['wxid']) ? trim($_REQUEST['wxid']) : '';
     if (!$openid) {
         echo "no openid";
         exit;
     }
     // openid is request input: encode it before embedding it in the feed URL.
     $feedUrl = "http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=" . urlencode($openid);
     $json = stripslashes((string) file_get_html($feedUrl));
     // A feed without a totalItems field is treated the same as an empty feed.
     $itemCount = preg_match('/\\"totalItems\\"\\:(\\d+)/', $json, $matches) ? intval($matches[1]) : 0;
     if ($itemCount == 0) {
         echo "该公众号没有发布文章";
         exit;
     }
     // The optional CDATA wrapper is matched outside the capture group so that it
     // is removed exactly. ltrim()/rtrim() take a character mask, not a prefix,
     // and would also eat genuine leading/trailing characters of the payload.
     $urlPattern = '/<url>(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?<\/url>/i';
     $contentPattern = '/<content>(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?<\/content>/i';
     $tmp = array();
     for ($page = 1; $page <= 2; $page++) {
         if ($page > 1) {
             // Page 1 was already downloaded above; later pages need their own request.
             $json = stripslashes((string) file_get_html($feedUrl . '&page=' . $page));
         }
         preg_match_all($urlPattern, $json, $links, PREG_PATTERN_ORDER);
         preg_match_all($contentPattern, $json, $contents, PREG_PATTERN_ORDER);
         foreach ($links[1] as $i => $link) {
             $tmp[] = array(
                 'url' => $link,
                 'content' => isset($contents[1][$i]) ? $contents[1][$i] : '',
             );
         }
     }
     $mh = curl_multi_init();
     $conn = array();
     // One handle per article, keyed by the article index so the result loop
     // below can find it again.
     foreach ($tmp as $m => $item) {
         $conn[$m] = curl_init($item['url']);
         curl_setopt($conn[$m], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
         curl_setopt($conn[$m], CURLOPT_HEADER, 0);
         curl_setopt($conn[$m], CURLOPT_CONNECTTIMEOUT, 60);
         // Return the fetched page as a string instead of printing it.
         curl_setopt($conn[$m], CURLOPT_RETURNTRANSFER, true);
         curl_multi_add_handle($mh, $conn[$m]);
     }
     do {
         $status = curl_multi_exec($mh, $active);
         // Wait for socket activity instead of spinning; back off briefly when
         // select() cannot wait (-1).
         if ($active && curl_multi_select($mh, 1.0) === -1) {
             usleep(100000);
         }
     } while ($active && $status == CURLM_OK);
     foreach ($tmp as $m => $item) {
         // Source of the fetched article page.
         $pageSource = (string) curl_multi_getcontent($conn[$m]);
         $a = new simple_html_dom();
         $a->load($pageSource);
         // A failed download or a changed page layout yields no node; store an
         // empty string rather than dereferencing null.
         $titleNode = $a->find('h1#activity-name', 0);
         $bodyNode = $a->find('div#page-content', 0);
         $title = $titleNode ? $titleNode->outertext : '';
         $content = $bodyNode ? $bodyNode->outertext : '';
         $a->clear();
         unset($a);
         $data = array('wxh' => $wxid, 'openId' => $openid, 'link' => $item['url'], 'title' => $title, 'summary' => $item['content'], 'content' => $content);
         $this->weixin->_create($data);
         echo "save success" . $m . '<br>';
         curl_multi_remove_handle($mh, $conn[$m]);
         curl_close($conn[$m]);
     }
     curl_multi_close($mh);
     unset($tmp);
 }
Ejemplo n.º 25
0
// Fragment: converts the HTML held in $html into a Word document and prepares
// the HTTP response headers for downloading it. $html_dom, $html,
// $phpword_object and $section are used here but created outside this excerpt.
$html_dom->load('<html><body>' . $html . '</body></html>');
// Note, we needed to nest the html in a couple of dummy elements.
// Create the dom array of elements which we are going to work on:
$html_dom_array = $html_dom->find('html', 0)->children();
// We need this for setting base_root and base_path in the initial_state array
// (below). We are using a function here (derived from Drupal) to create these
// paths automatically - you may want to do something different in your
// implementation. This function is in the included file
// documentation/support_functions.inc.
$paths = htmltodocx_paths();
// Provide some initial settings:
$initial_state = array('phpword_object' => &$phpword_object, 'base_root' => $paths['base_root'], 'base_path' => $paths['base_path'], 'current_style' => array('size' => '11'), 'parents' => array(0 => 'body'), 'list_depth' => 0, 'context' => 'section', 'pseudo_list' => TRUE, 'pseudo_list_indicator_font_name' => 'Wingdings', 'pseudo_list_indicator_font_size' => '7', 'pseudo_list_indicator_character' => 'l ', 'table_allowed' => TRUE, 'treat_div_as_paragraph' => TRUE, 'style_sheet' => htmltodocx_styles_example());
// Convert the HTML and put it into the PHPWord object
// ($html_dom_array[0] is the first child of <html>, i.e. the dummy <body>).
htmltodocx_insert_html($section, $html_dom_array[0]->nodes, $initial_state);
// Clear the HTML dom object:
$html_dom->clear();
unset($html_dom);
// Save File
// The document is written to a uniquely named temporary file (prefix 'htd').
$h2d_file_uri = tempnam('', 'htd');
$objWriter = PHPWord_IOFactory::createWriter($phpword_object, 'Word2007');
$objWriter->save($h2d_file_uri);
// Download the file:
// The headers below mark the response as a binary attachment named
// example.docx and forbid caching.
header('Content-Description: File Transfer');
header('Content-Type: application/octet-stream');
header('Content-Disposition: attachment; filename=example.docx');
header('Content-Transfer-Encoding: binary');
header('Expires: 0');
header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
header('Pragma: public');
// Content-Length is taken from the size of the temporary file written above.
header('Content-Length: ' . filesize($h2d_file_uri));
// Discard anything already sitting in the output buffer.
// NOTE(review): the lines visible here never send the file body or delete the
// temporary file - presumably that happens right after this excerpt; confirm.
ob_clean();
/**
 * Extract planning applications from a Cork City planning results page.
 *
 * For every row of the 'AppDetailsTable' table whose markup contains one of
 * the known status keywords, two further pages are scraped: the application's
 * detail page (for the full description) and its site-location page (for the
 * Irish-grid easting/northing, converted to WGS84). Applications without
 * site-location details are not included in the result.
 *
 * Uses the global $authority_code as the prefix of each application reference.
 *
 * @param string $html HTML of the results page.
 * @return array Map of application reference => array with the keys url,
 *               appref, date, applicant, address, details, latitude, longitude.
 */
function process_page($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);
    $apps = array();
    global $authority_code;
    global $nearby_api_key;
    $statuses = array('FINALISED', 'CONDITIONAL', 'APPEALED', 'WITHDRAWN', 'NEW<', 'APPROVED', 'REFUSED');
    foreach ($dom->find("table[class='AppDetailsTable'] tr") as $row) {
        # Hacky on purpose: match against the raw row markup rather than the dom,
        # in case 'td' shows up in the plaintext of a value.
        $rowHtml = $row->outertext;
        $hasStatus = false;
        foreach ($statuses as $status) {
            if (stripos($rowHtml, $status) !== false) {
                $hasStatus = true;
                break;
            }
        }
        if (!$hasStatus) {
            continue;
        }
        $appref = $authority_code . substr($row->children[0]->plaintext, 0, 2) . "/" . substr($row->children[0]->plaintext, 2);
        $rawappref = trim($row->children[0]->plaintext);
        $url = "http://planning.corkcity.ie/InternetEnquiry/rpt_ViewApplicDetails.asp?validFileNum=1&app_num_file=" . $rawappref;
        # Dates arrive as dd/mm/yyyy; rebuild them as yyyy-mm-dd.
        $rawdate = substr($row->children[4]->plaintext, 0, 10);
        $date = substr($rawdate, -4) . "-" . substr($rawdate, 3, 2) . "-" . substr($rawdate, 0, 2);
        $applicant = trim($row->children[5]->plaintext);
        $address = str_replace("<br>", ",", str_replace("<BR>", ",", $row->children[6]->innertext));
        # Now fetch additional information.  Part one, full description of plan
        $fullapphtml = scraperwiki::scrape($url);
        $fullappdom = new simple_html_dom();
        $fullappdom->load($fullapphtml);
        $detailrow = $fullappdom->find("table[class='AppDetailsTable'] tr", 15);
        $detailcell = $detailrow ? $detailrow->children(1) : null;
        # An unexpected page layout yields an empty description instead of a fatal error.
        $fullappdetails = $detailcell ? $detailcell->plaintext : '';
        # Release the detail-page dom as well; simple_html_dom leaks memory otherwise.
        $fullappdom->clear();
        unset($fullappdom);
        unset($fullapphtml);
        # Part two, location of application
        $lochtml = scraperwiki::scrape('http://planning.corkcity.ie/InternetEnquiry/rpt_ViewSiteLocDetails.asp?page_num=0&file_number=' . $rawappref);
        if (!stristr($lochtml, "No Site Location Details Found")) {
            $locdom = new simple_html_dom();
            $locdom->load($lochtml);
            $locrow = $locdom->find("table[class='AppDetailsTable'] tr", 1);
            $northingcell = $locrow ? $locrow->children(1) : null;
            $eastingcell = $locrow ? $locrow->children(4) : null;
            if ($northingcell && $eastingcell) {
                $locnorthing = round(floatval($northingcell->plaintext));
                $loceasting = round(floatval($eastingcell->plaintext));
                # Part three, convert E&N to WGS84 using geograph class
                $c = new ConversionsLatLong();
                $res = $c->irish_to_wgs84($loceasting, $locnorthing);
                $apps[$appref] = array('url' => $url, 'appref' => $appref, 'date' => $date, 'applicant' => $applicant, 'address' => $address, 'details' => $fullappdetails, 'latitude' => $res[0], 'longitude' => $res[1]);
            }
            $locdom->clear();
            unset($locdom);
        }
        unset($lochtml);
    }
    $dom->clear();
    unset($dom);
    return $apps;
}
Ejemplo n.º 27
0
 $idCidPoli = str_replace('\'', '', $idCidPoli);
 $idCidPoli = str_replace(');', '', $idCidPoli);
 //guarda a posição do espaço que separa o id do prefeito ou vereador do id do municipio
 $espaco = strripos($idCidPoli, ' ');
 //guarda o id do prefeito ou do vereador que é 11 ou 13
 $codigoCargo = substr($idCidPoli, 0, $espaco);
 //guarda o id da cidade
 $codigoMunicipio = substr($idCidPoli, $espaco + 1);
 //modifica a url do ajax que é exibida na tela
 $urlAjaxPrefeitoVereador = "http://divulgacand2012.tse.jus.br/divulgacand2012/pesquisarCandidato.action?siglaUFSelecionada=" . $siglaUF . "&codigoMunicipio=" . $codigoMunicipio . "&codigoCargo=" . $codigoCargo . "&codigoSituacao=0";
 $htmlCidade = new simple_html_dom();
 //carrega o html que possui todos prefeitos ou vereadores da cidade
 $htmlCidade->load_file($urlAjaxPrefeitoVereador);
 //pega os input com o id e a ultima atualização do politico
 $candidato = $htmlCidade->find('tr[class="odd gradeX"] input');
 $htmlCidade->clear();
 unset($htmlCidade);
 //array para guardar ids dos candidatos e ids da ultima atualização do candidato
 $array = array("sqCandidato", "dtUltimaAtualizacao");
 $i = 0;
 $j = 0;
 foreach ($candidato as $elemento) {
     if (strcmp($elemento->name, "sqCandidato") == 0) {
         $array["sqCandidato"][$i] = $elemento->value;
         $i++;
     } else {
         $array["dtUltimaAtualizacao"][$j] = $elemento->value;
         $j++;
     }
 }
 if ($i != $j) {
Ejemplo n.º 28
0
/**
 * Post the given credentials to the login endpoint, storing any cookies the
 * server returns in a temporary cookie-jar file under ./temp.
 *
 * The function always returns the string "success": it did so before as well,
 * because an early `return "success";` sat in front of the resume-scraping
 * steps. Those unreachable steps have been removed, together with a second
 * curl handle that was created but never executed or closed.
 *
 * NOTE(review): the response of the login request is not inspected, so
 * "success" does not prove that the login worked.
 *
 * @param string $username Account name, sent as the `username` form field.
 * @param string $password Account password, sent as the `userpwd` form field.
 * @return string Always "success".
 */
function job51($username, $password)
{
    // Login URL (the value is redacted in this source).
    $login = "******";
    // Build the form body with proper URL-encoding so that characters such as
    // '&' or '=' inside the credentials cannot corrupt the request. The
    // separator is passed explicitly so arg_separator.output cannot change it.
    $post_file = http_build_query(array('username' => $username, 'userpwd' => $password), '', '&');
    $cookie_file = tempnam('./temp', 'cookie');
    // Initialise the curl handle for the login request.
    $ch = curl_init($login);
    // Do not include response headers in the output.
    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Return the response instead of printing it to the browser.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_POST, 1);
    // The complete HTTP POST body as a single string.
    curl_setopt($ch, CURLOPT_POSTFIELDS, $post_file);
    // Save the cookies returned by the server into $cookie_file.
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
    // Execute the login request.
    curl_exec($ch);
    // Closing the handle is what flushes the cookie jar to disk.
    curl_close($ch);
    return "success";
}
/**
 * Parse a school listing page and save one record per school.
 *
 * Every child of the third <table> except the first is treated as one school
 * whose fields are separated by '&nbsp;'. The second field is the MPI code; it
 * is used to scrape the school's detail page, from which telephone, fax,
 * e-mail, web site and study-address values are read out of "label: value"
 * rows. Records with a name longer than one character are stored through
 * scraperwiki::save() under the unique keys denominazione + codiceMPI.
 *
 * The web address is stored trimmed and is rebuilt from everything after the
 * first ':' of its row, so URLs keep their scheme and any port.
 *
 * @param string $html HTML of the listing page.
 * @return void
 */
function create_dataset2($html)
{
    $dom = new simple_html_dom();
    $dom->load($html);
    # Make sure the table really exists before walking it.
    $table = $dom->find('table', 2);
    if ($table) {
        $i = 0;
        foreach ($table->children() as $data) {
            echo "parsing info tabella principale";
            $res = $data != null ? trim($data->plaintext) : '';
            # The first child is skipped, and so are empty rows.
            if ($i > 0 && strlen($res) > 0) {
                $res = str_replace('&#39;', "'", $res);
                # Split the row into its fields; pad so that every offset read below exists.
                $array_result = array_pad(explode('&nbsp;', $res), 12, '');
                # Keep the MPI code: it identifies the detail page.
                $codMPI = trim($array_result[1]);
                $url_MPI = "http://www.trampi.istruzione.it/ricScu/dettaglio.do?cod=" . $codMPI;
                $detail_html = scraperwiki::scrape($url_MPI);
                $dom_mpi = new simple_html_dom();
                $dom_mpi->load($detail_html);
                $tel = "";
                $fax = "";
                $email = "";
                $web = "";
                $indS = "";
                foreach ($dom_mpi->find('table[cellspacing=1] tr') as $data_mpi) {
                    $values = explode(':', $data_mpi->plaintext . "\n");
                    # Only "label: value" rows are of interest.
                    if (strlen($values[0]) == 0 || !isset($values[1])) {
                        continue;
                    }
                    $label = $values[0];
                    if (stripos($label, 'tel') !== false) {
                        $tel = trim($values[1]);
                    } elseif (stripos($label, 'fax') !== false) {
                        $fax = trim($values[1]);
                    } elseif (stripos($label, 'e-mail') !== false) {
                        $email = trim($values[1]);
                    } elseif (stripos($label, 'web') !== false) {
                        # The URL itself contains ':', so glue back everything after the label.
                        $web = trim(implode(':', array_slice($values, 1)));
                    } elseif (stripos($label, 'studio') !== false) {
                        $indS = str_replace('</td>', '', $values[1]);
                        $indS = str_replace('</tr>', '', $indS);
                        $indS = str_replace(array("\r", "\t", "\n"), '', $indS);
                        $indS = trim($indS);
                    }
                }
                $dom_mpi->clear();
                unset($dom_mpi);
                unset($detail_html);
                $dataset = array('denominazione' => trim(html_entity_decode($array_result[0])), 'codiceMPI' => trim($array_result[1]), 'tipologia' => trim(html_entity_decode($array_result[2])), 'tipologiaIIgrado' => trim(html_entity_decode($array_result[3])), 'descrizione' => trim(html_entity_decode($array_result[4])), 'indirizzo' => trim(html_entity_decode($array_result[5])), 'località' => trim(html_entity_decode($array_result[6])), 'cap' => trim($array_result[7]), 'comune' => trim(html_entity_decode($array_result[8])), 'provincia' => trim(html_entity_decode($array_result[9])), 'regione' => trim(html_entity_decode($array_result[10])), 'codIstitutoComprensivo' => trim(html_entity_decode($array_result[11])), 'telefono' => $tel, 'fax' => $fax, 'email' => $email, 'web' => $web, 'IndirizziStudio' => trim(html_entity_decode($indS)));
                if (strlen($dataset['denominazione']) > 1) {
                    scraperwiki::save(array('denominazione', 'codiceMPI'), $dataset);
                }
                unset($dataset);
            }
            $i = $i + 1;
        }
    }
    # Always release the dom, even when the table was missing; otherwise it leaks.
    $dom->clear();
    unset($dom);
}
Ejemplo n.º 30
0
 /**
  * Build a simple_html_dom object from an HTML string.
  *
  * @param string $str             HTML source to parse.
  * @param bool   $lowercase       Passed to the constructor and to load().
  * @param bool   $forceTagsClosed Passed to the constructor.
  * @param string $target_charset  Passed to the constructor.
  * @param bool   $stripRN         Passed to load().
  * @param string $defaultBRText   Passed to the constructor.
  * @return simple_html_dom|false The loaded document, or false when $str is
  *                               empty() (which includes the string "0").
  */
 public static function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT)
 {
     $parser = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $defaultBRText);
     if (!empty($str)) {
         $parser->load($str, $lowercase, $stripRN);
         return $parser;
     }
     // Nothing to parse: release the freshly created object before bailing out.
     $parser->clear();
     return false;
 }