示例#1
0
 /**
  * Crawl a story listing page, store every story that is not yet known and
  * then continue with the following page ("?p=N") until the story whose link
  * hash equals $md5sum_to_check is reached.
  *
  * For each story (div.cont09 inside div#TopicList) the image source, the link
  * (a.lnk02) and the summary (p.cont09txt) are required; a story missing any of
  * them is skipped. Two values (cung_1 / cung_2) are extracted from the ASCII
  * slug of the title, defaulting to '-' when the slug matches neither pattern.
  *
  * @param string $start_uri       URL of the listing page to fetch
  * @param string $md5sum_to_check md5 of the link of the last story already
  *                                crawled; crawling stops when it is met
  * @return void
  */
 public static function _Do_Craw_Content($start_uri, $md5sum_to_check)
 {
     self::_print_to_console('Begin fetch URI:<br/>' . $start_uri);
     $content = Vendor_Crawler::get_content_from_uri_by_curl($start_uri);
     if (!$content) {
         self::_print_to_console('Cant get content for:<br/>' . $start_uri);
         return;
     }
     self::_print_to_console('Get content done!');

     // Build a DOM from the downloaded HTML
     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
     $html = new simple_html_dom();
     $html->load($content['output']);

     $can_next = true;
     $next_link = '';
     // Strict string comparison below: a loose comparison would treat two
     // different "0e<digits>" hashes as equal numeric strings.
     $stop_md5 = (string) $md5sum_to_check;

     $container = $html->find('div[id="TopicList"]', 0);
     if (!$container) {
         self::_print_to_console('Cant get main container!');
     } else {
         $stories = $container->find('div[class="cont09"]');
         if (count($stories) > 0) {
             $backet = array();
             foreach ($stories as $story) {
                 $img = $story->find('img', 0);
                 $lnk = $story->find('a[class="lnk02"]', 0);
                 if ($lnk && md5($lnk->href) === $stop_md5) {
                     // Reached the last story that was already crawled: stop
                     // scanning, but keep what was collected on this page so
                     // that it still gets stored below.
                     $can_next = false;
                     break;
                 }
                 $summary = $story->find('p[class="cont09txt"]', 0);
                 if (!$img || !$lnk || !$summary) {
                     // incomplete story -> skip it
                     continue;
                 }

                 $title = trim($lnk->plaintext);
                 $slug = Vendor_Crawler::toAscii($title);
                 // Slug shaped like "<a>-<b>-va-<c>-<d>-..." yields two values,
                 // "hai-nguoi-cung-<a>-<b>-..." yields one value used for both.
                 $cung_1 = $cung_2 = '-';
                 $matches = array();
                 $pattern = '/([a-z]+[-]+[a-z]+)[-]va[-]([a-z]+[-]+[a-z]+)[-].*/ismUx';
                 preg_match($pattern, $slug, $matches);
                 if (isset($matches[1]) && isset($matches[2])) {
                     $cung_1 = $matches[1];
                     $cung_2 = $matches[2];
                 } else {
                     $pattern = '/hai[-]nguoi[-]cung[-]+([a-z]+[-]+[a-z]+)[-].*/ismUx';
                     preg_match($pattern, $slug, $matches);
                     if (isset($matches[1])) {
                         $cung_1 = $cung_2 = $matches[1];
                     }
                 }

                 $backet[] = array(
                     'image' => $img->src,
                     'md5sum' => md5($lnk->href),
                     'link' => $lnk->href,
                     'title' => $title,
                     'slug' => $slug,
                     'cung_1' => $cung_1,
                     'cung_2' => $cung_2,
                     'summary' => trim($summary->plaintext),
                     // 24-hour clock; "h" would give an ambiguous 12-hour value
                     'date_check' => date('Y-m-d H:i:s'),
                 );
             }

             // Work out the next page link: bump the "?p=" number, or go to
             // page 2 when the current URL carries no page number.
             if ($can_next) {
                 $tmp = explode('?p=', $start_uri);
                 if (isset($tmp[1])) {
                     $next_link = $tmp[0] . "?p=" . (intval($tmp[1]) + 1);
                 } else {
                     $next_link = $tmp[0] . "?p=2";
                 }
             }

             // Store the collected stories, skipping those already recorded
             $succ = 0;
             $failure = 0;
             $dupp = 0;
             if (count($backet) > 0) {
                 foreach ($backet as $bag) {
                     if (Model_Horoscope_XungHopLinkBLL::CheckRecordByMd5Sum($bag['md5sum'])) {
                         $dupp++;
                         continue;
                     }
                     $item = new Model_Horoscope_XungHopLink();
                     $item->uri = $bag['link'];
                     $item->md5sum = $bag['md5sum'];
                     $item->title = $bag['title'];
                     $item->hinh_anh = $bag['image'];
                     $item->ngay_check = $bag['date_check'];
                     $item->summary = $bag['summary'];
                     $item->cung_1 = $bag['cung_1'];
                     $item->cung_2 = $bag['cung_2'];
                     $item->hoan_thanh = false;
                     $item->save();
                     if ($item->identifier()) {
                         $succ++;
                     } else {
                         $failure++;
                     }
                 }
             }
             self::_print_to_console('--> INSERT: ' . $succ . ' (ok) , ' . $failure . ' (false) , ' . $dupp . ' (duplicate) <-- <br/>');
             flush();
         } else {
             self::_print_to_console('--> No story found!');
             $can_next = false;
         }
     }

     // Release the DOM before recursing into the next page
     $html->clear();
     unset($html);
     unset($content);
     // Short pause between requests (usleep takes microseconds: 2500 = 2.5 ms)
     usleep(2500);
     if ($can_next && $next_link != '') {
         self::_Do_Craw_Content($next_link, $md5sum_to_check);
     }
 }
示例#2
0
 /**
  * Parse the HTML of one article, store it as a Model_Horoscope record and,
  * when the record was saved, delete the matching row from the temporary
  * table via Model_HoroscopeTempBLL::DeleteByURI().
  *
  * Extracted parts: summary (h2.detail_sapo), body (div.detail_content, with
  * HTML comments, anchors and line breaks stripped) and creation date
  * (div.detail_date). The thumbnail is downloaded to a local folder; if that
  * fails the remote image URL is stored instead.
  *
  * @param string $_HTML_CONTENT raw HTML of the article page
  * @param string $_URI          source URL of the article
  * @param string $_IMG          URL of the article thumbnail
  * @param string $_TITLE        article title
  * @return void
  */
 private function _process_tmp_content($_HTML_CONTENT, $_URI, $_IMG, $_TITLE)
 {
     if ($_HTML_CONTENT == '') {
         $this->_print_to_console(__("<--- NOTHING TO DO --->"));
         return;
     }

     // Build a DOM from the given HTML
     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
     $html = new simple_html_dom();
     $html->load($_HTML_CONTENT);
     unset($_HTML_CONTENT);

     $horoscope = new Model_Horoscope();

     // Summary: collapse double spaces and drop the "(2sao) -" source prefix
     $summary = $html->find('h2[class="detail_sapo"]', 0);
     if ($summary) {
         $sum_tmp = trim($summary->plaintext);
         $sum_tmp = str_replace('  ', ' ', $sum_tmp);
         $sum_tmp = str_replace(array('(2sao) -', '(2sao)-', '(2Sao) -', '(2Sao)-'), '', $sum_tmp);
         $horoscope->summary = trim($sum_tmp);
     } else {
         $horoscope->summary = 'not-set';
     }
     unset($summary);

     // Body: strip HTML comments, unwrap anchors, drop empty <style> tags
     // and remove line breaks / tabs
     $content = $html->find('div[class="detail_content"]', 0);
     if ($content) {
         $body = $content->innertext;
         $body = preg_replace('/(<!--.+?-->)/s', '', $body);
         $body = preg_replace('@<a[^>]*>(.*)</a>@ismUx', '$1', $body);
         $body = preg_replace('/<style><\\/style>/ismxU', '', $body);
         $body = str_replace(array("\r\n", "\r", "\n", "\t"), '', $body);
         $horoscope->content = trim($body);
     }
     unset($content);

     // Creation date, expected like "Thứ hai, 13/09/2010 23:25".
     // Fall back to the current time when the element is missing or its
     // text cannot be turned into a valid timestamp.
     $created = time();
     $date_create = $html->find('div[class="detail_date"]', 0);
     if ($date_create) {
         $parts = array();
         if (preg_match('@(\d{1,2})/(\d{1,2})/(\d{4})\s+(\d{1,2}:\d{2})@', $date_create->plaintext, $parts)) {
             $parsed = strtotime($parts[3] . "-" . $parts[2] . "-" . $parts[1] . " " . $parts[4] . ":00");
             if ($parsed !== false) {
                 $created = $parsed;
             }
         }
     }
     $horoscope->date_created = $created;
     unset($date_create);

     // Download the thumbnail; keep the remote URL when the download fails
     $path = 'assets/horoscope/images/';
     $r = Vendor_Crawler::get_file_from_url_by_curl($_IMG, $path, '');
     if ($r != false) {
         $horoscope->img_thumb = "/" . $r;
     } else {
         $horoscope->img_thumb = $_IMG;
     }

     $horoscope->title = $_TITLE;
     $horoscope->source_uri = $_URI;
     $horoscope->page_path = Vendor_Crawler::toAscii($_TITLE);
     // Every word of the title becomes a meta keyword
     $horoscope->meta_keys = implode(',', explode(' ', $_TITLE));
     $horoscope->meta_desc = $horoscope->summary;
     $horoscope->save();

     if (!$horoscope->identifier()) {
         $this->_print_to_console(__("<--- FALSE TO SAVE NEW RECORD! --->"));
     } elseif (Model_HoroscopeTempBLL::DeleteByURI($_URI) > 0) {
         $this->_print_to_console(__("DONE: " . $_URI));
     } else {
         $this->_print_to_console(__("<--- FALSE TO DELETE IN TMP TABLE! --->"));
     }

     $html->clear();
     unset($html);
 }