Exemplo n.º 1
0
 /**
  * Parses a crawled article page and stores it as a Model_Horoscope_XungHop record.
  *
  * Steps performed:
  *  1. Load the HTML into simple_html_dom.
  *  2. Read the publication date from div.datetime (falls back to the current time).
  *  3. Extract and clean the inner HTML of div#content_document.
  *  4. Save the record; when both signs are real values (not '-'), download the
  *     thumbnail and every inline image and rewrite their paths to local ones.
  *  5. Flag the source link as processed via Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus.
  *
  * @param string $_HTML_CONTENT raw HTML of the source page
  * @param string $_Cung1        first sign; '-' marks the pair as unchecked
  * @param string $_Cung2        second sign; '-' marks the pair as unchecked
  * @param string $_Summary      summary text stored in tom_tat
  * @param string $_SourceUri    URL the content came from (stored in url_nguon, used in log lines)
  * @param mixed  $_LinkId       id handed to UpdateRecordStatus after a successful save
  * @param string $_ImageLink    remote thumbnail URL
  *
  * @return bool|null false when there is no input, or the content container is missing;
  *                   null in every other case (including a failed save, which is only logged)
  */
 public static function _Process_Recieved_Content($_HTML_CONTENT, $_Cung1, $_Cung2, $_Summary, $_SourceUri, $_LinkId, $_ImageLink)
 {
     if ($_HTML_CONTENT == '') {
         self::_print_to_console('-> nothing to do');
         return false;
     }

     // Create a DOM object and load the HTML from the string
     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
     $html = new simple_html_dom();
     $html->load($_HTML_CONTENT);
     unset($_HTML_CONTENT);
     if (!$html) {
         // Defensive check kept from the original implementation.
         self::_print_to_console('-> cant load DOM obj');
         return false;
     }

     $story = new Model_Horoscope_XungHop();
     // A '-' placeholder in either sign means the pair is not verified:
     // the record is stored but no files are downloaded for it.
     $ktra = !($_Cung1 == '-' || $_Cung2 == '-');

     $story->cung_1 = $_Cung1;
     $story->cung_2 = $_Cung2;
     $story->alias = $_Cung1 . '_' . $_Cung2;
     if (self::CheckRecordByAlias($story->alias)) {
         // Alias already taken: fall back to the double-underscore variant.
         $story->alias = $_Cung1 . '__' . $_Cung2;
     }
     $story->tom_tat = $_Summary;
     $story->ngay_tao = date("Y-m-d");
     $story->url_nguon = $_SourceUri;
     $story->auto_get = true;

     // --- publication date -------------------------------------------------
     // Default to "now"; 'H' (24-hour) is required because no AM/PM marker is stored.
     $story->source_date = date("Y-m-d H:i:s");
     $date = $html->find('div[class="datetime"]', 0);
     if ($date) {
         // The text after the first comma is parsed as "dd/mm/yyyy hh:mm".
         $d = explode(',', $date->plaintext);
         if (isset($d[1])) {
             $d1 = preg_split('/\s+/', trim($d[1]));
             $dmy = explode('/', $d1[0]);
             if (count($dmy) === 3 && isset($d1[1])) {
                 list($ngay, $thang, $nam) = $dmy;
                 $timestamp = strtotime($nam . '-' . $thang . '-' . $ngay . ' ' . $d1[1] . ':00');
                 // Keep the "now" default when the text is not a parseable date.
                 if ($timestamp !== false) {
                     $story->source_date = date("Y-m-d H:i:s", $timestamp);
                 }
             }
         }
     }

     // --- article body -----------------------------------------------------
     $content = $html->find('div[id="content_document"]', 0);
     if (!$content) {
         // Release the DOM before bailing out; simple_html_dom needs an explicit clear().
         $html->clear();
         unset($html);
         self::_print_to_console('-> content not found');
         return false;
     }
     $string = $content->innertext;
     // The main DOM is not needed past this point.
     unset($content, $date);
     $html->clear();
     unset($html);

     // remove line breaks and tabs
     $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
     // Strip HTML comments, unwrap links (keep their text), drop author/source paragraphs.
     $cleanup = array(
         '/(<!--.+?-->)/s' => '',
         '@<a[^>]*>(.*)</a>@ismUx' => '$1',
         '/<p[ ]class="pAuthor">.*<\\/p>/ismxU' => '',
         '/<p[ ]class="pSource">.*<\\/p>/ismxU' => '',
     );
     foreach ($cleanup as $pattern => $replacement) {
         $cleaned = preg_replace($pattern, $replacement, $string);
         // preg_replace() returns null on a PCRE error; keep the previous text then.
         if ($cleaned !== null) {
             $string = $cleaned;
         }
     }

     $story->noi_dung = $string;
     $story->kiem_tra = $ktra;
     $story->save();
     if (!$story->identifier()) {
         self::_print_to_console('Fail:' . $_SourceUri);
         return;
     }

     if ($ktra) {
         // get image thumb => save to disk => update record in db
         $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
         $img = Vendor_Crawler::get_file_from_url_by_curl($_ImageLink, $path, $story->alias . '-thumb');
         if ($img) {
             // check file size, if = 0 -> file could not be fetched, use the stock thumbnail
             if (filesize($img) == 0) {
                 @copy('assets/horoscope/thumb_140.jpg', $img);
             }
             $story->hinh_anh = '/' . $img;
         } else {
             $story->hinh_anh = $_ImageLink;
         }

         // Download every inline image and point its src at the local copy.
         $html2 = new simple_html_dom();
         $html2->load($story->noi_dung);
         $position = 0;
         foreach ($html2->find('img') as $image) {
             $position++;
             unset($image->onclick);
             // Explicit counter: 'anh_' . $i + 1 evaluated to the integer 1 before PHP 8.
             $file_name = 'anh_' . $position;
             $get_file = Vendor_Crawler::get_file_from_url_by_curl($image->src, $path, $file_name);
             $local_file = $get_file ? ltrim($get_file, '/') : '';
             if ($local_file === '' || !is_file($local_file) || filesize($local_file) == 0) {
                 // Download failed: remove the tag from the document itself.
                 $image->outertext = '';
             } else {
                 $image->src = '/' . $local_file;
             }
         }
         $story->noi_dung = $html2->save();
         $html2->clear();
         unset($html2);
     } else {
         $story->hinh_anh = $_ImageLink;
     }

     $story->save();
     // insert done => update from tmp table
     Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus($_LinkId);
     self::_print_to_console('Done: ' . $_SourceUri);
 }
Exemplo n.º 2
0
 /**
  * Works through the unprocessed link queue in batches of 10 until nothing new is left.
  *
  * For every row returned by Model_Horoscope_XungHopLinkBLL::GetRecords_UnProccessed:
  *  - if the URI is already present in the story table, the link is marked as done;
  *  - otherwise the page is fetched and passed to
  *    Model_Horoscope_XungHopBLL::_Process_Recieved_Content.
  *
  * Implemented as a loop rather than self-recursion so the call stack does not
  * grow with the number of batches. Ids attempted during this run are remembered;
  * a batch that contains no new id ends the run, so rows that keep failing cannot
  * make it spin forever.
  * NOTE(review): this assumes failed rows are returned again by GetRecords_UnProccessed,
  * which is not visible from here - confirm against that method.
  *
  * @return bool|null false when the very first batch is empty; null otherwise
  */
 private function do_get_temp()
 {
     $attempted = array();
     $is_first_batch = true;

     while (true) {
         $rows = Model_Horoscope_XungHopLinkBLL::GetRecords_UnProccessed(10);
         if (!$rows || count($rows) == 0) {
             // Same contract as before: only an empty queue on entry yields false.
             return $is_first_batch ? false : null;
         }
         $is_first_batch = false;

         $made_progress = false;
         foreach ($rows as $row) {
             $_ID = $row['id'];
             if (isset($attempted[$_ID])) {
                 // Already tried in this run and still unprocessed: do not retry.
                 continue;
             }
             $attempted[$_ID] = true;
             $made_progress = true;

             $_URI = $row['uri'];
             $_Cung1 = $row['cung_1'];
             $_Cung2 = $row['cung_2'];
             $_Summary = $row['summary'];
             $_ImageLink = $row['hinh_anh'];

             // check uri in story table
             if (Model_Horoscope_XungHopBLL::CheckRecordByURI($_URI)) {
                 // (dup) existed => no need to insert => update the tmp table
                 Model_Horoscope_XungHopLinkBLL::MarkAsDone($_ID, $_URI);
                 continue;
             }

             // begin get content
             $content = Vendor_Crawler::get_content_from_uri_by_curl($_URI);
             if ($content) {
                 // process content just got
                 Model_Horoscope_XungHopBLL::_Process_Recieved_Content($content['output'], $_Cung1, $_Cung2, $_Summary, $_URI, $_ID, $_ImageLink);
             } else {
                 Model_Horoscope_XungHopBLL::_print_to_console(__("<--- CANT GET CONTENT --->"));
             }
         }
         unset($rows);

         if (!$made_progress) {
             // Every row in this batch was already attempted: stop instead of looping.
             return null;
         }
     }
 }