/**
 * Parse a crawled HTML page and store it as a Model_Horoscope_XungHop record.
 *
 * Steps performed:
 *  1. Load the HTML with simple_html_dom.
 *  2. Fill the record (pair fields, alias, summary, source URI, dates).
 *  3. Extract and clean the inner HTML of div#content_document.
 *  4. Save the record; when both pair values are real (not '-'), download
 *     the thumbnail and every inline image into the record's asset folder
 *     and rewrite the image sources to the local copies.
 *  5. Save again and flag the temporary link row via UpdateRecordStatus().
 *
 * @param string $_HTML_CONTENT Raw HTML of the source page.
 * @param string $_Cung1        First value of the pair; '-' marks it as unknown.
 * @param string $_Cung2        Second value of the pair; '-' marks it as unknown.
 * @param string $_Summary      Summary text stored in tom_tat.
 * @param string $_SourceUri    URI the page was fetched from.
 * @param mixed  $_LinkId       Id of the temporary link row to flag once stored.
 * @param string $_ImageLink    Remote thumbnail URL.
 *
 * @return false|null false when there is no HTML, the DOM cannot be built or
 *                    the content container is missing; null otherwise (also
 *                    when the first save yields no identifier).
 */
public static function _Process_Recieved_Content($_HTML_CONTENT, $_Cung1, $_Cung2, $_Summary, $_SourceUri, $_LinkId, $_ImageLink) {
    if ($_HTML_CONTENT == '') {
        self::_print_to_console('-> nothing to do');
        return false;
    }

    // Create a DOM object and load the HTML from the string.
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($_HTML_CONTENT);
    unset($_HTML_CONTENT);

    if (!$html) {
        self::_print_to_console('-> cant load DOM obj');
        return false;
    }

    $story = new Model_Horoscope_XungHop();

    // A '-' placeholder on either side means the pair is incomplete: the
    // record is stored with kiem_tra = false and no images are downloaded.
    $ktra = !($_Cung1 == '-' || $_Cung2 == '-');

    $story->cung_1 = $_Cung1;
    $story->cung_2 = $_Cung2;
    $story->alias = $_Cung1 . '_' . $_Cung2;
    if (self::CheckRecordByAlias($story->alias)) {
        // Alias already taken: fall back to the double-underscore variant.
        $story->alias = $_Cung1 . '__' . $_Cung2;
    }
    $story->tom_tat = $_Summary;
    $story->ngay_tao = date("Y-m-d");
    $story->url_nguon = $_SourceUri;
    $story->auto_get = true;

    // Publication date: text after the first comma is read as "d/m/Y H:i".
    // Anything that does not match falls back to the current time.
    // 'H' (24-hour) is used so afternoon times are not stored as morning.
    $story->source_date = date("Y-m-d H:i:s");
    $date = $html->find('div[class="datetime"]', 0);
    if ($date) {
        $d = explode(',', $date->plaintext);
        if (isset($d[1])) {
            $d1 = explode(' ', trim($d[1]));
            $dmy = explode('/', $d1[0]);
            if (count($dmy) >= 3 && isset($d1[1])) {
                $timestamp = strtotime($dmy[2] . '-' . $dmy[1] . '-' . $dmy[0] . ' ' . $d1[1] . ':00');
                if ($timestamp !== false) {
                    $story->source_date = date("Y-m-d H:i:s", $timestamp);
                }
            }
        }
    }

    // Article body.
    $content = $html->find('div[id="content_document"]', 0);
    if (!$content) {
        self::_print_to_console('-> content not found');
        // Release the DOM before bailing out so it is not leaked.
        $html->clear();
        unset($html);
        return false;
    }

    $string = $content->innertext;
    // Strip line breaks and tabs, HTML comments, anchor wrappers (keeping
    // their inner HTML) and the author / source paragraphs.
    $string = str_replace(array("\r\n", "\r", "\n", "\t"), '', $string);
    $string = preg_replace('/(<!--.+?-->)/s', '', $string);
    $string = preg_replace('@<a[^>]*>(.*)</a>@ismUx', '$1', $string);
    $string = preg_replace('/<p[ ]class="pAuthor">.*<\\/p>/ismxU', '', $string);
    $string = preg_replace('/<p[ ]class="pSource">.*<\\/p>/ismxU', '', $string);

    $story->noi_dung = $string;
    $story->kiem_tra = $ktra;
    $story->save();

    if ($story->identifier()) {
        if ($ktra) {
            // Thumbnail: download to disk, then point the record at the local copy.
            $path = 'assets/horoscope/xung-hop/' . $story->alias . '/';
            $img = Vendor_Crawler::get_file_from_url_by_curl($_ImageLink, $path, $story->alias . '-thumb');
            if ($img) {
                // An empty or missing file means the download failed:
                // substitute the default thumbnail (best effort).
                if (!is_file($img) || filesize($img) == 0) {
                    @copy('assets/horoscope/thumb_140.jpg', $img);
                }
                $story->hinh_anh = '/' . $img;
            } else {
                $story->hinh_anh = $_ImageLink;
            }

            // Inline images: download each one and rewrite its src.
            $html2 = new simple_html_dom();
            $html2->load($story->noi_dung);
            foreach ($html2->find('img') as $index => $image) {
                unset($image->onclick);
                // Parentheses are required: before PHP 8 '.' and '+' share
                // precedence, so 'anh_' . $i + 1 evaluated to the number 1.
                $file_name = 'anh_' . ($index + 1);
                $get_file = Vendor_Crawler::get_file_from_url_by_curl($image->src, $path, $file_name);
                $local_file = ltrim((string) $get_file, '/');
                if (!$get_file || !is_file($local_file) || filesize($local_file) == 0) {
                    // Download failed: keep the tag with its original src.
                    // NOTE(review): the old code unset the array slot here,
                    // which never removed the tag from the DOM; confirm
                    // whether failed images should be dropped from the
                    // stored content instead.
                    continue;
                }
                $image->src = '/' . $get_file;
            }
            $story->noi_dung = $html2->save();
            $html2->clear();
            unset($html2);
        } else {
            $story->hinh_anh = $_ImageLink;
        }

        $story->save();
        // Insert done => update the temporary link table.
        Model_Horoscope_XungHopLinkBLL::UpdateRecordStatus($_LinkId);
        self::_print_to_console('Done: ' . $_SourceUri);
    } else {
        self::_print_to_console('Fail:' . $_SourceUri);
    }

    $html->clear();
    unset($html);
}
/**
 * Drain the temporary link table: fetch unprocessed rows in batches of 10,
 * download each page and hand it to _Process_Recieved_Content().
 *
 * Rows whose URI already exists in the story table are marked as done
 * without being downloaded again.
 *
 * Batches are handled in a loop rather than by recursion, so the call stack
 * does not grow with the number of batches. The loop also stops when a
 * batch contains only rows already attempted during this run; otherwise a
 * row that can never be processed would be fetched again forever.
 *
 * @return false|null false when the very first fetch returns no rows,
 *                    null once at least one batch has been handled.
 */
private function do_get_temp() {
    $attempted_ids = array();
    $is_first_batch = true;

    while (true) {
        $rows = Model_Horoscope_XungHopLinkBLL::GetRecords_UnProccessed(10);
        if (!$rows) {
            // Same contract as before: false only if there was nothing to do at all.
            return $is_first_batch ? false : null;
        }
        $is_first_batch = false;

        $has_new_row = false;
        foreach ($rows as $row) {
            $_ID = $row['id'];
            $_URI = $row['uri'];
            $_Cung1 = $row['cung_1'];
            $_Cung2 = $row['cung_2'];
            $_Summary = $row['summary'];
            $_ImageLink = $row['hinh_anh'];

            if (!isset($attempted_ids[$_ID])) {
                $attempted_ids[$_ID] = true;
                $has_new_row = true;
            }

            // Check the URI against the story table.
            if (Model_Horoscope_XungHopBLL::CheckRecordByURI($_URI)) {
                // Duplicate: no insert needed, just update the temporary table.
                Model_Horoscope_XungHopLinkBLL::MarkAsDone($_ID, $_URI);
                continue;
            }

            // Fetch the page and process what was received.
            $content = Vendor_Crawler::get_content_from_uri_by_curl($_URI);
            if ($content) {
                Model_Horoscope_XungHopBLL::_Process_Recieved_Content($content['output'], $_Cung1, $_Cung2, $_Summary, $_URI, $_ID, $_ImageLink);
            } else {
                Model_Horoscope_XungHopBLL::_print_to_console(__("<--- CANT GET CONTENT --->"));
            }
        }
        unset($rows);

        if (!$has_new_row) {
            // Every row in this batch was already tried in this run and is
            // still unprocessed; retrying again cannot make progress.
            Model_Horoscope_XungHopBLL::_print_to_console('<--- NO PROGRESS, STOPPING --->');
            return null;
        }
    }
}