/**
 * Crawl the story listing at $start_uri, store every new story link, and
 * recurse into the next "?p=N" page until a story whose link hash equals
 * $md5sum_to_check is met (or a page yields no stories / cannot be fetched).
 *
 * For each story block the image source, link, title, summary and the two
 * "cung" slugs parsed from the ASCII title slug are collected. Stories that
 * lack an image, a link or a summary are skipped. Collected stories are
 * inserted as Model_Horoscope_XungHopLink rows unless a row with the same
 * md5sum already exists.
 *
 * @param string $start_uri       Listing page URL to fetch.
 * @param string $md5sum_to_check md5 of the most recent story link already
 *                                stored; crawling stops when it is reached.
 * @return void
 */
public static function _Do_Craw_Content($start_uri, $md5sum_to_check)
{
    self::_print_to_console('Begin fetch URI:<br/>' . $start_uri);

    $can_next = true;
    $next_link = '';
    $backet = array();

    $content = Vendor_Crawler::get_content_from_uri_by_curl($start_uri);
    if (!$content) {
        self::_print_to_console('Cant get content for:<br/>' . $start_uri);
        return;
    }
    self::_print_to_console('Get content done!');

    // Build a DOM from the fetched HTML.
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($content['output']);

    $container = $html->find('div[id="TopicList"]', 0);
    if ($container) {
        $stories = $container->find('div[class="cont09"]');
        if (count($stories) > 0) {
            foreach ($stories as $story) {
                $str = array();
                $ok = true;

                $img = $story->find('img', 0);
                if ($img) {
                    $str['image'] = $img->src;
                } else {
                    $ok = false;
                }

                $lnk = $story->find('a[class="lnk02"]', 0);
                if ($lnk) {
                    $str['md5sum'] = md5($lnk->href);

                    // Strict comparison: loosely compared, two "0e..." hashes
                    // can be treated as equal numbers on older PHP versions.
                    if ($str['md5sum'] === (string) $md5sum_to_check) {
                        // Reached the newest story already stored. Stop
                        // scanning, but fall through to the insert step so
                        // the stories gathered so far on this page are kept
                        // (a bare return here used to drop them).
                        $can_next = false;
                        break;
                    }

                    $str['link'] = $lnk->href;
                    $str['title'] = trim($lnk->plaintext);
                    $str['slug'] = Vendor_Crawler::toAscii($str['title']);

                    // Slug shaped like "<sign-a>-va-<sign-b>-...": two signs.
                    $pattern = '/([a-z]+[-]+[a-z]+)[-]va[-]([a-z]+[-]+[a-z]+)[-].*/ismUx';
                    preg_match($pattern, $str['slug'], $matches);
                    if (isset($matches[1]) && isset($matches[2])) {
                        $str['cung_1'] = $matches[1];
                        $str['cung_2'] = $matches[2];
                    } else {
                        // Slug shaped like "hai-nguoi-cung-<sign>-...": the
                        // same sign is used for both sides.
                        $pattern = '/hai[-]nguoi[-]cung[-]+([a-z]+[-]+[a-z]+)[-].*/ismUx';
                        preg_match($pattern, $str['slug'], $matches);
                        if (isset($matches[1])) {
                            $str['cung_1'] = $str['cung_2'] = $matches[1];
                        } else {
                            $str['cung_1'] = $str['cung_2'] = '-';
                        }
                    }
                } else {
                    $ok = false;
                }

                $summary = $story->find('p[class="cont09txt"]', 0);
                if ($summary) {
                    $str['summary'] = trim($summary->plaintext);
                } else {
                    $ok = false;
                }

                if ($ok) {
                    // 24-hour clock ("H"); "h" is 12-hour and ambiguous
                    // without an AM/PM marker.
                    $str['date_check'] = date('Y-m-d H:i:s');
                    $backet[] = $str;
                }
            }

            // Work out the next page link only when the stop hash was not met.
            if ($can_next) {
                $tmp = explode('?p=', $start_uri);
                if (isset($tmp[1])) {
                    $next_link = $tmp[0] . "?p=" . (intval($tmp[1]) + 1);
                } else {
                    // No page parameter yet, so the next page is the second.
                    $next_link = $tmp[0] . "?p=2";
                }
            }

            // Insert the collected stories. Counters are initialised up front
            // so the summary line below is valid even for an empty bucket.
            $succ = 0;
            $failure = 0;
            $dupp = 0;
            foreach ($backet as $bag) {
                if (Model_Horoscope_XungHopLinkBLL::CheckRecordByMd5Sum($bag['md5sum'])) {
                    $dupp++;
                    continue;
                }

                $item = new Model_Horoscope_XungHopLink();
                $item->uri = $bag['link'];
                $item->md5sum = $bag['md5sum'];
                $item->title = $bag['title'];
                $item->hinh_anh = $bag['image'];
                $item->ngay_check = $bag['date_check'];
                $item->summary = $bag['summary'];
                $item->cung_1 = $bag['cung_1'];
                $item->cung_2 = $bag['cung_2'];
                $item->hoan_thanh = false;
                $item->save();

                if ($item->identifier()) {
                    $succ++;
                } else {
                    $failure++;
                }
            }

            self::_print_to_console('--> INSERT: ' . $succ . ' (ok) , ' . $failure . ' (false) , ' . $dupp . ' (duplicate) <-- <br/>');
            flush();
        } else {
            self::_print_to_console('--> No story found!');
            $can_next = false;
        }
    } else {
        // $next_link stays empty here, so no further page is requested.
        self::_print_to_console('Cant get main container!');
    }

    // Release the DOM before recursing.
    $html->clear();
    unset($html);
    unset($content);

    // Short pause between requests (usleep takes microseconds: 2500 = 2.5 ms).
    usleep(2500);

    if ($can_next && $next_link != '') {
        self::_Do_Craw_Content($next_link, $md5sum_to_check);
    }
}
/**
 * Parse a fetched article page and save it as a Model_Horoscope record.
 *
 * Extracts the summary (h2.detail_sapo), body (div.detail_content) and
 * publication date (div.detail_date) from the HTML, downloads the thumbnail
 * image, saves the record and, on success, deletes the matching row from the
 * temporary table via Model_HoroscopeTempBLL::DeleteByURI(). Progress and
 * failures are reported through _print_to_console().
 *
 * @param string $_HTML_CONTENT Raw HTML of the article page.
 * @param string $_URI          Source URL of the article.
 * @param string $_IMG          URL of the thumbnail image.
 * @param string $_TITLE        Article title.
 * @return void
 */
private function _process_tmp_content($_HTML_CONTENT, $_URI, $_IMG, $_TITLE)
{
    if ($_HTML_CONTENT == '') {
        $this->_print_to_console(__("<--- NOTHING TO DO --->"));
        return;
    }

    // Build a DOM from the HTML string.
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($_HTML_CONTENT);
    unset($_HTML_CONTENT);

    $horoscope = new Model_Horoscope();

    // --- summary ---------------------------------------------------------
    $summary = $html->find('h2[class="detail_sapo"]', 0);
    if ($summary) {
        $sum_tmp = trim($summary->plaintext);
        // NOTE(review): the original search string rendered as a plain space,
        // which made this call a no-op; presumably a non-breaking space was
        // intended, so both its entity and UTF-8 forms are normalised here.
        // Confirm against real page content.
        $sum_tmp = str_replace(array('&nbsp;', "\xC2\xA0"), ' ', $sum_tmp);
        // Drop the source-site prefix.
        $sum_tmp = str_replace(array('(2sao) -', '(2sao)-', '(2Sao) -', '(2Sao)-'), '', $sum_tmp);
        $horoscope->summary = trim($sum_tmp);
    } else {
        $horoscope->summary = 'not-set';
    }

    // --- body ------------------------------------------------------------
    $content = $html->find('div[class="detail_content"]', 0);
    if ($content) {
        $content_arr = $content->innertext;
        // Strip HTML comments.
        $content_arr = preg_replace('/(<!--.+?-->)/s', '', $content_arr);
        // Unwrap links but keep their inner markup. The \b keeps the pattern
        // from also matching other tags that start with "a" (abbr, article...).
        $content_arr = preg_replace('@<a\b[^>]*>(.*)</a>@ismUx', '$1', $content_arr);
        // Removes empty <style></style> pairs only.
        $content_arr = preg_replace('/<style><\\/style>/ismxU', '', $content_arr);
        // Collapse the markup onto a single line.
        $content_arr = str_replace(array("\r\n", "\r", "\n", "\t"), '', $content_arr);
        $horoscope->content = trim($content_arr);
    }

    // --- publication date --------------------------------------------------
    // Expected text looks like "Thứ hai, 13/09/2010 23:25". Anything that
    // does not contain a dd/mm/yyyy hh:mm part, or that strtotime() rejects,
    // falls back to the current time instead of raising undefined-offset
    // notices or storing false.
    $timestamp = false;
    $date_create = $html->find('div[class="detail_date"]', 0);
    if ($date_create) {
        $date_pattern = '@(\d{1,2})/(\d{1,2})/(\d{4})\s+(\d{1,2}:\d{2})@';
        if (preg_match($date_pattern, $date_create->plaintext, $parts)) {
            $str_date = $parts[3] . "-" . $parts[2] . "-" . $parts[1] . " " . $parts[4] . ":00";
            $timestamp = strtotime($str_date);
        }
    }
    $horoscope->date_created = ($timestamp !== false) ? $timestamp : time();

    // --- thumbnail -----------------------------------------------------------
    // Try to store the image locally; keep the remote URL when that fails.
    $path = 'assets/horoscope/images/';
    $r = Vendor_Crawler::get_file_from_url_by_curl($_IMG, $path, '');
    if ($r != false) {
        $horoscope->img_thumb = "/" . $r;
    } else {
        $horoscope->img_thumb = $_IMG;
    }

    // --- remaining fields and save -------------------------------------------
    $horoscope->title = $_TITLE;
    $horoscope->source_uri = $_URI;
    $horoscope->page_path = Vendor_Crawler::toAscii($_TITLE);
    $horoscope->meta_keys = implode(',', explode(' ', $_TITLE));
    $horoscope->meta_desc = $horoscope->summary;
    $horoscope->save();

    if ($horoscope->identifier()) {
        // Saved: the temporary row for this URI is no longer needed.
        if (Model_HoroscopeTempBLL::DeleteByURI($_URI) > 0) {
            $this->_print_to_console(__("DONE: " . $_URI));
        } else {
            $this->_print_to_console(__("<--- FALSE TO DELETE IN TMP TABLE! --->"));
        }
    } else {
        $this->_print_to_console(__("<--- FALSE TO SAVE NEW RECORD! --->"));
    }

    $html->clear();
    unset($html);
}