/**
 * Works through the queue of unprocessed XungHop links in batches.
 *
 * For each row: when the URI is already known to Model_Horoscope_XungHopBLL
 * the row is only marked as done; otherwise the page is downloaded and the
 * result is passed to Model_Horoscope_XungHopBLL::_Process_Recieved_Content().
 *
 * Batches are fetched in a loop (instead of the method calling itself) so the
 * call stack does not grow with the size of the queue.
 *
 * @return false|null false when the very first fetch yields no rows, null otherwise
 */
private function do_get_temp()
{
    $num_of_rows = 10;
    $is_first_batch = true;
    $previous_ids = null;

    while (true) {
        $rows = Model_Horoscope_XungHopLinkBLL::GetRecords_UnProccessed($num_of_rows);
        if (!$rows) {
            // Nothing (left) to do. Only an empty queue on entry reports false.
            if ($is_first_batch) {
                return false;
            }
            return null;
        }
        $is_first_batch = false;

        $current_ids = array();
        foreach ($rows as $row) {
            $_ID = $row['id'];
            $_URI = $row['uri'];
            $current_ids[] = $_ID;

            // check uri in story table
            if (Model_Horoscope_XungHopBLL::CheckRecordByURI($_URI)) {
                // (dup) existed => no need to insert => just flag the temp row
                Model_Horoscope_XungHopLinkBLL::MarkAsDone($_ID, $_URI);
                continue;
            }

            // begin get content
            $content = Vendor_Crawler::get_content_from_uri_by_curl($_URI);
            if ($content) {
                // process content just got
                Model_Horoscope_XungHopBLL::_Process_Recieved_Content(
                    $content['output'],
                    $row['cung_1'],
                    $row['cung_2'],
                    $row['summary'],
                    $_URI,
                    $_ID,
                    $row['hinh_anh']
                );
            } else {
                Model_Horoscope_XungHopBLL::_print_to_console(__("<--- CANT GET CONTENT --->"));
            }
        }
        unset($rows);

        // A failed fetch does not mark the row as done in this method, so the
        // same batch can come back again and again. When two consecutive
        // batches carry exactly the same ids (i.e. after one retry) nothing
        // was consumed, so stop instead of spinning forever.
        if ($current_ids === $previous_ids) {
            Model_Horoscope_XungHopBLL::_print_to_console(__("<--- NO PROGRESS, STOP --->"));
            return null;
        }
        $previous_ids = $current_ids;
    }
}
/**
 * Crawl a story listing page, queue every story that is not known yet and
 * follow the "?p=N" pagination until the last crawled story is reached.
 *
 * Stories are read from div#TopicList > div.cont09. A story is queued only
 * when its image, link (a.lnk02) and summary (p.cont09txt) are all present.
 * The two "cung" values are extracted from the ASCII slug of the title;
 * when neither pattern matches both are stored as '-'.
 *
 * @param string $start_uri       Listing URL to start from.
 * @param string $md5sum_to_check md5 of the link of the last story that has
 *                                already been crawled; crawling stops as soon
 *                                as a story with this checksum is seen.
 * @return void
 */
public static function _Do_Craw_Content($start_uri, $md5sum_to_check)
{
    self::_print_to_console('Begin fetch URI:<br/>' . $start_uri);

    $content = Vendor_Crawler::get_content_from_uri_by_curl($start_uri);
    if (!$content) {
        self::_print_to_console('Cant get content for:<br/>' . $start_uri);
        return;
    }
    self::_print_to_console('Get content done!');

    $can_next = true;
    $next_link = '';
    $backet = array();

    // Create a DOM object and load the HTML from a string
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($content['output']);

    $container = $html->find('div[id="TopicList"]', 0);
    $stories = $container ? $container->find('div[class="cont09"]') : array();

    if (!$container) {
        self::_print_to_console('Cant get main container!');
    } elseif (count($stories) == 0) {
        self::_print_to_console('--> No story found!');
        $can_next = false;
    } else {
        foreach ($stories as $story) {
            // find title, image, summary => send to $backet
            $ok = true;
            $str = array();

            $img = $story->find('img', 0);
            if ($img) {
                $str['image'] = $img->src;
            } else {
                $ok = false;
            }
            unset($img);

            $lnk = $story->find('a[class="lnk02"]', 0);
            if ($lnk) {
                $str['md5sum'] = md5($lnk->href);
                // Strict comparison: with "==" two different all-digit "0e..."
                // hashes would be treated as equal numbers.
                if ($str['md5sum'] === (string) $md5sum_to_check) {
                    // Reached the last story that was already crawled. Leave
                    // the loop but keep what was collected so far so it still
                    // gets stored and the DOM still gets released below.
                    $can_next = false;
                    unset($lnk);
                    break;
                }

                $str['link'] = $lnk->href;
                $str['title'] = trim($lnk->plaintext);
                $str['slug'] = Vendor_Crawler::toAscii($str['title']);

                // "<cung 1>-va-<cung 2>-..." => two different signs
                $pattern = '/([a-z]+[-]+[a-z]+)[-]va[-]([a-z]+[-]+[a-z]+)[-].*/ismUx';
                preg_match($pattern, $str['slug'], $matches);
                if (isset($matches[1]) && isset($matches[2])) {
                    $str['cung_1'] = $matches[1];
                    $str['cung_2'] = $matches[2];
                } else {
                    // "hai-nguoi-cung-<cung>-..." => same sign on both sides
                    $pattern = '/hai[-]nguoi[-]cung[-]+([a-z]+[-]+[a-z]+)[-].*/ismUx';
                    preg_match($pattern, $str['slug'], $matches);
                    if (isset($matches[1])) {
                        $str['cung_1'] = $str['cung_2'] = $matches[1];
                    } else {
                        $str['cung_1'] = $str['cung_2'] = '-';
                    }
                }
            } else {
                $ok = false;
            }
            unset($lnk);

            $summary = $story->find('p[class="cont09txt"]', 0);
            if ($summary) {
                $str['summary'] = trim($summary->plaintext);
            } else {
                $ok = false;
            }

            if ($ok) {
                // 24-hour clock; "h" would store an ambiguous 12-hour value
                $str['date_check'] = date("Y-m-d H:i:s");
                $backet[] = $str;
            }
        } //EOF foreach

        // get next page link
        if ($can_next) {
            $tmp = explode('?p=', $start_uri);
            if (isset($tmp[1])) {
                $next_link = $tmp[0] . "?p=" . (intval($tmp[1]) + 1);
            } else {
                // no page parameter yet => next page is the second
                $next_link = $tmp[0] . "?p=2";
            }
        }

        // insert backet into db; the counters are initialised up front
        // because the summary line below is printed even for an empty backet
        $succ = 0;
        $failure = 0;
        $dupp = 0;
        foreach ($backet as $bag) {
            if (Model_Horoscope_XungHopLinkBLL::CheckRecordByMd5Sum($bag['md5sum'])) {
                $dupp++;
                continue;
            }

            $item = new Model_Horoscope_XungHopLink();
            $item->uri = $bag['link'];
            $item->md5sum = $bag['md5sum'];
            $item->title = $bag['title'];
            $item->hinh_anh = $bag['image'];
            $item->ngay_check = $bag['date_check'];
            $item->summary = $bag['summary'];
            $item->cung_1 = $bag['cung_1'];
            $item->cung_2 = $bag['cung_2'];
            $item->hoan_thanh = false;
            $item->save();

            if ($item->identifier()) {
                $succ++;
            } else {
                $failure++;
            }
        }

        self::_print_to_console('--> INSERT: ' . $succ . ' (ok) , ' . $failure . ' (false) , ' . $dupp . ' (duplicate) <-- <br/>');
        flush();
    }

    // release the DOM before going on
    $html->clear();
    unset($html);
    unset($content);

    // sleep a while (usleep takes microseconds, so this is 2.5 ms)
    usleep(2500);

    // do recursive
    if ($can_next && $next_link != '') {
        self::_Do_Craw_Content($next_link, $md5sum_to_check);
    }
}
/**
 * Crawl one listing page, store every story that is not known yet in the
 * temp table and continue with the next page taken from the pager.
 *
 * Crawling stops when $uri itself, or the link of a story on the page,
 * equals $uri_to_check.
 *
 * @param string $_DOMAIN      Prefix that is prepended to every href found on the page.
 * @param string $uri          Absolute URL of the listing page to crawl.
 * @param string $uri_to_check Absolute URL at which crawling has to stop.
 * @return void
 */
private function _do_craw($_DOMAIN, $uri, $uri_to_check)
{
    // Do not download the page when it is the stop marker itself.
    $content = null;
    if ($uri != $uri_to_check) {
        $content = Vendor_Crawler::get_content_from_uri_by_curl($uri);
    }
    if (!$content) {
        // content not found or cant go next
        $this->_print_to_console(__("Content not found or cant go next!"));
        return;
    }
    $this->_print_to_console('Done get content!');

    // Create a DOM object and load the HTML from a string
    require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
    $html = new simple_html_dom();
    $html->load($content['output']);

    $can_next = true;
    $next_link = '';
    $basket = array();

    // story 1: link and image are both looked up inside div#nbm
    $story_1 = $html->find('div[id="nbm"]', 0);
    if ($story_1) {
        echo "FOUND STORY 1!" . PHP_EOL;
        $can_next = $this->_do_craw_collect(
            $basket,
            0,
            $story_1->find('a', 0),
            $story_1->find('img', 0),
            $_DOMAIN,
            $uri_to_check
        );
    }
    unset($story_1);

    // stories 2-4: the image is looked up inside the link
    $list_items = array(1 => 'li[id="nbm_1"]', 2 => 'li[id="nbm_2"]', 3 => 'li[id="nbm_3"]');
    foreach ($list_items as $key => $selector) {
        if (!$can_next) {
            break;
        }
        $node = $html->find($selector, 0);
        if (!$node) {
            continue;
        }
        echo "FOUND STORY " . ($key + 1) . "!" . PHP_EOL;
        $lnk = $node->find('a', 0);
        // only search inside the link when there is one
        $img = $lnk ? $lnk->find('img', 0) : null;
        $can_next = $this->_do_craw_collect($basket, $key, $lnk, $img, $_DOMAIN, $uri_to_check);
        unset($node, $lnk, $img);
    }

    // remaining stories: children of div#listNews, two stories per child;
    // the first child (index 0) is skipped
    if ($can_next) {
        $others = $html->find('div[id="listNews"]', 0);
        if ($others) {
            echo "BEGIN FIND OTHERS!" . PHP_EOL;
            $count = count($others->children());
            for ($i = 1; $i < $count && $can_next; $i++) {
                $row = $others->children($i);
                for ($slot = 0; $slot < 2 && $can_next; $slot++) {
                    $cell = $row->children($slot);
                    if (!$cell) {
                        continue;
                    }
                    // here the story link is the second anchor of the cell
                    $can_next = $this->_do_craw_collect(
                        $basket,
                        $i * 3 + 1 + $slot,
                        $cell->find('a', 1),
                        $cell->find('img', 0),
                        $_DOMAIN,
                        $uri_to_check
                    );
                    unset($cell);
                }
                unset($row);
            }
        }
        unset($others);
    }

    // begin insert to db
    if (count($basket) > 0) {
        foreach ($basket as $ball) {
            $link = $_DOMAIN . $ball['link'];
            if (Model_HoroscopeBLL::CheckStoryByUri($link)) {
                echo "DO NOTHING!!!" . PHP_EOL;
                continue;
            }
            if (Model_HoroscopeTempBLL::CheckRecordByURI($link)) {
                $this->_print_to_console(__("URL existed, DO NOTHING!!!"));
                continue;
            }

            $c = new Model_HoroscopeTemp();
            $c->title = Vendor_Crawler::unhtmlentities($ball['title']);
            $c->uri = $link;
            // only get image with width = 140px (max = 360px)
            // http://imgthumb.2sao.vietnamnet.vn/ThumbImages/2010/09/13/22/15/A19_360.jpg
            $c->img = ($ball['img'] === null) ? null : $this->_do_craw_thumb_uri($ball['img']);
            $c->save();
            unset($c);
            $this->_print_to_console(__("INSERT DONE!!!"));
        }
    } else {
        $this->_print_to_console(__("<--- NOTHING TO INSERT --->"));
    }
    unset($basket);

    // find next page link: the anchor that follows the one with class "active"
    // (the first anchor counts as current when none is flagged)
    if ($can_next) {
        $pager = $html->find('div[class="pager"]', 0);
        if ($pager) {
            $links = $pager->find('a');
            $total = count($links);
            $cur_page = 0;
            for ($i = 0; $i < $total; $i++) {
                if ($links[$i]->class == 'active') {
                    $cur_page = $i;
                }
            }
            if (isset($links[$cur_page + 1])) {
                $next_link = $_DOMAIN . $links[$cur_page + 1]->href;
            }
        }
        unset($pager);
    }

    // release the DOM before going on
    $html->clear();
    unset($html);
    unset($content);

    // go to next page
    if ($next_link != '') {
        $this->_do_craw($_DOMAIN, $next_link, $uri_to_check);
    }
}

/**
 * Put one story into the basket unless its link is the stop marker.
 *
 * A story without a link is ignored, because there would be no URI to store.
 *
 * @param array  $basket       Collected stories; an entry is written to $basket[$key].
 * @param int    $key          Basket index to use for this story.
 * @param object $lnk          Anchor node of the story, or null when not found.
 * @param object $img          Image node of the story, or null when not found.
 * @param string $_DOMAIN      Prefix that is prepended to the href before comparing.
 * @param string $uri_to_check Absolute URL at which crawling has to stop.
 * @return bool false when the story is the stop marker, true otherwise
 */
private function _do_craw_collect(array &$basket, $key, $lnk, $img, $_DOMAIN, $uri_to_check)
{
    if (!$lnk) {
        return true;
    }
    if ($_DOMAIN . trim($lnk->href) == $uri_to_check) {
        return false;
    }

    $basket[$key] = array(
        'link' => $lnk->href,
        'title' => $lnk->title,
        'img' => $img ? $img->src : null,
    );

    return true;
}

/**
 * Rewrite the size suffix of a thumbnail file name to 140,
 * e.g. ".../A19_360.jpg" becomes ".../A19_140.jpg".
 *
 * The name is split on its last "." and last "_", so dots or underscores
 * earlier in the name are kept. A name without "_" gets "_140" appended.
 * When the URI has no file name it is returned unchanged.
 *
 * @param string $img_uri Image URI as found on the page.
 * @return string URL-decoded URI with the 140 suffix, or $img_uri unchanged
 */
private function _do_craw_thumb_uri($img_uri)
{
    $parts = explode("/", urldecode($img_uri));
    // pops and returns the last value of the array => the file name
    $file = array_pop($parts);
    if (!$file) {
        return $img_uri;
    }

    $dot = strrpos($file, '.');
    $base = ($dot === false) ? $file : substr($file, 0, $dot);
    $ext = ($dot === false) ? '' : substr($file, $dot); // keeps the leading dot

    $underscore = strrpos($base, '_');
    if ($underscore !== false) {
        $base = substr($base, 0, $underscore);
    }

    $parts[] = $base . '_140' . $ext;

    return implode("/", $parts);
}