示例#1
0
 /**
  * Work through the queue of unprocessed links in batches of 10.
  *
  * For every queued row: if the story table already knows the URI the row is
  * only marked as done; otherwise the page is downloaded and handed to
  * Model_Horoscope_XungHopBLL::_Process_Recieved_Content().
  *
  * Batches are fetched in a loop rather than by self-recursion, so a long
  * queue no longer grows the call stack by one frame per batch.
  *
  * @return false|null false when the very first fetch yields no rows, null otherwise
  */
 private function do_get_temp()
 {
     $is_first_batch = true;
     while (true) {
         $rows = Model_Horoscope_XungHopLinkBLL::GetRecords_UnProccessed(10);
         if (!$rows) {
             // Same contract as before: false only when there was nothing to do at all
             return $is_first_batch ? false : null;
         }
         $is_first_batch = false;

         // Number of rows of this batch that were marked done or handed to the processor
         $handled = 0;
         foreach ($rows as $row) {
             $_ID = $row['id'];
             $_URI = $row['uri'];
             // Check the URI against the story table
             if (Model_Horoscope_XungHopBLL::CheckRecordByURI($_URI)) {
                 // Duplicate: nothing to insert, just flag the queue row as done
                 Model_Horoscope_XungHopLinkBLL::MarkAsDone($_ID, $_URI);
                 $handled++;
                 continue;
             }
             $content = Vendor_Crawler::get_content_from_uri_by_curl($_URI);
             if ($content) {
                 Model_Horoscope_XungHopBLL::_Process_Recieved_Content($content['output'], $row['cung_1'], $row['cung_2'], $row['summary'], $_URI, $_ID, $row['hinh_anh']);
                 $handled++;
             } else {
                 Model_Horoscope_XungHopBLL::_print_to_console(__("<--- CANT GET CONTENT --->"));
             }
         }
         unset($rows);

         if ($handled === 0) {
             // Every download in this batch failed. Nothing in this method marks such
             // rows as done, so the next fetch would presumably return the same rows
             // again (TODO confirm against GetRecords_UnProccessed) and never finish.
             return null;
         }
     }
 }
示例#2
0
 /**
  * Crawl the listing page at $start_uri, queue every story found on it, then
  * continue with the following pages until the checkpoint story is reached.
  *
  * A story is queued only when its image, its "lnk02" anchor and its summary
  * paragraph are all present. Crawling stops at the first story whose URL md5
  * equals $md5sum_to_check; stories collected on that page before the
  * checkpoint are still stored.
  *
  * @param string $start_uri       Listing page to fetch; the next page is derived from its "?p=N" suffix
  * @param string $md5sum_to_check md5 checksum of the story URL that was crawled last time
  * @return void
  */
 public static function _Do_Craw_Content($start_uri, $md5sum_to_check)
 {
     self::_print_to_console('Begin fetch URI:<br/>' . $start_uri);
     $content = Vendor_Crawler::get_content_from_uri_by_curl($start_uri);
     if (!$content) {
         self::_print_to_console('Cant get content for:<br/>' . $start_uri);
         return;
     }
     self::_print_to_console('Get content done!');

     $can_next = true;
     $next_link = '';
     $backet = array();

     // Parse the downloaded HTML
     require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
     $html = new simple_html_dom();
     $html->load($content['output']);

     $container = $html->find('div[id="TopicList"]', 0);
     if ($container) {
         $stories = $container->find('div[class="cont09"]');
         if (count($stories) > 0) {
             foreach ($stories as $story) {
                 $img = $story->find('img', 0);
                 $lnk = $story->find('a[class="lnk02"]', 0);
                 $md5sum = $lnk ? md5($lnk->href) : null;
                 // Strict comparison: loosely compared, two all-digit "0e..." hashes are
                 // treated as numbers and would be considered equal.
                 if ($md5sum !== null && $md5sum === (string) $md5sum_to_check) {
                     // Checkpoint reached: stop scanning but keep what was collected so far
                     $can_next = false;
                     break;
                 }
                 $summary = $story->find('p[class="cont09txt"]', 0);
                 if (!$img || !$lnk || !$summary) {
                     // Incomplete story markup, skip it
                     continue;
                 }

                 $str = array();
                 $str['image'] = $img->src;
                 $str['md5sum'] = $md5sum;
                 $str['link'] = $lnk->href;
                 $str['title'] = trim($lnk->plaintext);
                 $str['slug'] = Vendor_Crawler::toAscii($str['title']);
                 // Derive the two signs from the slug; '-' marks "could not be determined"
                 $str['cung_1'] = $str['cung_2'] = '-';
                 $matches = array();
                 preg_match('/([a-z]+[-]+[a-z]+)[-]va[-]([a-z]+[-]+[a-z]+)[-].*/ismUx', $str['slug'], $matches);
                 if (isset($matches[1]) && isset($matches[2])) {
                     $str['cung_1'] = $matches[1];
                     $str['cung_2'] = $matches[2];
                 } else {
                     preg_match('/hai[-]nguoi[-]cung[-]+([a-z]+[-]+[a-z]+)[-].*/ismUx', $str['slug'], $matches);
                     if (isset($matches[1])) {
                         $str['cung_1'] = $str['cung_2'] = $matches[1];
                     }
                 }
                 $str['summary'] = trim($summary->plaintext);
                 // 24-hour clock; "h" would make morning and afternoon indistinguishable
                 $str['date_check'] = date("Y-m-d H:i:s");
                 $backet[] = $str;
             }

             // Work out the next page link unless the checkpoint was hit
             if ($can_next) {
                 $tmp = explode('?p=', $start_uri);
                 // Without a "?p=" suffix this was page 1, so the next page is the second
                 $next_page = isset($tmp[1]) ? intval($tmp[1]) + 1 : 2;
                 $next_link = $tmp[0] . "?p=" . $next_page;
             }

             // Store the collected stories; the counters always exist for the report below
             $succ = 0;
             $failure = 0;
             $dupp = 0;
             foreach ($backet as $bag) {
                 if (Model_Horoscope_XungHopLinkBLL::CheckRecordByMd5Sum($bag['md5sum'])) {
                     $dupp++;
                     continue;
                 }
                 $item = new Model_Horoscope_XungHopLink();
                 $item->uri = $bag['link'];
                 $item->md5sum = $bag['md5sum'];
                 $item->title = $bag['title'];
                 $item->hinh_anh = $bag['image'];
                 $item->ngay_check = $bag['date_check'];
                 $item->summary = $bag['summary'];
                 $item->cung_1 = $bag['cung_1'];
                 $item->cung_2 = $bag['cung_2'];
                 $item->hoan_thanh = false;
                 $item->save();
                 if ($item->identifier()) {
                     $succ++;
                 } else {
                     $failure++;
                 }
             }
             self::_print_to_console('--> INSERT: ' . $succ . ' (ok) , ' . $failure . ' (false) , ' . $dupp . ' (duplicate) <-- <br/>');
             flush();
         } else {
             self::_print_to_console('--> No story found!');
             $can_next = false;
         }
     } else {
         self::_print_to_console('Cant get main container!');
     }

     // Release the DOM explicitly before moving on to the next page
     $html->clear();
     unset($html);
     unset($content);

     // Short pause between requests (usleep takes microseconds, so this is 2.5 ms)
     usleep(2500);

     // Continue with the next page
     if ($can_next && $next_link != '') {
         self::_Do_Craw_Content($next_link, $md5sum_to_check);
     }
 }
示例#3
0
 /**
  * Crawl one listing page, queue its unseen stories in the temp table and
  * follow the pager to the next page.
  *
  * Stories are read in page order: the four headline slots ("nbm",
  * "nbm_1".."nbm_3"), then two stories from each child of "listNews" (the first
  * child is skipped). Collecting stops at the first story whose absolute URL
  * equals $uri_to_check; stories found before it are still stored.
  *
  * @param string $_DOMAIN      Prefix prepended to the relative links found on the page
  * @param string $uri          Listing page to fetch
  * @param string $uri_to_check Absolute URL of the checkpoint story (or page) at which to stop
  * @return void
  */
 private function _do_craw($_DOMAIN, $uri, $uri_to_check)
 {
     $can_next = ($uri != $uri_to_check);
     $next_link = '';
     // No request is made when the page itself is the checkpoint
     $content = $can_next ? Vendor_Crawler::get_content_from_uri_by_curl($uri) : false;
     if ($content) {
         $this->_print_to_console('Done get content!');
         // Parse the downloaded HTML
         require_once Kohana::find_file('classes', 'vendor/simple_html_dom');
         $html = new simple_html_dom();
         $html->load($content['output']);
         $basket = array();

         // Headline stories occupy basket slots 0..3
         $headline_selectors = array('div[id="nbm"]', 'li[id="nbm_1"]', 'li[id="nbm_2"]', 'li[id="nbm_3"]');
         foreach ($headline_selectors as $slot => $selector) {
             if (!$can_next) {
                 break;
             }
             $story = $html->find($selector, 0);
             if (!$story) {
                 continue;
             }
             echo "FOUND STORY " . ($slot + 1) . "!" . PHP_EOL;
             $lnk = $story->find('a', 0);
             if ($slot == 0) {
                 // The main headline image is looked up anywhere inside the story
                 $img = $story->find('img', 0);
             } else {
                 // The smaller headlines keep their image inside the anchor; the anchor may be missing
                 $img = $lnk ? $lnk->find('img', 0) : null;
             }
             $entry = $this->_do_craw_extract_story($_DOMAIN, $uri_to_check, $lnk, $img);
             if ($entry === false) {
                 $can_next = false;
             } elseif ($entry) {
                 $basket[$slot] = $entry;
             }
             unset($story);
         }

         if ($can_next) {
             $others = $html->find('div[id="listNews"]', 0);
             if ($others) {
                 echo "BEGIN FIND OTHERS!" . PHP_EOL;
                 $count = count($others->children());
                 // Start at child 1; each child holds two stories
                 for ($i = 1; $i < $count && $can_next; $i++) {
                     $row = $others->children($i);
                     for ($cell = 0; $cell < 2 && $can_next; $cell++) {
                         $st = $row->children($cell);
                         if (!$st) {
                             continue;
                         }
                         $entry = $this->_do_craw_extract_story($_DOMAIN, $uri_to_check, $st->find('a', 1), $st->find('img', 0));
                         if ($entry === false) {
                             $can_next = false;
                         } elseif ($entry) {
                             $basket[$i * 3 + 1 + $cell] = $entry;
                         }
                         unset($st);
                     }
                     unset($row);
                 }
             }
             unset($others);
         }

         // Store the collected stories
         if (count($basket) > 0) {
             foreach ($basket as $ball) {
                 $link = $_DOMAIN . $ball['link'];
                 if (Model_HoroscopeBLL::CheckStoryByUri($link) == false) {
                     if (Model_HoroscopeTempBLL::CheckRecordByURI($link) == false) {
                         $c = new Model_HoroscopeTemp();
                         $c->title = Vendor_Crawler::unhtmlentities($ball['title']);
                         $c->uri = $link;
                         // Always request the 140px thumbnail, e.g.
                         // http://imgthumb.2sao.vietnamnet.vn/ThumbImages/2010/09/13/22/15/A19_360.jpg
                         // becomes .../A19_140.jpg
                         $img_uri = $ball['img'];
                         if ($img_uri) {
                             $str_arr = explode("/", urldecode($img_uri));
                             // Last path segment is the file name, e.g. A19_360.jpg
                             $img_name = array_pop($str_arr);
                             if ($img_name) {
                                 // array_pad keeps list() safe when the name has no extension
                                 list($name, $ext) = array_pad(explode('.', $img_name), 2, '');
                                 // Keep the part before the first underscore, drop the old size
                                 list($ten_file) = explode('_', $name);
                                 $str_arr[] = $ten_file . '_' . '140' . '.' . $ext;
                                 $img_uri = implode("/", $str_arr);
                             }
                         }
                         $c->img = $img_uri;
                         $c->save();
                         unset($c);
                         $this->_print_to_console(__("INSERT DONE!!!"));
                     } else {
                         $this->_print_to_console(__("URL existed, DO NOTHING!!!"));
                     }
                 } else {
                     echo "DO NOTHING!!!" . PHP_EOL;
                 }
             }
         } else {
             $this->_print_to_console(__("<--- NOTHING TO INSERT --->"));
         }
         unset($basket);

         if ($can_next) {
             // Find the pager link that follows the one marked "active"
             $pager = $html->find('div[class="pager"]', 0);
             if ($pager) {
                 $links = $pager->find('a');
                 $total = count($links);
                 if ($total > 0) {
                     // Falls back to the first link when none is marked active
                     $cur_page = 0;
                     for ($i = 0; $i < $total; $i++) {
                         if ($links[$i]->class == 'active') {
                             $cur_page = $i;
                         }
                     }
                     if (isset($links[$cur_page + 1])) {
                         $next_link = $_DOMAIN . $links[$cur_page + 1]->href;
                     }
                 }
             }
             unset($pager);
         }
         // Release the DOM explicitly before moving on to the next page
         $html->clear();
         unset($html);
     } else {
         // Content not found, or the page is the checkpoint
         $this->_print_to_console(__("Content not found or cant go next!"));
     }
     unset($content);
     // Go to the next page
     if ($next_link != '') {
         $this->_do_craw($_DOMAIN, $next_link, $uri_to_check);
     }
 }

 /**
  * Turn one story's anchor/image pair into a basket entry for _do_craw().
  *
  * @param string      $_DOMAIN      Prefix prepended to the anchor's href before comparing
  * @param string      $uri_to_check Absolute URL of the checkpoint story
  * @param object|null $lnk          Anchor node of the story, or null when none was found
  * @param object|null $img          Image node of the story, or null when none was found
  * @return array|false|null array with 'link', 'title' and 'img' (img is null without an image);
  *                          false when the anchor is the checkpoint story;
  *                          null when the story has no anchor and must be skipped
  */
 private function _do_craw_extract_story($_DOMAIN, $uri_to_check, $lnk, $img)
 {
     if (!$lnk) {
         return null;
     }
     if ($_DOMAIN . trim($lnk->href) == $uri_to_check) {
         return false;
     }
     return array(
         'link' => $lnk->href,
         'title' => $lnk->title,
         'img' => $img ? $img->src : null,
     );
 }