Ejemplo n.º 1
0
 /**
  * Crawls one pending link of this source and stores it in the archive.
  *
  * Takes the lowest-id row of tbl_link with status = 0 for $this->_source_id,
  * downloads the page, reads the title, og:image and meta tags, and collects
  * the "#editorReview" block of page 1 plus, when it can be fetched, the same
  * block of page 2 (the link URL with "2/" appended). The collected values are
  * passed to yii_insert_row() and the row is handed to crawlerSuccess().
  *
  * The script is terminated with die() when no pending link exists or when the
  * first page or its review block cannot be read.
  *
  * @return void
  */
 public function actionDetail()
 {
     // The source id is bound as a parameter instead of being concatenated into the SQL text.
     $query = "SELECT * FROM tbl_link WHERE source_id = :source_id AND status = 0 ORDER BY id LIMIT 1";
     $row = $this->db_crawler->createCommand($query)
         ->bindValue(':source_id', $this->_source_id)
         ->queryRow();
     if (empty($row)) {
         die('het roi');
     }
     echo "<pre>" . print_r($row, true) . "</pre>";

     $url = $row['url'];
     $response = fectchContent($url);
     $html = str_get_html($response);
     if (empty($html)) {
         // Nothing parseable came back; without this guard the next find() call is a fatal error.
         // NOTE(review): 2 is assumed to be the failure status accepted by crawlerSuccess() - confirm.
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // find(..., 0) returns null when nothing matches, so every node is checked before use.
     $title_node = $html->find('title', 0);
     $title = empty($title_node) ? '' : trim($title_node->innertext());
     $title = str_replace(' - CNET', '', $title);

     $og_image = $html->find('meta[property="og:image"]', 0);
     $thumbnail = empty($og_image) ? '' : trim($og_image->content);

     // Both meta values stay null when the tag is absent.
     $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
     if (!empty($meta_keywords)) {
         $meta_keywords = trim($meta_keywords->content);
     }
     $meta_description = $html->find('meta[name="description"]', 0);
     if (!empty($meta_description)) {
         $meta_description = trim($meta_description->content);
     }

     // Page 1: the review body is mandatory.
     $content1 = $html->find('#editorReview', 0);
     if (empty($content1)) {
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }
     // processContent()'s return value is not used; presumably it cleans the node in place - verify.
     $this->processContent($title, $content1);

     // Page 2: optional, so a failed download or a missing block only leaves this part empty.
     // Assumes the stored URL ends with "/" so that appending "2/" yields the second page - TODO confirm.
     $content2 = '';
     $html_page2 = file_get_html($url . '2/');
     if (!empty($html_page2)) {
         $node_page2 = $html_page2->find('#editorReview', 0);
         if (!empty($node_page2)) {
             $this->processContent($title, $node_page2);
             $content2 = $node_page2;
         }
     }

     // Concatenation relies on the DOM nodes converting themselves to strings.
     $content = $content1 . $content2;

     // No tags or gallery images are collected for this source.
     $tags = array();
     $list_images = array();
     $values = array(
         'parent_id' => $row['parent_id'],
         'cate_id' => $row['cate_id'],
         'title' => $title,
         'thumbnail' => $thumbnail,
         'gallery' => json_encode($list_images),
         'tags' => json_encode($tags),
         'content' => $content,
         'source_id' => $this->_source_id,
         'source_url' => $row['url'],
         'short_text' => $row['short_text'],
         'meta_keywords' => $meta_keywords,
         'meta_description' => $meta_description,
         'created' => time(),
     );
     yii_insert_row('archive', $values, 'db_crawler');
     $this->crawlerSuccess($row);
 }
Ejemplo n.º 2
0
 /**
  * Crawls one pending link of this source and stores it in the archive.
  *
  * Takes the lowest-id row of tbl_link with status = 0 for $this->_source_id,
  * downloads the page and extracts: the og:image thumbnail, the images of the
  * ".slideshowify" gallery, the "div.article-entry" body (with ads, bylines,
  * scripts and similar blocks blanked and links turned into bold text), the
  * tag list, the title and the keywords/description meta tags. The values are
  * passed to yii_insert_row() and the row is handed to crawlerSuccess().
  *
  * The script is terminated with die() when no pending link exists, or - after
  * calling crawlerSuccess($row, 2) - when the page or its body cannot be read.
  *
  * @return void
  */
 public function actionDetail()
 {
     // The source id is bound as a parameter instead of being concatenated into the SQL text.
     $query = "SELECT * FROM tbl_link WHERE source_id = :source_id AND status = 0 ORDER BY id LIMIT 1";
     $row = $this->db_crawler->createCommand($query)
         ->bindValue(':source_id', $this->_source_id)
         ->queryRow();
     if (empty($row)) {
         die('het roi');
     }

     $url = $row['url'];
     $response = fectchContent($url);
     $html = str_get_html($response);
     if (empty($html)) {
         // Nothing parseable came back; without this guard the next find() call is a fatal error.
         echo '<h1>Error</h1>';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Thumbnail; find(..., 0) returns null when the tag is missing.
     $og_image = $html->find('meta[property="og:image"]', 0);
     $thumbnail = empty($og_image) ? '' : trim($og_image->content);

     // Gallery images: the full-size URL is kept in a data attribute of each <img>.
     $gallery = array();
     $html_gallery = $html->find('.slideshowify', 0);
     if (!empty($html_gallery)) {
         $attr = 'data-full-size-image';
         foreach ($html_gallery->find('img') as $img) {
             $gallery['images'][] = trim($img->{$attr});
         }
     }

     // Article body.
     $content = $html->find('div.article-entry', 0);
     if (empty($content)) {
         echo '<h1>Error</h1>';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Blank out everything that must not end up in the archived body.
     $remove_elements = array('.byline', '.inset-ad', '.inset-sm', '.social-share', '.slideshowify', '.aside-related-articles', '.native-ad-mobile', 'script', 'small');
     foreach ($content->find(implode(', ', $remove_elements)) as $item) {
         $item->outertext = '';
     }

     // Neutralise links: empty and in-page (#...) anchors are dropped, the rest become bold text.
     foreach ($content->find('a') as $item) {
         $href = trim($item->href);
         if (empty($href) || strpos($href, '#') === 0) {
             $item->outertext = '';
         } else {
             $inner_text = trim($item->innertext());
             $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
         }
     }

     // Tag list: prefer the accordion links, fall back to the tag items of the article header.
     $tags = array();
     $html_tags = $html->find('div.accordion a');
     if (!empty($html_tags)) {
         foreach ($html_tags as $item) {
             $tags[] = trim($item->innertext());
         }
     } else {
         foreach ($html->find('.article-header .tag-item') as $item) {
             $tag_link = $item->find('a', 0);
             // A tag item without a link has no text to take.
             if (!empty($tag_link)) {
                 $tags[] = trim($tag_link->innertext());
             }
         }
     }

     // Title with every run of whitespace collapsed into a single space.
     $title_node = $html->find('title', 0);
     $title = empty($title_node) ? '' : trim($title_node->innertext());
     $title = preg_replace('/\s+/', ' ', $title);

     // Both meta values stay null when the tag is absent.
     $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
     if (!empty($meta_keywords)) {
         $meta_keywords = trim($meta_keywords->content);
     }
     $meta_description = $html->find('meta[name="description"]', 0);
     if (!empty($meta_description)) {
         $meta_description = trim($meta_description->content);
     }

     // Serialise the cleaned body and strip the two marker comments.
     $markers = array('<!-- Begin: Wordpress Article Content -->', '<!-- End: Wordpress Article Content -->');
     $content = str_replace($markers, '', $content->outertext);

     $values = array(
         'parent_id' => $row['parent_id'],
         'cate_id' => $row['cate_id'],
         'title' => str_replace(' | TechCrunch', '', $title),
         'thumbnail' => $thumbnail,
         'tags' => json_encode($tags),
         'content' => $content,
         'source_id' => $this->_source_id,
         'source_url' => $url,
         'short_text' => $row['short_text'],
         'meta_keywords' => $meta_keywords,
         'meta_description' => $meta_description,
         'gallery' => json_encode($gallery),
         'created' => time(),
     );
     echo "<pre>" . print_r($row, true) . "</pre>";
     yii_insert_row('archive', $values, 'db_crawler');
     $this->crawlerSuccess($row);
 }
Ejemplo n.º 3
0
 /**
  * Crawls one pending link of this source and stores it in the archive.
  *
  * Takes the lowest-id row of tbl_link with status = 0 for $this->_source_id,
  * downloads the page and extracts the og:image thumbnail, the ".article-body"
  * block (with links turned into bold text), the title and the
  * keywords/description meta tags. Gallery and tags are always stored as empty
  * lists for this source. The values are passed to yii_insert_row() and the row
  * is handed to crawlerSuccess().
  *
  * The script is terminated with die() when no pending link exists, or - after
  * calling crawlerSuccess($row, 2) - when the page or its body cannot be read.
  *
  * @return void
  */
 public function actionDetail()
 {
     // The source id is bound as a parameter instead of being concatenated into the SQL text.
     $query = "SELECT * FROM tbl_link WHERE source_id = :source_id AND status = 0 ORDER BY id LIMIT 1";
     $row = $this->db_crawler->createCommand($query)
         ->bindValue(':source_id', $this->_source_id)
         ->queryRow();
     if (empty($row)) {
         die('het roi');
     }
     echo "<pre>" . print_r($row, true) . "</pre>";

     $url = $row['url'];
     $response = fectchContent($url);
     $html = str_get_html($response);
     if (empty($html)) {
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Thumbnail; find(..., 0) returns null when the tag is missing.
     $og_image = $html->find('meta[property="og:image"]', 0);
     $thumbnail = empty($og_image) ? '' : trim($og_image->content);

     // Article body; without it the link rewriting below would call find() on null.
     $content = $html->find('.article-body', 0);
     if (empty($content)) {
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Gallery and tag extraction are not implemented for this source.
     $list_images = array();
     $tags = array();

     // Neutralise links: empty and in-page (#...) anchors are dropped, the rest become bold text.
     foreach ($content->find('a') as $item) {
         $href = trim($item->href);
         if (empty($href) || strpos($href, '#') === 0) {
             $item->outertext = '';
         } else {
             $inner_text = trim($item->innertext());
             $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
         }
     }

     $title_node = $html->find('title', 0);
     $title = empty($title_node) ? '' : trim($title_node->innertext());

     // Both meta values stay null when the tag is absent.
     $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
     if (!empty($meta_keywords)) {
         $meta_keywords = trim($meta_keywords->content);
     }
     $meta_description = $html->find('meta[name="description"]', 0);
     if (!empty($meta_description)) {
         $meta_description = trim($meta_description->content);
     }

     $values = array(
         'parent_id' => $row['parent_id'],
         'cate_id' => $row['cate_id'],
         'title' => str_replace(' | Windows Central', '', $title),
         'thumbnail' => $thumbnail,
         'gallery' => json_encode($list_images),
         'tags' => json_encode($tags),
         'content' => $content->outertext,
         'source_id' => $this->_source_id,
         'source_url' => $url,
         'short_text' => $row['short_text'],
         'meta_keywords' => $meta_keywords,
         'meta_description' => $meta_description,
         'created' => time(),
     );
     yii_insert_row('archive', $values, 'db_crawler');
     $this->crawlerSuccess($row);
 }
Ejemplo n.º 4
0
 /**
  * Crawls one pending link of this source and stores it in the archive.
  *
  * Takes the lowest-id row of tbl_link with status = 0 for $this->_source_id,
  * downloads the page and extracts the og:image thumbnail, the ".article-body"
  * block (with scripts, styles, the media gallery and device boxes blanked and
  * links turned into bold text), the images of the ".media-gallery" inside the
  * body, the title and the keywords/description meta tags. Tags are always
  * stored as an empty list. The values are passed to yii_insert_row() and the
  * row is handed to crawlerSuccess().
  *
  * The script is terminated with die() when no pending link exists, or - after
  * calling crawlerSuccess($row, 2) - when the page or its body cannot be read.
  *
  * @return void
  */
 public function actionDetail()
 {
     // The source id is bound as a parameter instead of being concatenated into the SQL text.
     $query = "SELECT * FROM tbl_link WHERE source_id = :source_id AND status = 0 ORDER BY id LIMIT 1";
     $row = $this->db_crawler->createCommand($query)
         ->bindValue(':source_id', $this->_source_id)
         ->queryRow();
     if (empty($row)) {
         die('het roi');
     }
     echo "<pre>" . print_r($row, true) . "</pre>";

     $url = $row['url'];
     $response = fectchContent($url);
     $html = str_get_html($response);
     if (empty($html)) {
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Thumbnail; find(..., 0) returns null when the tag is missing.
     $og_image = $html->find('meta[property="og:image"]', 0);
     $thumbnail = empty($og_image) ? '' : trim($og_image->content);

     // Article body.
     $content = $html->find('.article-body', 0);
     if (empty($content)) {
         echo 'Error';
         $this->crawlerSuccess($row, 2);
         die;
     }

     // Collect gallery images, if the body has a gallery. The big-image path is
     // kept in a data attribute and is prefixed with $this->_domain.
     $list_images = array();
     $media = $content->find('.media-gallery', 0);
     if (!empty($media)) {
         $attr = 'data-big';
         $list_images['title'] = '';
         foreach ($media->find('img') as $img) {
             $list_images['images'][] = $this->_domain . trim($img->{$attr});
         }
     }

     // Blank out everything that must not end up in the archived body.
     $remove_elements = array('script', 'noscript', '.media-gallery', '.devicebox', 'style');
     foreach ($content->find(implode(', ', $remove_elements)) as $item) {
         $item->outertext = '';
     }

     // Neutralise links: empty and in-page (#...) anchors are dropped, the rest become bold text.
     foreach ($content->find('a') as $item) {
         $href = trim($item->href);
         if (empty($href) || strpos($href, '#') === 0) {
             $item->outertext = '';
         } else {
             $inner_text = trim($item->innertext());
             $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
         }
     }

     // No tags are collected for this source.
     $tags = array();

     $title_node = $html->find('title', 0);
     $title = empty($title_node) ? '' : trim($title_node->innertext());

     // Both meta values stay null when the tag is absent.
     $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
     if (!empty($meta_keywords)) {
         $meta_keywords = trim($meta_keywords->content);
     }
     $meta_description = $html->find('meta[name="description"]', 0);
     if (!empty($meta_description)) {
         $meta_description = trim($meta_description->content);
     }

     $values = array(
         'parent_id' => $row['parent_id'],
         'cate_id' => $row['cate_id'],
         'title' => str_replace(' | Android Central', '', $title),
         'thumbnail' => $thumbnail,
         'gallery' => json_encode($list_images),
         'tags' => json_encode($tags),
         'content' => $content->outertext,
         'source_id' => $this->_source_id,
         'source_url' => $url,
         'short_text' => $row['short_text'],
         'meta_keywords' => $meta_keywords,
         'meta_description' => $meta_description,
         'created' => time(),
     );
     yii_insert_row('archive', $values, 'db_crawler');
     $this->crawlerSuccess($row);
 }