/**
 * Crawls the next pending link of this source and archives the two-page review.
 *
 * Steps:
 *  1. Read the oldest tbl_link row with status = 0 for $this->_source_id.
 *  2. Download and parse the page, extract title (without the " - CNET"
 *     suffix), og:image thumbnail and the keyword/description meta tags.
 *  3. Take the #editorReview element of page 1 and, when available, of
 *     page 2 (same URL with "2/" appended) and run both through
 *     processContent().
 *  4. Insert the assembled record into `archive` and flag the link as done
 *     via crawlerSuccess().
 *
 * The request is terminated with die() when no pending link is left or when
 * page 1 cannot be parsed / has no #editorReview element.
 *
 * @return void
 */
public function actionDetail()
{
    $query = "SELECT * FROM tbl_link WHERE source_id = " . $this->_source_id . " AND status = 0 ORDER BY id LIMIT 1";
    $row = $this->db_crawler->createCommand($query)->queryRow();
    if (empty($row)) {
        die('het roi');
    }
    echo "<pre>" . print_r($row, true) . "</pre>";

    $url = $row['url'];
    $response = fectchContent($url);
    $html = str_get_html($response);
    if (empty($html)) {
        // str_get_html() yields false for empty/oversized input; without this
        // guard the find() calls below would be a fatal error and the same row
        // would be picked up again on every run.
        // NOTE(review): status 2 presumably means "failed" - confirm that
        // crawlerSuccess() in this class accepts the status argument.
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // find(..., 0) returns null when nothing matches, so never chain on it directly.
    $title_node = $html->find('title', 0);
    $title = empty($title_node) ? '' : trim($title_node->innertext());
    $title = str_replace(' - CNET', '', $title);

    $thumbnail_node = $html->find('meta[property="og:image"]', 0);
    $thumbnail = empty($thumbnail_node) ? '' : trim($thumbnail_node->content);

    // Both meta values stay null when the tag is absent.
    $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
    if (!empty($meta_keywords)) {
        $meta_keywords = trim($meta_keywords->content);
    }
    $meta_description = $html->find('meta[name="description"]', 0);
    if (!empty($meta_description)) {
        $meta_description = trim($meta_description->content);
    }

    // Page 1: mandatory review body.
    $content1 = $html->find('#editorReview', 0);
    if (empty($content1)) {
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }
    $this->processContent($title, $content1);

    // Page 2: optional continuation; a missing or unparsable page simply
    // contributes nothing instead of aborting the whole crawl.
    $content2 = '';
    $html_page2 = file_get_html($url . '2/');
    if (!empty($html_page2)) {
        $review_page2 = $html_page2->find('#editorReview', 0);
        if (!empty($review_page2)) {
            $this->processContent($title, $review_page2);
            $content2 = $review_page2;
        }
    }

    // String concatenation converts the DOM nodes to their markup.
    $content = $content1 . $content2;

    // Tags and gallery are not extracted for this source; stored as empty lists.
    $tags = array();
    $list_images = array();

    $values = array(
        'parent_id' => $row['parent_id'],
        'cate_id' => $row['cate_id'],
        'title' => $title,
        'thumbnail' => $thumbnail,
        'gallery' => json_encode($list_images),
        'tags' => json_encode($tags),
        'content' => $content,
        'source_id' => $this->_source_id,
        'source_url' => $row['url'],
        'short_text' => $row['short_text'],
        'meta_keywords' => $meta_keywords,
        'meta_description' => $meta_description,
        'created' => time(),
    );
    yii_insert_row('archive', $values, 'db_crawler');
    $this->crawlerSuccess($row);
}
/**
 * Crawls the next pending link of this source and archives the article.
 *
 * Reads the oldest tbl_link row with status = 0 for $this->_source_id,
 * downloads the page, extracts thumbnail, slideshow gallery, cleaned
 * div.article-entry content, tags, title (without the " | TechCrunch"
 * suffix) and meta tags, inserts the record into `archive` and flags the
 * link as done via crawlerSuccess().
 *
 * The request is terminated with die() when no pending link is left, or
 * after flagging the link with status 2 when the page cannot be parsed or
 * has no article body.
 *
 * @return void
 */
public function actionDetail()
{
    $query = "SELECT * FROM tbl_link WHERE source_id = " . $this->_source_id . " AND status = 0 ORDER BY id LIMIT 1";
    $row = $this->db_crawler->createCommand($query)->queryRow();
    if (empty($row)) {
        die('het roi');
    }

    $url = $row['url'];
    $response = fectchContent($url);
    $html = str_get_html($response);
    if (empty($html)) {
        // str_get_html() yields false on empty/oversized input; calling find()
        // on it would be a fatal error and the row would be retried forever.
        echo '<h1>Error</h1>';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Thumbnail: find(..., 0) returns null when the tag is absent.
    $thumbnail_node = $html->find('meta[property="og:image"]', 0);
    $thumbnail = empty($thumbnail_node) ? '' : trim($thumbnail_node->content);

    // Collect the gallery images of the slideshow, if any.
    $gallery = array();
    $html_gallery = $html->find('.slideshowify', 0);
    if (!empty($html_gallery)) {
        $attr = 'data-full-size-image';
        foreach ($html_gallery->find('img') as $img) {
            $src = trim($img->{$attr});
            // Skip images that lack the attribute instead of storing ''.
            if ($src !== '') {
                $gallery['images'][] = $src;
            }
        }
    }

    // Article body.
    $content = $html->find('div.article-entry', 0);
    if (empty($content)) {
        echo '<h1>Error</h1>';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Strip bylines, ads, share widgets, the slideshow and other non-article nodes.
    $remove_elements = array('.byline', '.inset-ad', '.inset-sm', '.social-share', '.slideshowify', '.aside-related-articles', '.native-ad-mobile', 'script', 'small');
    foreach ($content->find(implode(', ', $remove_elements)) as $item) {
        $item->outertext = '';
    }

    // Neutralise links: drop empty/anchor-only links, turn the rest into bold text.
    foreach ($content->find('a') as $item) {
        $href = trim($item->href);
        if (empty($href) || strpos($href, '#') === 0) {
            $item->outertext = '';
        } else {
            $inner_text = trim($item->innertext());
            $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
        }
    }

    // Collect tags: prefer the accordion links, fall back to the header tag items.
    $tags = array();
    $html_tags = $html->find('div.accordion a');
    if (!empty($html_tags)) {
        foreach ($html_tags as $item) {
            $tags[] = trim($item->innertext());
        }
    } else {
        foreach ($html->find('.article-header .tag-item') as $item) {
            $anchor = $item->find('a', 0);
            // A tag item without a link has no usable label.
            if (!empty($anchor)) {
                $tags[] = trim($anchor->innertext());
            }
        }
    }

    // Title with every whitespace run collapsed to a single space.
    $title_node = $html->find('title', 0);
    $title = empty($title_node) ? '' : trim($title_node->innertext());
    $whitespace_pattern = "/\\s+/ims";
    $title = preg_replace($whitespace_pattern, ' ', $title);

    // Both meta values stay null when the tag is absent.
    $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
    if (!empty($meta_keywords)) {
        $meta_keywords = trim($meta_keywords->content);
    }
    $meta_description = $html->find('meta[name="description"]', 0);
    if (!empty($meta_description)) {
        $meta_description = trim($meta_description->content);
    }

    // Serialise the cleaned body and remove the CMS marker comments.
    $content = str_replace('<!-- Begin: Wordpress Article Content -->', '', $content->outertext);
    $content = str_replace('<!-- End: Wordpress Article Content -->', '', $content);

    $values = array(
        'parent_id' => $row['parent_id'],
        'cate_id' => $row['cate_id'],
        'title' => str_replace(' | TechCrunch', '', $title),
        'thumbnail' => $thumbnail,
        'tags' => json_encode($tags),
        'content' => $content,
        'source_id' => $this->_source_id,
        'source_url' => $url,
        'short_text' => $row['short_text'],
        'meta_keywords' => $meta_keywords,
        'meta_description' => $meta_description,
        'gallery' => json_encode($gallery),
        'created' => time(),
    );
    echo "<pre>" . print_r($row, true) . "</pre>";
    yii_insert_row('archive', $values, 'db_crawler');
    $this->crawlerSuccess($row);
}
/**
 * Crawls the next pending link of this source and archives the article.
 *
 * Reads the oldest tbl_link row with status = 0 for $this->_source_id,
 * downloads the page, extracts thumbnail, the .article-body content (with
 * links neutralised), title (without the " | Windows Central" suffix) and
 * meta tags, inserts the record into `archive` and flags the link as done
 * via crawlerSuccess().
 *
 * The request is terminated with die() when no pending link is left, or
 * after flagging the link with status 2 when the page cannot be parsed or
 * has no article body.
 *
 * @return void
 */
public function actionDetail()
{
    $query = "SELECT * FROM tbl_link WHERE source_id = " . $this->_source_id . " AND status = 0 ORDER BY id LIMIT 1";
    $row = $this->db_crawler->createCommand($query)->queryRow();
    if (empty($row)) {
        die('het roi');
    }
    echo "<pre>" . print_r($row, true) . "</pre>";

    $url = $row['url'];
    $response = fectchContent($url);
    $html = str_get_html($response);
    if (empty($html)) {
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Thumbnail: find(..., 0) returns null when the tag is absent.
    $thumbnail_node = $html->find('meta[property="og:image"]', 0);
    $thumbnail = empty($thumbnail_node) ? '' : trim($thumbnail_node->content);

    $content = $html->find('.article-body', 0);
    if (empty($content)) {
        // Without an article body the find('a') call below would be a fatal
        // error and the same row would be retried on every run.
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Gallery and tag extraction are not implemented for this source;
    // both are stored as empty lists.
    $list_images = array();
    $tags = array();

    // Neutralise links: drop empty/anchor-only links, turn the rest into bold text.
    foreach ($content->find('a') as $item) {
        $href = trim($item->href);
        if (empty($href) || strpos($href, '#') === 0) {
            $item->outertext = '';
        } else {
            $inner_text = trim($item->innertext());
            $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
        }
    }

    $title_node = $html->find('title', 0);
    $title = empty($title_node) ? '' : trim($title_node->innertext());

    // Both meta values stay null when the tag is absent.
    $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
    if (!empty($meta_keywords)) {
        $meta_keywords = trim($meta_keywords->content);
    }
    $meta_description = $html->find('meta[name="description"]', 0);
    if (!empty($meta_description)) {
        $meta_description = trim($meta_description->content);
    }

    $values = array(
        'parent_id' => $row['parent_id'],
        'cate_id' => $row['cate_id'],
        'title' => str_replace(' | Windows Central', '', $title),
        'thumbnail' => $thumbnail,
        'gallery' => json_encode($list_images),
        'tags' => json_encode($tags),
        'content' => $content->outertext,
        'source_id' => $this->_source_id,
        'source_url' => $url,
        'short_text' => $row['short_text'],
        'meta_keywords' => $meta_keywords,
        'meta_description' => $meta_description,
        'created' => time(),
    );
    yii_insert_row('archive', $values, 'db_crawler');
    $this->crawlerSuccess($row);
}
/**
 * Crawls the next pending link of this source and archives the article.
 *
 * Reads the oldest tbl_link row with status = 0 for $this->_source_id,
 * downloads the page, extracts thumbnail, the .media-gallery images, the
 * cleaned .article-body content, title (without the " | Android Central"
 * suffix) and meta tags, inserts the record into `archive` and flags the
 * link as done via crawlerSuccess().
 *
 * The request is terminated with die() when no pending link is left, or
 * after flagging the link with status 2 when the page cannot be parsed or
 * has no article body.
 *
 * @return void
 */
public function actionDetail()
{
    $query = "SELECT * FROM tbl_link WHERE source_id = " . $this->_source_id . " AND status = 0 ORDER BY id LIMIT 1";
    $row = $this->db_crawler->createCommand($query)->queryRow();
    if (empty($row)) {
        die('het roi');
    }
    echo "<pre>" . print_r($row, true) . "</pre>";

    $url = $row['url'];
    $response = fectchContent($url);
    $html = str_get_html($response);
    if (empty($html)) {
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Thumbnail: find(..., 0) returns null when the tag is absent.
    $thumbnail_node = $html->find('meta[property="og:image"]', 0);
    $thumbnail = empty($thumbnail_node) ? '' : trim($thumbnail_node->content);

    $content = $html->find('.article-body', 0);
    if (empty($content)) {
        echo 'Error';
        $this->crawlerSuccess($row, 2);
        die;
    }

    // Collect gallery images, if the article embeds a media gallery.
    $list_images = array();
    $media = $content->find('.media-gallery', 0);
    if (!empty($media)) {
        $attr = 'data-big';
        $list_images['title'] = '';
        foreach ($media->find('img') as $img) {
            $path = trim($img->{$attr});
            // An image without the attribute would otherwise be stored as the
            // bare domain, which is not a usable image URL.
            if ($path !== '') {
                $list_images['images'][] = $this->_domain . $path;
            }
        }
    }

    // Strip scripts, styles, the gallery markup and device boxes from the body.
    $remove_elements = array('script', 'noscript', '.media-gallery', '.devicebox', 'style');
    foreach ($content->find(implode(', ', $remove_elements)) as $item) {
        $item->outertext = '';
    }

    // Neutralise links: drop empty/anchor-only links, turn the rest into bold text.
    foreach ($content->find('a') as $item) {
        $href = trim($item->href);
        if (empty($href) || strpos($href, '#') === 0) {
            $item->outertext = '';
        } else {
            $inner_text = trim($item->innertext());
            $item->outertext = '<strong class="txt-bold">' . $inner_text . '</strong>';
        }
    }

    // Tags are not extracted for this source; stored as an empty list.
    $tags = array();

    $title_node = $html->find('title', 0);
    $title = empty($title_node) ? '' : trim($title_node->innertext());

    // Both meta values stay null when the tag is absent.
    $meta_keywords = $html->find('meta[itemprop="keywords"]', 0);
    if (!empty($meta_keywords)) {
        $meta_keywords = trim($meta_keywords->content);
    }
    $meta_description = $html->find('meta[name="description"]', 0);
    if (!empty($meta_description)) {
        $meta_description = trim($meta_description->content);
    }

    $values = array(
        'parent_id' => $row['parent_id'],
        'cate_id' => $row['cate_id'],
        'title' => str_replace(' | Android Central', '', $title),
        'thumbnail' => $thumbnail,
        'gallery' => json_encode($list_images),
        'tags' => json_encode($tags),
        'content' => $content->outertext,
        'source_id' => $this->_source_id,
        'source_url' => $url,
        'short_text' => $row['short_text'],
        'meta_keywords' => $meta_keywords,
        'meta_description' => $meta_description,
        'created' => time(),
    );
    yii_insert_row('archive', $values, 'db_crawler');
    $this->crawlerSuccess($row);
}