예제 #1
  /**
   * Creates a new PendingNews model.
   *
   * Populates a fresh model from the POST body and tries to save it.
   * If creation is successful, the browser will be redirected to the 'view' page
   * of the new record; otherwise the 'create' view is rendered with the model.
   *
   * @return mixed the result of redirect() on success, of render() otherwise
   */
  public function actionCreate()
  {
      $model = new PendingNews();

      // save() is only attempted when load() accepted the submitted data.
      if ($model->load(Yii::$app->request->post()) && $model->save()) {
          return $this->redirect(['view', 'id' => $model->id]);
      }

      return $this->render('create', ['model' => $model]);
  }
예제 #2
 /**
  * Loads the page at $this->url, extracts its content and stores it as PendingNews.
  *
  * Steps, in order:
  *  1. Load the HTML and convert it to UTF-8, using the charset declared in a <meta> tag,
  *     else the source's `default_encoding` setting, else mbstring's internal encoding.
  *  2. Strip <script> elements, take the content from Readabillity::getContent() and pass it
  *     through the stop-word, URL-fixing and exclude-element helpers.
  *  3. Reject the post when a publish date is detected and it is not today, when the plain
  *     text is empty, or when it has fewer space-separated words than the `news_min_length`
  *     setting.
  *  4. Update $this->pendingNews when it is set (and post a "compile" message), otherwise
  *     create a new PendingNews row.
  *
  * The outcome is written to $this->parserQueue->status (STATUS_DONE / STATUS_FAIL).
  * This method does not save the parser queue model itself.
  *
  * @return bool|null true when the news item was saved, null in every other case
  */
 public function run()
 {
     echo "Try parse `{$this->url}`\n";

     $html = PageLoaderComponent::load($this->url);
     if (!$html) {
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
         return null;
     }

     // The charset value may be double-quoted, single-quoted or bare, and the search is
     // confined to the <meta ...> tag itself so it cannot run into unrelated markup.
     if (preg_match('/<meta[^>]*?charset\s*=\s*["\']?([^"\'\s\/>]*)/i', $html, $matches)) {
         if ($matches[1] !== '') {
             $html = mb_convert_encoding($html, "UTF-8", $matches[1]);
         } else {
             echo "ERROR ON ENCODING DETECTING";
         }
     } else {
         $defaultEncoding = SourcesSettings::findOne(['source_id' => $this->source->id, 'name' => 'default_encoding']);
         if ($defaultEncoding) {
             $html = mb_convert_encoding($html, "UTF-8", $defaultEncoding->value);
         } else {
             $html = mb_convert_encoding($html, "UTF-8");
         }
     }

     try {
         $html = $this->stripTagWithContent($html, "script");

         $readability = new Readabillity($this->url);

         // NOTE(review): the title is a hard-coded placeholder in the original code;
         // it is only used when a new PendingNews row is created below.
         $title = $this->processTitleStopWords("test title");

         $content = $readability->getContent();
         $content = $this->processContentStopWords($content);
         $content = preg_replace('/\\n/', ' ', $content);
         $content = $this->fixUrls($content);
         $content = $this->processExcludeElements($content);

         // Only posts dated today are accepted; a post without a detectable date passes.
         $date = $this->processPublishDate($content);
         if ($date && date("Y-m-d") !== date("Y-m-d", $date)) {
             throw new Exception("Old post");
         }

         // No plain text left after stripping tags means nothing usable was extracted.
         // Throwing routes this case to the STATUS_FAIL handler below.
         $searchContent = trim(strip_tags($content));
         if (!$searchContent) {
             throw new Exception('Looks like we couldn\'t find the content. :(');
         }
         $searchContent = preg_replace('/\\n/', ' ', $searchContent);

         // A missing settings row means no minimum is enforced (no null dereference).
         $minLengthSetting = Settings::findOne(['name' => 'news_min_length']);
         $minLength = $minLengthSetting ? (int) $minLengthSetting->value : 0;
         if (count(explode(" ", $searchContent)) < $minLength) {
             $this->parserQueue->status = ParserQueue::STATUS_FAIL;
             return null;
         }

         if ($this->pendingNews) {
             $this->pendingNews->content = $content;
             $this->pendingNews->search_content = $searchContent;
             $this->pendingNews->status = PendingNews::STATUS_NEW;

             // Keep an existing thumbnail; only try to detect one when none is set.
             if (!$this->pendingNews->thumb_src) {
                 $thumbUrl = $this->detectThumb($html, $content);
                 if ($thumbUrl) {
                     $this->pendingNews->thumb_src = $thumbUrl;
                 }
             }

             if (!$this->pendingNews->save()) {
                 $this->parserQueue->status = ParserQueue::STATUS_FAIL;
                 return null;
             }

             try {
                 PendingNews::fillTags($this->pendingNews->search_content, $this->pendingNews->id);
             } catch (\Exception $e) {
                 // Tagging is best-effort: a failure here does not fail the news item.
             }

             $mq = new RabbitMQComponent();
             $mq->postMessage("compile", "compile", json_encode(["pn_id" => $this->pendingNews->id]));
             $this->parserQueue->status = ParserQueue::STATUS_DONE;

             return true;
         }

         echo PHP_EOL . "NEWS CREATION" . PHP_EOL;
         $pn = new PendingNews();
         $pn->source_id = $this->source->id;
         $pn->title = $title;
         $pn->content = $content;
         $pn->search_content = $searchContent;
         $pn->status = PendingNews::STATUS_NEW;
         // microtime() rather than time(), matching add(): two items created within the
         // same second must not end up with the same hash.
         $pn->group_hash = md5(microtime());
         $pn->thumb_src = $this->detectThumb($html, $content);
         $pn->pq_id = $this->parserQueue->id;
         $pn->created_at = new \yii\db\Expression("NOW()");

         if ($pn->save()) {
             $this->parserQueue->status = ParserQueue::STATUS_DONE;
             return true;
         }

         echo PHP_EOL . "ERROR" . PHP_EOL;
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
     } catch (Exception $e) {
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
     }

     return null;
 }
예제 #3
 /**
  * Creates a PendingNews record from already extracted content.
  *
  * The searchable text is derived from $content by stripping tags, turning every run of
  * whitespace into a single space and removing every character that is not a Cyrillic or
  * Latin letter or a space. Nothing is created when that text is empty before filtering or
  * has fewer space-separated words than the `news_min_length` setting; in those two cases
  * $parser_queue is left untouched.
  *
  * When $parser_queue is given, its status is set to STATUS_DONE or STATUS_FAIL depending on
  * whether the record could be saved. The queue model itself is not saved here.
  *
  * @param Sources          $source       source whose id is stored as source_id
  * @param mixed            $title        stored as the record title
  * @param string           $content      content, tags allowed; stored unchanged
  * @param mixed            $image_src    stored as thumb_src
  * @param mixed            $status       stored as the record status
  * @param ParserQueue|null $parser_queue optional queue item linked through pq_id
  * @param array            $data         JSON-encoded into additonal_data when not empty
  * @return void
  */
 public static function add(Sources $source, $title, $content, $image_src, $status = PendingNews::STATUS_NEW, ParserQueue $parser_queue = null, $data = [])
 {
     $searchContent = trim(strip_tags($content));
     if (!$searchContent) {
         return;
     }

     // 1) Newlines/tabs become spaces so that adjacent words are not glued together.
     // 2) "ё" is listed explicitly: it lies outside the а-я range and would be deleted.
     $searchContent = preg_replace(['/\s+/u', "/[^а-яёa-z ]/ui"], [' ', ''], $searchContent);
     if ($searchContent === null) {
         // preg_replace() reports failure (e.g. malformed UTF-8 under /u) with null.
         if ($parser_queue) {
             $parser_queue->status = ParserQueue::STATUS_FAIL;
         }
         return;
     }

     // A missing settings row means no minimum is enforced (no null dereference).
     $minLengthSetting = Settings::findOne(['name' => 'news_min_length']);
     $minLength = $minLengthSetting ? (int) $minLengthSetting->value : 0;
     if (count(explode(" ", $searchContent)) < $minLength) {
         return;
     }

     $pn = new PendingNews();
     $pn->source_id = $source->id;
     $pn->title = $title;
     $pn->content = $content;
     $pn->search_content = $searchContent;
     $pn->status = $status;
     $pn->group_hash = md5(microtime());
     $pn->thumb_src = $image_src;
     if ($parser_queue) {
         $pn->pq_id = $parser_queue->id;
     }
     if (!empty($data)) {
         // NOTE(review): "additonal_data" / "update_at" are kept exactly as the original
         // spells them; presumably they match the real column names.
         $pn->additonal_data = json_encode($data);
     }
     $pn->created_at = new \yii\db\Expression("NOW()");
     $pn->update_at = new \yii\db\Expression("NOW()");

     $saved = $pn->save();
     if ($parser_queue) {
         $parser_queue->status = $saved ? ParserQueue::STATUS_DONE : ParserQueue::STATUS_FAIL;
     }
 }
예제 #4
 /**
  * Turns the items of a feed into ParserQueue + PendingNews rows and queues them for parsing.
  *
  * For every `rss_news_item_pattern` setting of the source, the pattern is run through
  * $xpath->query(). For each resulting node the child elements are mapped onto title,
  * category, link and image according to the `rss_title`, `rss_category`, `rss_link` and
  * `rss_image` settings (element names). Items without a link are skipped. For the others a
  * ParserQueue row and a placeholder PendingNews row (STATUS_SUSPENDED) are saved and a
  * "parse_rss" message carrying both ids is posted through $this->mq.
  *
  * @param mixed $xpath object whose query() returns the item nodes; presumably a
  *                     \DOMXPath -- confirm against the caller
  * @return void
  */
 private function processCombine($xpath)
 {
     $sourceId = $this->source->source_id;

     // Returns the value of a single per-source setting, or null when it is not configured.
     $readSetting = function ($name) use ($sourceId) {
         $row = SourcesSettings::findOne(['source_id' => $sourceId, 'name' => $name]);
         return $row ? $row->value : null;
     };

     $itemPatterns = SourcesSettings::findAll(['source_id' => $sourceId, 'name' => 'rss_news_item_pattern']);
     $titlePattern = $readSetting('rss_title');
     $contentPattern = $readSetting('rss_content'); // read as in the original; not used below
     $linkPattern = $readSetting('rss_link');
     $imagePattern = $readSetting('rss_image');
     $categoryPattern = $readSetting('rss_category');

     // First http(s) URL ending in .png/.jpg/.jpeg. The "u" modifier is needed because the
     // character class contains a Cyrillic range.
     $imageUrlRegex = '/(https?:\\/\\/[a-z0-9\\/_а-яё\\-\\.]*\\.(?:png|jpe?g))/iu';

     foreach ($itemPatterns as $pattern) {
         $newsList = $xpath->query($pattern->value);
         if (!$newsList) {
             continue;
         }

         foreach ($newsList as $news) {
             $newsParams = ['data' => []];
             if ($this->source->category_id) {
                 $newsParams['data']['category_id'] = $this->source->category_id;
             }

             foreach ($news->childNodes as $node) {
                 $nodeName = $node->nodeName;

                 if ($titlePattern === $nodeName) {
                     $newsParams['title'] = $node->nodeValue;
                 }

                 // Without a configured category element, fall back to <category>.
                 $isCategory = $categoryPattern
                     ? $categoryPattern === $nodeName
                     : "category" === strtolower($nodeName);
                 if ($isCategory) {
                     $newsParams['data']['category'] = $node->nodeValue;
                 }

                 if ($linkPattern === $nodeName) {
                     // The link is stored with the source's base URL removed.
                     $newsParams['link'] = str_replace($this->source->source->url, '', $node->nodeValue);
                 }

                 if ($imagePattern === $nodeName && preg_match($imageUrlRegex, $node->nodeValue, $images)) {
                     $newsParams['image_src'] = $images[1];
                 }

                 // <enclosure url="..."> is used only while no image has been found yet.
                 if (!isset($newsParams['image_src'])
                     && $nodeName === 'enclosure'
                     && preg_match($imageUrlRegex, $node->getAttribute('url'), $images)
                 ) {
                     $newsParams['image_src'] = $images[1];
                 }
             }

             if (!isset($newsParams['link'])) {
                 continue;
             }

             try {
                 $pqItem = new ParserQueue();
                 $pqItem->source_id = $sourceId;
                 $pqItem->url = $newsParams['link'];
                 $pqItem->status = ParserQueue::STATUS_INPROCESS;
                 $pqItem->created_at = new \yii\db\Expression('NOW()');
                 $pqItem->updated_at = new \yii\db\Expression('NOW()');
                 if (!$pqItem->save()) {
                     continue;
                 }

                 // Placeholder record: the real content is filled in by the parser later.
                 $pn = new PendingNews();
                 $pn->content = '&nbsp;';
                 $pn->search_content = '&nbsp;';
                 $pn->source_id = $sourceId;
                 if (isset($newsParams['title'])) {
                     $pn->title = $newsParams['title'];
                 }
                 $pn->status = PendingNews::STATUS_SUSPENDED;
                 // microtime() rather than time(): this runs in a loop, and with time()
                 // every item created within the same second got the same hash.
                 $pn->group_hash = md5(microtime());
                 if (isset($newsParams['image_src'])) {
                     $pn->thumb_src = $newsParams['image_src'];
                 }
                 $pn->pq_id = $pqItem->id;
                 if (!empty($newsParams['data'])) {
                     $pn->additonal_data = json_encode($newsParams['data']);
                 }
                 $pn->created_at = new \yii\db\Expression("NOW()");
                 $pn->update_at = new \yii\db\Expression("NOW()");

                 // NOTE(review): the status set below is not saved in this method, so the
                 // stored row keeps STATUS_INPROCESS -- confirm whether that is intended
                 // before adding a save() here.
                 if ($pn->save()) {
                     $pqItem->status = ParserQueue::STATUS_DONE;
                     $this->mq->postMessage("parse", "parse_rss", json_encode(["pn_id" => $pn->id, "pq_id" => $pqItem->id]));
                 } else {
                     $pqItem->status = ParserQueue::STATUS_FAIL;
                 }
             } catch (\yii\db\Exception $e) {
                 // Database errors for a single item are ignored so that the remaining
                 // items are still processed.
             }
         }
     }
 }