Пример #1
0
 /**
  * Binds the component to a queue entry and marks that entry as being processed.
  *
  * The queue row is saved with the in-process status before any other state is
  * initialised on this object.
  *
  * @param ParserQueue      $pq queue entry to work on; its status is overwritten and saved here
  * @param PendingNews|null $pn optional pending-news record; stored only when one is given
  */
 public function __construct(ParserQueue $pq, PendingNews $pn = null)
 {
     // Persist the status change first, ahead of any local initialisation.
     $pq->status = ParserQueue::STATUS_INPROCESS;
     $pq->save();

     $this->source = $pq->source;
     $this->url = $this->prepareUrl($pq->url);
     $this->parserQueue = $pq;

     // The type hint leaves only two possibilities: a PendingNews instance or null.
     if ($pn instanceof PendingNews) {
         $this->pendingNews = $pn;
     }
 }
Пример #2
0
 /**
  * Smoke-check action: tries to insert one hard-coded ParserQueue row and prints the outcome.
  *
  * On success the new row id is echoed; on failure a marker is echoed and the
  * model errors are dumped.
  */
 public function actionCheck()
 {
     $queueItem = new ParserQueue();
     $queueItem->source_id = 1;
     $queueItem->url = 'http://test.loc';
     $queueItem->status = 'new';
     // Both timestamps are assigned the NOW() expression object rather than a PHP-side date.
     $queueItem->created_at = new Expression("NOW()");
     $queueItem->updated_at = new Expression("NOW()");

     if (!$queueItem->insert()) {
         echo "\nFAILED\n";
         var_dump($queueItem->getErrors());

         return;
     }

     echo "\nSAVED with id {$queueItem->id}\n";
 }
Пример #3
0
 /**
  * Manual parser test page: runs the news parser against a posted URL and renders the result.
  *
  * Reads `url` and `Source` from the POST body. When `url` is present the page is
  * loaded and parsed, and the outcome is handed to the view as `parserResult`
  * together with the rest of the posted fields.
  *
  * @return mixed whatever render() produces for the 'test' view
  */
 public function actionTest()
 {
     // Fetch the POST body once; an empty body renders the view without parameters.
     $data = Yii::$app->request->post();
     if (!$data) {
         $data = [];
     }

     if (isset($data['url'])) {
         // Only a scalar id may reach findOne(): an array coming from user input would be
         // interpreted as a column => value condition. A missing key is looked up as null,
         // which is what the unguarded read produced, minus the undefined-index notice.
         $sourceId = (isset($data['Source']) && is_scalar($data['Source'])) ? $data['Source'] : null;

         // NOTE(review): queue entry #1 is hard-coded; findOne() yields null when the row is
         // absent, which the parser constructor presumably does not accept - confirm the fixture exists.
         $parser = new NewsParserComponent(ParserQueue::findOne(1));
         $data['parserResult'] = $parser->parse(
             PageLoaderComponent::load($data['url']),
             $data['url'],
             Sources::findOne($sourceId)
         );
     }

     return $this->render('test', $data);
 }
Пример #4
0
 /**
  * Queue consumer callback: parses the news item referenced by one message, then acknowledges it.
  *
  * The body is decoded as JSON and is expected to carry `pq_id` and `pn_id`. When both
  * records are found, a NewsParserComponent is built for them and run. If an exception is
  * caught and its code is not 505, the original body is posted again through
  * RabbitMQComponent with the "parse" and "parse_rss" arguments. The message is
  * acknowledged once the try/catch has completed.
  *
  * @param object $msg message exposing `body` and a `delivery_info` array holding
  *                    `channel` and `delivery_tag`
  */
 public static function processMessage($msg)
 {
     try {
         $params = json_decode($msg->body);
         print_r($params);

         // A body that did not decode to an object carrying both ids cannot be resolved to
         // records; skip the lookups instead of reading properties of null or a scalar.
         if (isset($params->pq_id, $params->pn_id)) {
             $pqItem = ParserQueue::findOne(["id" => $params->pq_id]);
             $pnItem = PendingNews::findOne(["id" => $params->pn_id]);
             if ($pqItem && $pnItem) {
                 $newsParser = new NewsParserComponent($pqItem, $pnItem);
                 $newsParser->run();
             }
         }
     } catch (Exception $e) {
         // NOTE(review): `Exception` is unqualified; in a namespaced file it only matches
         // if the class is imported - confirm against the file header.
         echo $e->getMessage();
         // Code 505 is the one case that is not retried; everything else is queued again.
         if ($e->getCode() != 505) {
             $mq = new RabbitMQComponent();
             $mq->postMessage("parse", "parse_rss", $msg->body);
         }
     }
     $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
 }
Пример #5
0
 /**
  * Stores a pending-news record when the content is long enough, and settles the queue entry.
  *
  * The content is stripped of tags and reduced to Cyrillic/Latin letters and spaces to
  * form the search text. The record is saved only when that text holds at least as many
  * words as the `news_min_length` setting. When a queue entry is supplied, its status
  * becomes STATUS_DONE or STATUS_FAIL according to the save result.
  *
  * NOTE(review): content that is empty or too short returns without touching the queue
  * entry, so it keeps whatever status the caller gave it - confirm that is intended.
  *
  * @param Sources          $source       source whose id is stored on the record
  * @param string           $title        news title
  * @param string           $content      raw content; tags are stripped only for the search text
  * @param string|false     $image_src    value stored as the thumbnail source
  * @param mixed            $status       initial status, PendingNews::STATUS_NEW by default
  * @param ParserQueue|null $parser_queue queue entry to link and settle, if any
  * @param array            $data         extra data, stored JSON-encoded when not empty
  */
 public static function add(Sources $source, $title, $content, $image_src, $status = PendingNews::STATUS_NEW, ParserQueue $parser_queue = null, $data = [])
 {
     $plainText = trim(strip_tags($content));
     if (!$plainText) {
         return;
     }

     // preg_replace() yields null on a PCRE failure (e.g. invalid UTF-8 under /u);
     // the cast turns that into an empty search text instead of a null.
     $searchContent = (string) preg_replace("/[^а-яa-z ]/ui", "", $plainText);

     // Count real words only. Removed characters leave consecutive spaces behind, and
     // explode() reports every gap between them as an extra (empty) word.
     $wordCount = count(array_filter(explode(" ", $searchContent), 'strlen'));

     // A missing setting means "no minimum" rather than a property read on null.
     $minLengthSetting = Settings::findOne(['name' => 'news_min_length']);
     $minWords = $minLengthSetting ? (int) $minLengthSetting->value : 0;
     if ($wordCount < $minWords) {
         return;
     }

     $pn = new PendingNews();
     $pn->source_id = $source->id;
     $pn->title = $title;
     $pn->content = $content;
     $pn->search_content = $searchContent;
     $pn->status = $status;
     $pn->group_hash = md5(microtime());
     $pn->thumb_src = $image_src;
     if ($parser_queue) {
         $pn->pq_id = $parser_queue->id;
     }
     if (!empty($data)) {
         $pn->additonal_data = json_encode($data);
     }
     $pn->created_at = new \yii\db\Expression("NOW()");
     $pn->update_at = new \yii\db\Expression("NOW()");

     $saved = $pn->save();

     // Reflect the outcome on the queue entry, when there is one.
     if ($parser_queue) {
         $parser_queue->status = $saved ? ParserQueue::STATUS_DONE : ParserQueue::STATUS_FAIL;
         $parser_queue->save();
     }
 }
Пример #6
0
 /**
  * Downloads the source's RSS feed and queues every item whose link is not known yet.
  *
  * Each XPath expression in $this->itemPatterns selects item nodes. For every item the
  * child nodes are matched by name against the title/content/link/category/image
  * patterns held on this object. An item whose link has no ParserQueue row gets a new
  * row, and on a successful save its data is passed on to PendingNews::add().
  */
 public function run()
 {
     $rss = trim(PageLoaderComponent::load($this->source->url));
     // Drop the attributes of the root <rss> tag.
     $rss = preg_replace("/<rss([^>]+)>/mi", "<rss>", $rss);

     // Nothing was downloaded (or the replace failed): there is nothing to parse, and
     // loadXML() rejects an empty string.
     if (!is_string($rss) || $rss === '') {
         return;
     }

     // Take the declared encoding from the XML prolog. The value is matched up to its own
     // closing quote; a greedy ".*" ran on to the last quote of the line.
     $charset = "utf-8";
     if (preg_match('/<\\?xml.*?encoding=(\'|")([^\'"]*)\\1/i', $rss, $matches) && $matches[2] !== '') {
         $charset = $matches[2];
     }

     $doc = new \DOMDocument("1.1", $charset);
     $doc->preserveWhiteSpace = false;
     libxml_use_internal_errors(true);
     $doc->loadXML($rss);
     // Parse errors are buffered instead of printed; empty the buffer so it cannot grow across runs.
     libxml_clear_errors();
     $xpath = new \DOMXpath($doc);

     // First http(s) URL ending in .png or .jpg.
     $imageRegex = '/(https?:\\/\\/[a-z0-9\\/_а-я\\-\\.]*\\.(?:png|jpg))/i';

     foreach ($this->itemPatterns as $pattern) {
         $newsList = $xpath->query($pattern->value);
         if (!$newsList) {
             continue;
         }

         foreach ($newsList as $news) {
             $newsParams = ['data' => []];
             if ($this->source->category_id) {
                 $newsParams['data']['category_id'] = $this->source->category_id;
             }
             $newsParams['source'] = $this->source->source;

             foreach ($news->childNodes as $node) {
                 $nodeName = $node->nodeName;

                 if ($this->titlePattern == $nodeName) {
                     $newsParams['title'] = $node->nodeValue;
                 }
                 if ($this->contentPattern == $nodeName) {
                     $newsParams['content'] = $node->nodeValue;
                 }
                 if ($this->linkPattern == $nodeName) {
                     $newsParams['link'] = $node->nodeValue;
                 }
                 if ($this->categoryPattern == $nodeName) {
                     $newsParams['data']['category'] = $node->nodeValue;
                 }
                 if ($this->imagePattern == $nodeName && preg_match($imageRegex, $node->nodeValue, $images)) {
                     $newsParams['image_src'] = $images[1];
                 }
                 // Fall back to the <enclosure> url while no image has been found.
                 if (!isset($newsParams['image_src'])
                     && $nodeName == 'enclosure'
                     && preg_match($imageRegex, $node->getAttribute('url'), $images)
                 ) {
                     $newsParams['image_src'] = $images[1];
                 }
             }

             // Apply the default only after every child was inspected. Setting it inside the
             // loop filled the key after the first child and disabled the <enclosure> fallback.
             if (!isset($newsParams['image_src'])) {
                 $newsParams['image_src'] = "";
             }

             // Without a link there is nothing to look up or queue.
             if (!isset($newsParams['link'])) {
                 continue;
             }

             // Missing fields are passed on as empty values rather than read as undefined indexes.
             $title = isset($newsParams['title']) ? $newsParams['title'] : null;
             $content = isset($newsParams['content']) ? $newsParams['content'] : '';

             try {
                 if (ParserQueue::findOne(['url' => $newsParams['link']])) {
                     continue;
                 }

                 $pqItem = new ParserQueue();
                 $pqItem->source_id = $this->source->source_id;
                 $pqItem->url = $newsParams['link'];
                 $pqItem->status = ParserQueue::STATUS_INPROCESS;
                 $pqItem->created_at = new \yii\db\Expression('NOW()');
                 $pqItem->updated_at = new \yii\db\Expression('NOW()');
                 if ($pqItem->save()) {
                     PendingNews::add($newsParams['source'], $title, $content, $newsParams['image_src'], PendingNews::STATUS_NEW, $pqItem, $newsParams['data']);
                 }
             } catch (Exception $e) {
                 // Best effort: a failing item must not stop the remaining items.
                 // NOTE(review): `Exception` is unqualified; in a namespaced file it only
                 // matches if the class is imported - confirm against the file header.
             }
         }
     }
 }
Пример #7
0
 /**
  * Relation: the ParserQueue rows whose `source_id` equals this record's `id`.
  *
  * @return \yii\db\ActiveQuery
  */
 public function getParserQueues()
 {
     $relatedClass = ParserQueue::className();
     // Foreign key on the related rows => key on this record.
     $link = ['source_id' => 'id'];

     return $this->hasMany($relatedClass, $link);
 }
Пример #8
0
 /**
  * Queues every feed item found through the source's item patterns as a suspended news stub.
  *
  * Item nodes are selected with each `rss_news_item_pattern` setting of the source. For
  * an item that has a link, a ParserQueue row is created; when it saves, a PendingNews
  * placeholder (status STATUS_SUSPENDED, `&nbsp;` content) is stored, the queue row is
  * marked done or failed according to that save, and on success a message carrying both
  * ids is posted through $this->mq.
  *
  * @param \DOMXPath $xpath query object for the loaded feed (presumably a \DOMXPath; only
  *                         query() is called on it)
  */
 private function processCombine($xpath)
 {
     $sourceId = $this->source->source_id;
     $itemPatterns = SourcesSettings::findAll(['source_id' => $sourceId, 'name' => 'rss_news_item_pattern']);

     // Resolve each optional per-source setting to its value, or null when it is not configured.
     // (The rss_content setting was looked up before but never used here, so it is not fetched.)
     $nodeNames = [];
     $settingNames = ['title' => 'rss_title', 'link' => 'rss_link', 'image' => 'rss_image', 'category' => 'rss_category'];
     foreach ($settingNames as $key => $settingName) {
         $setting = SourcesSettings::findOne(['source_id' => $sourceId, 'name' => $settingName]);
         $nodeNames[$key] = $setting ? $setting->value : null;
     }

     // First http(s) URL ending in .png or .jpg.
     $imageRegex = '/(https?:\\/\\/[a-z0-9\\/_а-я\\-\\.]*\\.(?:png|jpg))/i';

     foreach ($itemPatterns as $pattern) {
         $newsList = $xpath->query($pattern->value);
         if (!$newsList) {
             continue;
         }

         foreach ($newsList as $news) {
             $newsParams = ['data' => []];
             if ($this->source->category_id) {
                 $newsParams['data']['category_id'] = $this->source->category_id;
             }
             $newsParams['source'] = $this->source->source;

             foreach ($news->childNodes as $node) {
                 $nodeName = $node->nodeName;

                 if ($nodeNames['title'] == $nodeName) {
                     $newsParams['title'] = $node->nodeValue;
                 }

                 // Use the configured category node, or any node named "category" when none is configured.
                 if ($nodeNames['category']) {
                     $isCategory = ($nodeNames['category'] == $nodeName);
                 } else {
                     $isCategory = ("category" == strtolower($nodeName));
                 }
                 if ($isCategory) {
                     $newsParams['data']['category'] = $node->nodeValue;
                 }

                 if ($nodeNames['link'] == $nodeName) {
                     // The source's base URL is removed from the link before it is stored.
                     $newsParams['link'] = str_replace($this->source->source->url, '', $node->nodeValue);
                 }

                 if ($nodeNames['image'] == $nodeName && preg_match($imageRegex, $node->nodeValue, $images)) {
                     $newsParams['image_src'] = $images[1];
                 }
                 // Fall back to the <enclosure> url while no image has been found.
                 if (!isset($newsParams['image_src'])
                     && $nodeName == 'enclosure'
                     && preg_match($imageRegex, $node->getAttribute('url'), $images)
                 ) {
                     $newsParams['image_src'] = $images[1];
                 }
             }

             // Items without a link cannot be queued.
             if (!array_key_exists("link", $newsParams)) {
                 continue;
             }

             try {
                 $pqItem = new ParserQueue();
                 $pqItem->source_id = $sourceId;
                 $pqItem->url = $newsParams['link'];
                 $pqItem->status = ParserQueue::STATUS_INPROCESS;
                 $pqItem->created_at = new \yii\db\Expression('NOW()');
                 $pqItem->updated_at = new \yii\db\Expression('NOW()');
                 if (!$pqItem->save()) {
                     continue;
                 }

                 $pn = new PendingNews();
                 $pn->content = '&nbsp;';
                 $pn->search_content = '&nbsp;';
                 $pn->source_id = $sourceId;
                 if (isset($newsParams['title'])) {
                     $pn->title = $newsParams['title'];
                 }
                 $pn->status = PendingNews::STATUS_SUSPENDED;
                 // microtime(), as in PendingNews::add(): time() has one-second resolution, so
                 // every item handled within the same second received an identical hash.
                 $pn->group_hash = md5(microtime());
                 if (isset($newsParams['image_src'])) {
                     $pn->thumb_src = $newsParams['image_src'];
                 }
                 $pn->pq_id = $pqItem->id;
                 if (!empty($newsParams['data'])) {
                     $pn->additonal_data = json_encode($newsParams['data']);
                 }
                 $pn->created_at = new \yii\db\Expression("NOW()");
                 $pn->update_at = new \yii\db\Expression("NOW()");

                 if ($pn->save()) {
                     $pqItem->status = ParserQueue::STATUS_DONE;
                     $pqItem->save();
                     $this->mq->postMessage("parse", "parse_rss", json_encode(["pn_id" => $pn->id, "pq_id" => $pqItem->id]));
                 } else {
                     $pqItem->status = ParserQueue::STATUS_FAIL;
                     $pqItem->save();
                 }
             } catch (\yii\db\Exception $e) {
                 // Best effort: a database error on one item must not stop the remaining items.
             }
         }
     }
 }