Esempio n. 1
0
 /**
  * Renders the parser test page and, when a URL and a source id were posted, runs the parser on them.
  *
  * The posted fields are passed to the view unchanged; on a successful parse the result is added
  * under the `parserResult` key.
  *
  * @return mixed whatever render() returns for the `test` view
  */
 public function actionTest()
 {
     $data = [];
     $posted = Yii::$app->request->post();
     if ($posted) {
         $data = $posted;
         // Both fields are required. `Source` used to be read without a check, which raised an
         // undefined-index notice when only `url` was posted.
         $hasUrl = isset($data['url']) && is_string($data['url']);
         // findOne() must not receive a user-controlled array (it would be treated as a
         // column => value condition), so only scalar ids are accepted.
         $hasSource = isset($data['Source']) && is_scalar($data['Source']);
         if ($hasUrl && $hasSource) {
             $parser = new NewsParserComponent(ParserQueue::findOne(1));
             $data['parserResult'] = $parser->parse(
                 PageLoaderComponent::load($data['url']),
                 $data['url'],
                 Sources::findOne($data['Source'])
             );
         }
     }
     return $this->render('test', $data);
 }
Esempio n. 2
0
 /**
  * Debug helper: downloads a fixed RSS feed, converts it to UTF-8 and prints the result.
  *
  * The source charset is taken from the `encoding` attribute of the XML declaration when present;
  * otherwise mb_convert_encoding() is called without an explicit source encoding.
  */
 public function actionConvert()
 {
     $url = 'http://fr.ill.in.ua/rss/ru/all.xml';
     echo PHP_EOL . "TRY LOAD {$url}" . PHP_EOL;
     $data = PageLoaderComponent::load($url);
     // Group 1 is the opening quote and \1 requires the same closing quote, so both
     // encoding="..." and encoding='...' are recognised. The lazy (.*?) stops at that quote
     // instead of running on to the last quote of the line.
     preg_match('/<\\?xml[^>]*?encoding=([\'"])(.*?)\\1/i', $data, $matches);
     print_r($matches);
     if (isset($matches[2]) && $matches[2] !== '') {
         $charset = $matches[2];
         $data = mb_convert_encoding($data, "UTF-8", $charset);
     } else {
         $data = mb_convert_encoding($data, "UTF-8");
     }
     print_r($data);
 }
 /**
  * Loads the source feed and hands it to the matching parser.
  *
  * Nothing happens when the loader returns a falsy value. Otherwise the payload is parsed as XML
  * (libxml errors are collected internally instead of being emitted) and passed to
  * processCombine() when the source is flagged `is_combine`, or to parseLinks() if it is not.
  */
 public function run()
 {
     $feed = PageLoaderComponent::load($this->source->url);
     if (!$feed) {
         return;
     }

     libxml_use_internal_errors(true);
     $document = new \DOMDocument();
     $document->preserveWhiteSpace = false;
     $document->loadXML($feed);

     $query = new \DOMXpath($document);
     if ($this->source->is_combine) {
         $this->processCombine($query);
         return;
     }
     $this->parseLinks($query);
 }
Esempio n. 4
0
 /**
  * Handles one image message: stores the referenced file as `origin` in the news upload
  * directory and renders every thumbnail size configured in `params['image_sizes']`.
  *
  * The message body is JSON carrying `news_id` and `src`. Failures are reported with echo and
  * never propagate to the caller as \Exception.
  *
  * @param object $msg queue message exposing `body` and `delivery_info` (channel, delivery_tag)
  */
 public static function processMessage($msg)
 {
     // The message is acknowledged before any work is done, exactly as before.
     $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
     $params = json_decode($msg->body);
     print_r($params);
     try {
         if (!isset($params->news_id, $params->src)) {
             throw new \Exception("Malformed message body", 400);
         }
         $news = News::findOne(['id' => $params->news_id]);
         if (!$news) {
             // Previously a missing row was dereferenced and produced a bogus date-based path.
             throw new \Exception("News not found", 404);
         }
         $createdTs = strtotime($news->created_at);
         $dirPath = Yii::getAlias('@frontend') . '/web/uploads/' . date("Y/m/d", $createdTs) . "/" . $params->news_id . "/";
         // The second is_dir() covers a concurrent worker creating the directory first.
         if (!is_dir($dirPath) && !mkdir($dirPath, 0777, true) && !is_dir($dirPath)) {
             throw new \Exception("Can't create upload directory", 500);
         }
         if ($tmpFile = PageLoaderComponent::loadFile($params->src)) {
             $originFile = $dirPath . "origin";
             $copied = copy($tmpFile, $originFile);
             unlink($tmpFile);
             if (!$copied) {
                 // Do not fall through to a stale `origin` left by an earlier run.
                 throw new \Exception("Can't store origin file", 500);
             }
             echo "Origin file: {$originFile}" . PHP_EOL;
             if ($handle = @fopen($originFile, 'r')) {
                 try {
                     if (file_exists($originFile) && @getimagesize($originFile)) {
                         $image = new ImageEditor();
                         $image->load($originFile);
                         foreach (Yii::$app->params['image_sizes'] as $title => $size) {
                             $image->softThumb($size['width'], $size['height'], $dirPath . $title . ".png");
                         }
                     } else {
                         throw new \Exception("Origin file isn't an image", 400);
                     }
                 } catch (\Exception $e) {
                     // Fully qualified so it matches the \Exception thrown above whatever
                     // `Exception` resolves to in this file; fclose() below is then always reached.
                     echo "Error: {$e->getMessage()}" . PHP_EOL;
                 }
                 fclose($handle);
             } else {
                 throw new \Exception("Can't open origin file", 400);
             }
         }
     } catch (\Exception $e) {
         echo $e->getMessage();
     }
 }
Esempio n. 5
0
 /**
  * Downloads the article at $this->url, extracts its readable content and stores it as pending news.
  *
  * Steps: load the page, convert it to UTF-8, strip <script> blocks, extract and clean the
  * content, reject posts whose detected publish date is not today, then either update the
  * existing pending news item or create a new one. The parser queue item is set to STATUS_DONE
  * on success and to STATUS_FAIL on every failure path.
  *
  * @return bool|null true once the pending news row has been saved; null on every other path
  */
 public function run()
 {
     echo "Try parse `{$this->url}`\n";
     $html = PageLoaderComponent::load($this->url);
     if (!$html) {
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
         $this->parserQueue->save();
         return null;
     }
     // Charset priority: <meta charset> in the page, then the per-source `default_encoding`
     // setting, then whatever mb_convert_encoding() assumes by default.
     preg_match('/<meta.*?charset=("?|\\")(.*?)("|\\")/i', $html, $matches);
     if (isset($matches[2])) {
         if ($charset = $matches[2]) {
             $html = mb_convert_encoding($html, "UTF-8", $charset);
         } else {
             echo "ERROR ON ENCODING DETECTING";
         }
     } else {
         if ($defaultEncoding = SourcesSettings::findOne(['source_id' => $this->source->id, 'name' => 'default_encoding'])) {
             $html = mb_convert_encoding($html, "UTF-8", $defaultEncoding->value);
         } else {
             $html = mb_convert_encoding($html, "UTF-8");
         }
     }
     try {
         $html = $this->stripTagWithContent($html, "script");
         // `new` always yields an object, so the former `if ($readability) ... else throw`
         // guard could never take its else branch and has been removed.
         $readability = new Readabillity($this->url);
         // NOTE(review): the title is a hard-coded placeholder and is only used when a new
         // PendingNews row is created below; this looks like a debugging leftover, please confirm.
         $title = "test title";
         $title = $this->processTitleStopWords($title);
         $content = $readability->getContent();
         $content = $this->processContentStopWords($content);
         $content = preg_replace('/\\n/', ' ', $content);
         $content = $this->fixUrls($content);
         $content = $this->processExcludeElements($content);
         if ($date = $this->processPublishDate($content)) {
             if (!(date("Y-m-d") == date("Y-m-d", $date))) {
                 throw new Exception("Old post");
             }
         }
         // Plain-text version of the content, used for the minimum word count and for search.
         $searchContent = trim(strip_tags($content));
         $isLongEnough = false;
         if ($searchContent) {
             $searchContent = preg_replace('/\\n/', ' ', $searchContent);
             $isLongEnough = count(explode(" ", $searchContent)) >= Settings::findOne(['name' => 'news_min_length'])->value;
         }
         if (!$isLongEnough) {
             // Also covers empty text, which used to return without any terminal status and
             // left the queue item unresolved.
             $this->parserQueue->status = ParserQueue::STATUS_FAIL;
             $this->parserQueue->save();
             return null;
         }
         if ($this->pendingNews) {
             $this->pendingNews->content = $content;
             $this->pendingNews->search_content = $searchContent;
             $this->pendingNews->status = PendingNews::STATUS_NEW;
             if (!$this->pendingNews->thumb_src) {
                 if ($thumbUrl = $this->detectThumb($html, $content)) {
                     $this->pendingNews->thumb_src = $thumbUrl;
                 }
             }
             if ($this->pendingNews->save()) {
                 // Tagging is best-effort: a failure is printed and does not fail the item.
                 try {
                     PendingNews::fillTags($this->pendingNews->search_content, $this->pendingNews->id);
                 } catch (\Exception $e) {
                     print_r($e->getMessage());
                 }
                 $mq = new RabbitMQComponent();
                 $mq->postMessage("compile", "compile", json_encode(["pn_id" => $this->pendingNews->id]));
                 $this->parserQueue->status = ParserQueue::STATUS_DONE;
                 $this->parserQueue->save();
                 return true;
             }
             print_r($this->pendingNews->getErrors());
         } else {
             echo PHP_EOL . "NEWS CREATION" . PHP_EOL;
             $pn = new PendingNews();
             $pn->source_id = $this->source->id;
             $pn->title = $title;
             $pn->content = $content;
             $pn->search_content = $searchContent;
             $pn->status = PendingNews::STATUS_NEW;
             $pn->group_hash = md5(time());
             $pn->thumb_src = $this->detectThumb($html, $content);
             $pn->pq_id = $this->parserQueue->id;
             $pn->created_at = new \yii\db\Expression("NOW()");
             if ($pn->save()) {
                 $this->parserQueue->status = ParserQueue::STATUS_DONE;
                 $this->parserQueue->save();
                 return true;
             }
             echo PHP_EOL . "ERROR" . PHP_EOL;
             print_r($pn->getErrors());
         }
         // Reached only when one of the save() calls above failed.
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
         $this->parserQueue->save();
     } catch (Exception $e) {
         print_r($e);
         $this->parserQueue->status = ParserQueue::STATUS_FAIL;
         $this->parserQueue->save();
     }
     return null;
 }
Esempio n. 6
0
 /**
  * Returns the id of the News row for a pending item, creating the row when none is linked yet.
  *
  * A new row is saved as "in_process", linked to the pending item, then switched to "done".
  * If its thumbnail passes checkRemoteFile(), that image is published to the "image" and
  * "twitter" queues. Otherwise the first result of an image search on the news title becomes
  * the thumbnail and is published instead; with no result nothing is published.
  *
  * @param PendingNews $pn pending item to turn into news
  * @return mixed id of the existing or newly created News row (the unsaved model's id when save() fails)
  */
 protected function createNews(PendingNews $pn)
 {
     $existingLink = Npn::findOne(["pending_news_id" => $pn->id]);
     if ($existingLink) {
         return $existingLink->news_id;
     }

     $news = new News();
     $news->title = NewsParserComponent::replace4byte($pn->title);
     $news->thumb = $pn->thumb_src;
     $news->status = "in_process";
     $news->cnt = 0;
     $news->created_at = new \yii\db\Expression('NOW()');
     $news->updated_at = new \yii\db\Expression('NOW()');

     if (!$news->save()) {
         print_r($news->getErrors());
         return $news->id;
     }

     $this->linkNews($news->id, $pn->id);
     $news->status = "done";
     $news->save();

     // Work out which image, if any, gets announced on the queues.
     $publish = false;
     $imageSrc = null;
     if ($news->thumb && PageLoaderComponent::checkRemoteFile($news->thumb)) {
         $publish = true;
         $imageSrc = $news->thumb;
     } else {
         $searchUrl = "https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q=" . urlencode($news->title) . "&userip=127.0.0.1&imgsz=large";
         $response = PageLoaderComponent::load($searchUrl);
         if ($response) {
             $found = json_decode($response);
             if (isset($found->responseData->results[0])) {
                 $imageSrc = $found->responseData->results[0]->unescapedUrl;
                 $news->thumb = $imageSrc;
                 $news->save();
                 $publish = true;
             }
         }
     }

     if ($publish) {
         $mq = new RabbitMQComponent();
         $payload = json_encode(["news_id" => $news->id, "src" => $imageSrc]);
         $mq->postMessage("image", "image", $payload);
         $mq->postMessage("twitter", "twitter", $payload);
     }

     return $news->id;
 }
Esempio n. 7
0
 /**
  * Downloads the source RSS feed and queues every item that is not in the parser queue yet.
  *
  * For each node matched by one of $this->itemPatterns the child elements are mapped to
  * title / content / link / category / image by comparing their node names with the configured
  * patterns. An item whose link has no ParserQueue row gets a new row and, when that row saves,
  * a PendingNews entry via PendingNews::add(). Items without a link are skipped.
  */
 public function run()
 {
     // A failed download yields a falsy value; bail out instead of feeding "" to loadXML().
     $rss = trim((string) PageLoaderComponent::load($this->source->url));
     if ($rss === '') {
         return;
     }
     // Drop the attributes (namespace declarations included) from the <rss> root tag.
     $rss = preg_replace("/<rss([^>]+)>/mi", "<rss>", $rss);
     $charset = "utf-8";
     // \1 forces the closing quote to match the opening one, and the lazy capture stops there,
     // so attributes following `encoding` no longer leak into the charset.
     if (preg_match('/<\\?xml[^>]*?encoding=([\'"])(.*?)\\1/i', $rss, $matches) && $matches[2] !== '') {
         $charset = $matches[2];
     }
     $doc = new \DOMDocument("1.1", $charset);
     $doc->preserveWhiteSpace = false;
     libxml_use_internal_errors(true);
     $doc->loadXML($rss);
     $xpath = new \DOMXpath($doc);
     // The `u` modifier makes the Cyrillic range match characters instead of raw UTF-8 bytes.
     $imageUrlPattern = '/(https?:\\/\\/[a-z0-9\\/_а-яё\\-\\.]*\\.(?:png|jpg))/iu';
     foreach ($this->itemPatterns as $pattern) {
         $newsList = $xpath->query($pattern->value);
         if (!$newsList) {
             continue;
         }
         foreach ($newsList as $news) {
             $newsParams = [];
             $newsParams['data'] = [];
             if ($this->source->category_id) {
                 $newsParams['data']['category_id'] = $this->source->category_id;
             }
             $newsParams['source'] = $this->source->source;
             // Initialised once per item. The default used to be assigned inside the node loop,
             // which disabled the <enclosure> fallback after the first child node.
             $newsParams['image_src'] = "";
             foreach ($news->childNodes as $node) {
                 if ($this->titlePattern == $node->nodeName) {
                     $newsParams['title'] = $node->nodeValue;
                 }
                 if ($this->contentPattern == $node->nodeName) {
                     $newsParams['content'] = $node->nodeValue;
                 }
                 if ($this->linkPattern == $node->nodeName) {
                     $newsParams['link'] = $node->nodeValue;
                 }
                 if ($this->categoryPattern == $node->nodeName) {
                     $newsParams['data']['category'] = $node->nodeValue;
                 }
                 if ($this->imagePattern == $node->nodeName && preg_match($imageUrlPattern, $node->nodeValue, $images)) {
                     $newsParams['image_src'] = $images[1];
                 }
                 // Fall back to the enclosure URL only while no image has been found.
                 if ($newsParams['image_src'] === "" && $node->nodeName == 'enclosure' && $node instanceof \DOMElement) {
                     if (preg_match($imageUrlPattern, $node->getAttribute('url'), $images)) {
                         $newsParams['image_src'] = $images[1];
                     }
                 }
             }
             // Without a link there is nothing to de-duplicate on or to parse later.
             if (!isset($newsParams['link']) || $newsParams['link'] === '') {
                 continue;
             }
             try {
                 $pqItem = ParserQueue::findOne(['url' => $newsParams['link']]);
                 if (!$pqItem) {
                     $pqItem = new ParserQueue();
                     $pqItem->source_id = $this->source->source_id;
                     $pqItem->url = $newsParams['link'];
                     $pqItem->status = ParserQueue::STATUS_INPROCESS;
                     $pqItem->created_at = new \yii\db\Expression('NOW()');
                     $pqItem->updated_at = new \yii\db\Expression('NOW()');
                     if ($pqItem->save()) {
                         PendingNews::add(
                             $newsParams['source'],
                             isset($newsParams['title']) ? $newsParams['title'] : null,
                             isset($newsParams['content']) ? $newsParams['content'] : null,
                             $newsParams['image_src'],
                             PendingNews::STATUS_NEW,
                             $pqItem,
                             $newsParams['data']
                         );
                     }
                 }
             } catch (Exception $e) {
                 // Deliberately best-effort, as before: one failing item must not stop the rest.
             }
         }
     }
 }