/**
 * Re-queues pending news for compilation.
 *
 * Walks every PendingNews row matching `id <= 4293` and, for each row whose
 * search_content is non-empty, posts a "compile" message carrying the row id.
 */
public function actionIndex()
{
    // Created on first use and then reused, instead of building a new
    // RabbitMQComponent for every single row.
    $mq = null;

    foreach (PendingNews::find()->where("id <= 4293")->each() as $pn) {
        if (!$pn->search_content) {
            continue;
        }

        if ($mq === null) {
            $mq = new RabbitMQComponent();
        }

        $mq->postMessage("compile", "compile", json_encode(["pn_id" => $pn->id]));
    }
}
/**
 * Builds a data provider for PendingNews with this model's filters applied.
 *
 * If the parameters load successfully but fail validation, the provider is
 * returned without any filter conditions.
 *
 * @param array $params
 *
 * @return ActiveDataProvider
 */
public function search($params)
{
    $query = PendingNews::find();
    $dataProvider = new ActiveDataProvider(['query' => $query]);

    $loaded = $this->load($params);
    if ($loaded && !$this->validate()) {
        return $dataProvider;
    }

    // Exact-match columns.
    $exactFilters = [
        'id' => $this->id,
        'source_id' => $this->source_id,
        'pq_id' => $this->pq_id,
        'processed' => $this->processed,
        'created_at' => $this->created_at,
        'update_at' => $this->update_at,
    ];
    $query->andFilterWhere($exactFilters);

    // Substring-match columns, applied in the same order as before.
    $likeAttributes = ['title', 'content', 'search_content', 'thumb_src', 'status', 'group_hash'];
    foreach ($likeAttributes as $attribute) {
        $query->andFilterWhere(['like', $attribute, $this->$attribute]);
    }

    return $dataProvider;
}
/**
 * Queue consumer callback: runs similarity detection for one pending news item.
 *
 * The message body is decoded as JSON and must be an object carrying `pn_id`.
 * The message is acknowledged after handling, including when the payload is
 * malformed, the item does not exist, or detection throws a caught exception.
 *
 * @param object $msg message exposing `body` and `delivery_info`
 *                    (with the 'channel' and 'delivery_tag' entries)
 */
public static function processMessage($msg)
{
    $params = json_decode($msg->body);
    print_r($params);

    // json_decode() yields null (or a scalar/array) for bad input; such
    // messages are skipped rather than dereferenced as an object.
    if (is_object($params) && isset($params->pn_id)) {
        try {
            if ($pn = PendingNews::findOne(['id' => $params->pn_id])) {
                $detector = new SimilarDetectComponent($pn);
                $detector->detect();
            }
        } catch (Exception $e) {
            echo $e->getMessage();
        }
    }

    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
/**
 * Queue consumer callback: parses the page for one parser-queue / pending-news pair.
 *
 * The message body is decoded as JSON and must be an object carrying `pq_id`
 * and `pn_id`. When both records exist, NewsParserComponent is run for them.
 * If a caught exception has a code other than 505, the original body is posted
 * again to the "parse"/"parse_rss" queue; the message is then acknowledged.
 *
 * @param object $msg message exposing `body` and `delivery_info`
 *                    (with the 'channel' and 'delivery_tag' entries)
 */
public static function processMessage($msg)
{
    $params = json_decode($msg->body);
    print_r($params);

    // A payload that is not an object with both ids can never succeed.
    // It is acknowledged and dropped, so a failure caused by the payload
    // itself cannot trigger the re-post below over and over.
    if (is_object($params) && isset($params->pq_id, $params->pn_id)) {
        try {
            $pqItem = ParserQueue::findOne(["id" => $params->pq_id]);
            $pnItem = PendingNews::findOne(["id" => $params->pn_id]);

            if ($pqItem && $pnItem) {
                $newsParser = new NewsParserComponent($pqItem, $pnItem);
                $newsParser->run();
            }
        } catch (Exception $e) {
            echo $e->getMessage();

            // NOTE(review): code 505 is treated as "do not retry"; its exact
            // meaning is defined by whoever throws it - confirm.
            if ($e->getCode() != 505) {
                $mq = new RabbitMQComponent();
                $mq->postMessage("parse", "parse_rss", $msg->body);
            }
        }
    }

    $msg->delivery_info['channel']->basic_ack($msg->delivery_info['delivery_tag']);
}
/**
 * Loads the page at $this->url, extracts its article content and stores it as pending news.
 *
 * Flow:
 *  1. Download the HTML and convert it to UTF-8, using the charset from the
 *     page's <meta> tag, else the source's `default_encoding` setting, else
 *     mbstring's default source encoding.
 *  2. Pass the HTML through stripTagWithContent() for `script`, take the
 *     content from Readabillity and run it through the stop-word, URL-fixing
 *     and exclude-element filters.
 *  3. Reject the post when processPublishDate() yields a date other than today.
 *  4. Reject content that is empty after tag stripping, or that has fewer
 *     space-separated words than the `news_min_length` setting.
 *  5. Update $this->pendingNews when it is set (then fill tags and post a
 *     "compile" message); otherwise create a new PendingNews row.
 *
 * The parser queue item is marked DONE on success and FAIL on every failure
 * path, including the "no content at all" case.
 *
 * @return bool|null true when the pending news row was saved, null otherwise
 */
public function run()
{
    echo "Try parse `{$this->url}`\n";

    $html = PageLoaderComponent::load($this->url);
    if (!$html) {
        $this->markParserQueue(ParserQueue::STATUS_FAIL);

        return;
    }

    preg_match('/<meta.*?charset=("?|\\")(.*?)("|\\")/i', $html, $matches);
    if (isset($matches[2])) {
        if ($charset = $matches[2]) {
            $html = mb_convert_encoding($html, "UTF-8", $charset);
        } else {
            echo "ERROR ON ENCODING DETECTING";
        }
    } else {
        $defaultEncoding = SourcesSettings::findOne(['source_id' => $this->source->id, 'name' => 'default_encoding']);
        if ($defaultEncoding) {
            $html = mb_convert_encoding($html, "UTF-8", $defaultEncoding->value);
        } else {
            $html = mb_convert_encoding($html, "UTF-8");
        }
    }

    try {
        $html = $this->stripTagWithContent($html, "script");

        $readability = new Readabillity($this->url);

        // NOTE(review): the title is a hard-coded placeholder; it is only used
        // when a brand-new PendingNews row is created below.
        $title = $this->processTitleStopWords("test title");

        $content = $readability->getContent();
        $content = $this->processContentStopWords($content);
        $content = preg_replace('/\\n/', ' ', $content);
        $content = $this->fixUrls($content);
        $content = $this->processExcludeElements($content);

        $date = $this->processPublishDate($content);
        if ($date && !(date("Y-m-d") == date("Y-m-d", $date))) {
            // Handled by the catch below, which marks the queue item as failed.
            throw new Exception("Old post");
        }

        $searchContent = trim(strip_tags($content));
        if (!$searchContent) {
            // Nothing usable was extracted. This path used to leave the queue
            // item untouched; it now fails like every other rejection.
            $this->markParserQueue(ParserQueue::STATUS_FAIL);

            return;
        }

        $searchContent = preg_replace('/\\n/', ' ', $searchContent);
        $wordCount = count(explode(" ", $searchContent));
        if (!($wordCount >= Settings::findOne(['name' => 'news_min_length'])->value)) {
            $this->markParserQueue(ParserQueue::STATUS_FAIL);

            return;
        }

        if ($this->pendingNews) {
            $this->pendingNews->content = $content;
            $this->pendingNews->search_content = $searchContent;
            $this->pendingNews->status = PendingNews::STATUS_NEW;

            // Keep an already known thumbnail; only detect one when missing.
            if (!$this->pendingNews->thumb_src) {
                if ($thumbUrl = $this->detectThumb($html, $content)) {
                    $this->pendingNews->thumb_src = $thumbUrl;
                }
            }

            if (!$this->pendingNews->save()) {
                print_r($this->pendingNews->getErrors());
                $this->markParserQueue(ParserQueue::STATUS_FAIL);

                return;
            }

            // Tagging is best-effort: a failure is printed but does not fail the item.
            try {
                PendingNews::fillTags($this->pendingNews->search_content, $this->pendingNews->id);
            } catch (\Exception $e) {
                print_r($e->getMessage());
            }

            $mq = new RabbitMQComponent();
            $mq->postMessage("compile", "compile", json_encode(["pn_id" => $this->pendingNews->id]));
            $this->markParserQueue(ParserQueue::STATUS_DONE);

            return true;
        }

        echo PHP_EOL . "NEWS CREATION" . PHP_EOL;

        $pn = new PendingNews();
        $pn->source_id = $this->source->id;
        $pn->title = $title;
        $pn->content = $content;
        $pn->search_content = $searchContent;
        $pn->status = PendingNews::STATUS_NEW;
        // microtime() rather than time(): rows created within the same second
        // must not end up sharing a group hash.
        $pn->group_hash = md5(microtime());
        $pn->thumb_src = $this->detectThumb($html, $content);
        $pn->pq_id = $this->parserQueue->id;
        $pn->created_at = new \yii\db\Expression("NOW()");

        if ($pn->save()) {
            $this->markParserQueue(ParserQueue::STATUS_DONE);

            return true;
        }

        echo PHP_EOL . "ERROR" . PHP_EOL;
        print_r($pn->getErrors());
        $this->markParserQueue(ParserQueue::STATUS_FAIL);
    } catch (Exception $e) {
        print_r($e);
        $this->markParserQueue(ParserQueue::STATUS_FAIL);
    }
}

/**
 * Stores the given status on the current parser queue item and saves it.
 *
 * @param mixed $status one of the ParserQueue::STATUS_* values
 */
private function markParserQueue($status)
{
    $this->parserQueue->status = $status;
    $this->parserQueue->save();
}
/**
 * Relation: the PendingNews rows whose `pq_id` equals this record's `id`.
 *
 * @return \yii\db\ActiveQuery
 */
public function getPendingNews()
{
    $link = ['pq_id' => 'id'];
    $relation = $this->hasMany(PendingNews::className(), $link);

    return $relation;
}
/**
 * Looks up a PendingNews model by primary key.
 * A 404 HTTP exception is thrown when no such model exists.
 *
 * @param integer $id
 *
 * @return PendingNews the loaded model
 * @throws NotFoundHttpException if the model cannot be found
 */
protected function findModel($id)
{
    $model = PendingNews::findOne($id);

    if ($model === null) {
        throw new NotFoundHttpException('The requested page does not exist.');
    }

    return $model;
}
/**
 * Creates a PendingNews row from already extracted feed data.
 *
 * The content is stripped of tags and of every character that is not a
 * Cyrillic/Latin letter or a space. Nothing is created (and the queue item
 * is left untouched) when the result is empty or has fewer space-separated
 * words than the `news_min_length` setting.
 *
 * When a parser queue item is given, it is linked to the new row and marked
 * DONE if the row was saved, FAIL otherwise.
 *
 * @param Sources          $source       source the news belongs to
 * @param string           $title        news title
 * @param string           $content      raw (HTML) content
 * @param string           $image_src    thumbnail URL
 * @param mixed            $status       initial PendingNews status
 * @param ParserQueue|null $parser_queue optional queue item to link and update
 * @param array            $data         extra data, stored JSON-encoded when not empty
 */
public static function add(Sources $source, $title, $content, $image_src, $status = PendingNews::STATUS_NEW, ParserQueue $parser_queue = null, $data = [])
{
    $searchContent = trim(strip_tags($content));
    if (!$searchContent) {
        return;
    }

    $searchContent = preg_replace("/[^а-яa-z ]/ui", "", $searchContent);
    $wordCount = count(explode(" ", $searchContent));
    if (!($wordCount >= Settings::findOne(['name' => 'news_min_length'])->value)) {
        return;
    }

    $now = "NOW()";

    $pn = new PendingNews();
    $pn->source_id = $source->id;
    $pn->title = $title;
    $pn->content = $content;
    $pn->search_content = $searchContent;
    $pn->status = $status;
    $pn->group_hash = md5(microtime());
    $pn->thumb_src = $image_src;
    if ($parser_queue) {
        $pn->pq_id = $parser_queue->id;
    }
    if (!empty($data)) {
        $pn->additonal_data = json_encode($data);
    }
    $pn->created_at = new \yii\db\Expression($now);
    $pn->update_at = new \yii\db\Expression($now);

    $saved = $pn->save();

    if ($parser_queue) {
        $parser_queue->status = $saved ? ParserQueue::STATUS_DONE : ParserQueue::STATUS_FAIL;
        $parser_queue->save();
    }
}
/**
 * Links a news item to categories based on its pending-news record.
 *
 * Order of precedence:
 *  - If the pending news carries `category_id` in its additonal_data and that
 *    link does not exist yet, it is created and the method returns true.
 *  - Otherwise every CategoryWords entry is matched (case-insensitively, as a
 *    substring) against the `category` text from additonal_data when present,
 *    or else against the pending news search_content; each matching word's
 *    category is linked unless the link already exists.
 *
 * @param int $news_id id of the news row to attach categories to
 * @param int $pn_id   id of the PendingNews row used as the data source
 *
 * @return bool|null true when the explicit `category_id` link was just created, null otherwise
 */
protected function detectCategories($news_id, $pn_id)
{
    $pn = PendingNews::findOne($pn_id);
    if (!$pn) {
        // Nothing to analyse; avoids dereferencing a missing record.
        return;
    }

    $content = mb_strtolower($pn->search_content, 'utf-8');

    if ($pn->additonal_data) {
        $data = json_decode($pn->additonal_data);

        if (isset($data->category_id)) {
            if (!NewsHasCategory::findOne(['category_id' => $data->category_id, 'news_id' => $news_id])) {
                $nhc = new NewsHasCategory();
                $nhc->news_id = $news_id;
                $nhc->category_id = $data->category_id;
                $nhc->save();

                return true;
            }
        }

        // `category` is optional in the stored data (and $data is null for
        // invalid JSON), so test for presence instead of reading it blindly.
        if (!empty($data->category)) {
            $content = mb_strtolower($data->category, 'utf-8');
        }
    }

    foreach (CategoryWords::find()->all() as $cw) {
        $word = mb_strtolower($cw->word, 'utf-8');
        if (mb_strpos($content, $word, 0, 'utf-8') === false) {
            continue;
        }

        if (!NewsHasCategory::findOne(['category_id' => $cw->category_id, 'news_id' => $news_id])) {
            $nhc = new NewsHasCategory();
            $nhc->news_id = $news_id;
            $nhc->category_id = $cw->category_id;
            $nhc->save();
        }
    }
}
/**
 * Downloads the RSS feed at $this->source->url and queues every new item found in it.
 *
 * For each node matched by one of $this->itemPatterns (XPath expressions), the
 * child nodes are mapped onto title / content / link / category / image using
 * the configured node names. An `enclosure` node's `url` attribute is used as
 * the image when no image has been found yet. Items without a link are skipped.
 *
 * For a link not yet present in ParserQueue, a queue row is created with
 * STATUS_INPROCESS and the item is handed to PendingNews::add().
 */
public function run()
{
    $rss = trim((string) PageLoaderComponent::load($this->source->url));
    if ($rss === '') {
        // Download failed or returned nothing; loadXML() cannot take an empty string.
        return;
    }

    // Drop all attributes of the <rss> root element.
    $rss = preg_replace("/<rss([^>]+)>/mi", "<rss>", $rss);

    // Non-greedy value with a back-referenced closing quote, so the capture
    // stops at the end of the encoding attribute for both quote styles.
    $charset = "utf-8";
    if (preg_match('/<\\?xml.*?encoding=([\'"])(.*?)\\1/i', $rss, $matches) && $matches[2] !== '') {
        $charset = $matches[2];
    }

    $doc = new \DOMDocument("1.1", $charset);
    $doc->preserveWhiteSpace = false;
    libxml_use_internal_errors(true);
    $loaded = $doc->loadXML($rss);
    // Internal errors are buffered by libxml; clear them so they do not pile up.
    libxml_clear_errors();
    if (!$loaded) {
        return;
    }

    $xpath = new \DOMXpath($doc);
    $imageRegex = '/(https?:\\/\\/[a-z0-9\\/_а-я\\-\\.]*\\.(?:png|jpg))/i';

    foreach ($this->itemPatterns as $pattern) {
        $newsList = $xpath->query($pattern->value);
        if (!$newsList) {
            continue;
        }

        for ($i = 0; $i < $newsList->length; $i++) {
            $news = $newsList->item($i);

            $newsParams = array();
            $newsParams['data'] = [];
            if ($this->source->category_id) {
                $newsParams['data']['category_id'] = $this->source->category_id;
            }
            $newsParams['source'] = $this->source->source;

            foreach ($news->childNodes as $node) {
                if ($this->titlePattern == $node->nodeName) {
                    $newsParams['title'] = $node->nodeValue;
                }
                if ($this->contentPattern == $node->nodeName) {
                    $newsParams['content'] = $node->nodeValue;
                }
                if ($this->linkPattern == $node->nodeName) {
                    $newsParams['link'] = $node->nodeValue;
                }
                if ($this->categoryPattern == $node->nodeName) {
                    $newsParams['data']['category'] = $node->nodeValue;
                }
                if ($this->imagePattern == $node->nodeName) {
                    if (preg_match_all($imageRegex, $node->nodeValue, $images)) {
                        $newsParams['image_src'] = $images[1][0];
                    }
                }
                if (!isset($newsParams['image_src']) && $node->nodeName == 'enclosure') {
                    if (preg_match_all($imageRegex, $node->getAttribute('url'), $images)) {
                        $newsParams['image_src'] = $images[1][0];
                    }
                }
            }

            // The default is applied only after ALL child nodes were inspected.
            // Setting it inside the loop disabled the enclosure fallback for
            // every node after the first one.
            if (!isset($newsParams['image_src'])) {
                $newsParams['image_src'] = "";
            }

            if (!isset($newsParams['link'])) {
                // Without a link there is nothing to de-duplicate on or to queue.
                continue;
            }

            try {
                $pqItem = ParserQueue::findOne(['url' => $newsParams['link']]);
                if ($pqItem) {
                    continue;
                }

                $pqItem = new ParserQueue();
                $pqItem->source_id = $this->source->source_id;
                $pqItem->url = $newsParams['link'];
                $pqItem->status = ParserQueue::STATUS_INPROCESS;
                $pqItem->created_at = new \yii\db\Expression('NOW()');
                $pqItem->updated_at = new \yii\db\Expression('NOW()');

                if ($pqItem->save()) {
                    PendingNews::add(
                        $newsParams['source'],
                        isset($newsParams['title']) ? $newsParams['title'] : null,
                        isset($newsParams['content']) ? $newsParams['content'] : '',
                        $newsParams['image_src'],
                        PendingNews::STATUS_NEW,
                        $pqItem,
                        $newsParams['data']
                    );
                }
            } catch (Exception $e) {
                // One bad item must not abort the whole feed, but the failure
                // is recorded instead of being swallowed silently.
                \Yii::warning($e->getMessage(), __METHOD__);
            }
        }
    }
}
/**
 * Returns a short plain-text preview of this news item.
 *
 * The preview is taken from the search_content of the linked pending news
 * with the highest pending_news_id: the first $length characters, with HTML
 * entities decoded afterwards.
 *
 * @param int $length maximum number of characters taken from search_content
 *
 * @return string|null the preview, or null when no linked pending news exists
 */
public function getShort($length = 150)
{
    $npn = Npn::find()
        ->where(['news_id' => $this->id])
        ->orderBy(['pending_news_id' => SORT_DESC])
        ->limit(1)
        ->one();

    if (!$npn) {
        // No link row at all; avoids reading a property of null.
        return null;
    }

    if ($pn = PendingNews::findOne($npn->pending_news_id)) {
        return html_entity_decode(mb_substr($pn->search_content, 0, $length, 'utf-8'));
    }

    return null;
}
/**
 * Relation: the PendingNews row whose `id` equals this record's `pending_news_id`.
 *
 * @return \yii\db\ActiveQuery
 */
public function getPendingNews()
{
    $link = ['id' => 'pending_news_id'];
    $relation = $this->hasOne(PendingNews::className(), $link);

    return $relation;
}
/**
 * Queues every feed item for full-page parsing ("combined" RSS + page mode).
 *
 * The node names for title, link, image and category are read from
 * SourcesSettings for this source, and items are located with the
 * `rss_news_item_pattern` XPath settings. When no category node name is
 * configured, any node named "category" (case-insensitive) is used. The
 * source's base URL is removed from each link.
 *
 * For each item with a link, a ParserQueue row is created, then a SUSPENDED
 * PendingNews placeholder (blank content). When the placeholder is saved, the
 * queue row is marked DONE and a "parse"/"parse_rss" message with both ids is
 * posted; when it is not, the queue row is marked FAIL.
 *
 * @param \DOMXPath $xpath XPath object bound to the loaded feed document
 */
private function processCombine($xpath)
{
    $sourceId = $this->source->source_id;

    // Value of a single per-source setting, or null when it is not configured.
    $setting = function ($name) use ($sourceId) {
        $row = SourcesSettings::findOne(['source_id' => $sourceId, 'name' => $name]);

        return $row ? $row->value : null;
    };

    $itemPatterns = SourcesSettings::findAll(['source_id' => $sourceId, 'name' => 'rss_news_item_pattern']);
    $titlePattern = $setting('rss_title');
    // NOTE(review): `rss_content` is looked up but never used here, because
    // the placeholder row is created with blank content.
    $contentPattern = $setting('rss_content');
    $linkPattern = $setting('rss_link');
    $imagePattern = $setting('rss_image');
    $categoryPattern = $setting('rss_category');

    $imageRegex = '/(https?:\\/\\/[a-z0-9\\/_а-я\\-\\.]*\\.(?:png|jpg))/i';

    foreach ($itemPatterns as $pattern) {
        $newsList = $xpath->query($pattern->value);
        if (!$newsList) {
            continue;
        }

        for ($i = 0; $i < $newsList->length; $i++) {
            $news = $newsList->item($i);

            $newsParams = array();
            $newsParams['data'] = [];
            if ($this->source->category_id) {
                $newsParams['data']['category_id'] = $this->source->category_id;
            }
            $newsParams['source'] = $this->source->source;

            foreach ($news->childNodes as $node) {
                if ($titlePattern == $node->nodeName) {
                    $newsParams['title'] = $node->nodeValue;
                }

                if ($categoryPattern) {
                    if ($categoryPattern == $node->nodeName) {
                        $newsParams['data']['category'] = $node->nodeValue;
                    }
                } elseif ("category" == strtolower($node->nodeName)) {
                    $newsParams['data']['category'] = $node->nodeValue;
                }

                if ($linkPattern == $node->nodeName) {
                    $newsParams['link'] = str_replace($this->source->source->url, '', $node->nodeValue);
                }

                if ($imagePattern == $node->nodeName) {
                    if (preg_match_all($imageRegex, $node->nodeValue, $images)) {
                        $newsParams['image_src'] = $images[1][0];
                    }
                }

                // Fall back to the enclosure URL while no image is known yet.
                if (!isset($newsParams['image_src']) && $node->nodeName == 'enclosure') {
                    if (preg_match_all($imageRegex, $node->getAttribute('url'), $images)) {
                        $newsParams['image_src'] = $images[1][0];
                    }
                }
            }

            if (!array_key_exists("link", $newsParams)) {
                continue;
            }

            try {
                $pqItem = new ParserQueue();
                $pqItem->source_id = $sourceId;
                $pqItem->url = $newsParams['link'];
                $pqItem->status = ParserQueue::STATUS_INPROCESS;
                $pqItem->created_at = new \yii\db\Expression('NOW()');
                $pqItem->updated_at = new \yii\db\Expression('NOW()');

                if (!$pqItem->save()) {
                    continue;
                }

                $pn = new PendingNews();
                $pn->content = ' ';
                $pn->search_content = ' ';
                $pn->source_id = $sourceId;
                if (isset($newsParams['title'])) {
                    $pn->title = $newsParams['title'];
                }
                $pn->status = PendingNews::STATUS_SUSPENDED;
                // microtime() instead of time(): this loop creates many rows per
                // second and md5(time()) gave all of them the same group hash.
                $pn->group_hash = md5(microtime());
                if (isset($newsParams['image_src'])) {
                    $pn->thumb_src = $newsParams['image_src'];
                }
                $pn->pq_id = $pqItem->id;
                if (!empty($newsParams['data'])) {
                    $pn->additonal_data = json_encode($newsParams['data']);
                }
                $pn->created_at = new \yii\db\Expression("NOW()");
                $pn->update_at = new \yii\db\Expression("NOW()");

                if ($pn->save()) {
                    $pqItem->status = ParserQueue::STATUS_DONE;
                    $pqItem->save();
                    $this->mq->postMessage("parse", "parse_rss", json_encode(["pn_id" => $pn->id, "pq_id" => $pqItem->id]));
                } else {
                    $pqItem->status = ParserQueue::STATUS_FAIL;
                    $pqItem->save();
                }
            } catch (\yii\db\Exception $e) {
                // Deliberately ignored so one failing item does not stop the feed.
                // NOTE(review): presumably this covers re-inserting an already
                // queued URL - confirm against the ParserQueue schema.
            }
        }
    }
}