Пример #1
0
 /**
  * Store a newly created crawl task and launch its background crawl process.
  *
  * A missing or non-positive `max_document` input is replaced with 500.
  * When the task is saved, the submitted `url_index_crawl` is recorded as the
  * first UrlLog entry of the task and `crawlprocess.php` is started detached
  * from the request with the URL of the `crawl.crawling` route.
  *
  * @param  Guard  $auth  Provides the authenticated user who owns the task.
  * @param  \Illuminate\Http\Request  $request
  * @return \Illuminate\Http\RedirectResponse  Redirect to `tasks.index` on
  *         success, back to the previous page when the task cannot be saved.
  */
 public function store(Guard $auth, Request $request)
 {
     if ($request->get('max_document') <= 0) {
         $request->merge(['max_document' => 500]);
     }

     $task = new Task();
     $task->fill($request->all());
     $task->user_id = $auth->user()->id;
     $task->status = Task::STATUS_RUNNING;

     if (!$task->save()) {
         return redirect()->back();
     }

     $urlLog = new UrlLog();
     $urlLog->task_id = $task->id;
     $urlLog->url = $request->get('url_index_crawl');
     // crawl() only picks up entries in waiting status, so set it explicitly
     // instead of relying on a column default.
     $urlLog->status = UrlLog::STATUS_WAITING;
     $urlLog->save();

     // Quote the URL so that shell metacharacters in it (e.g. '&', ';', '?')
     // cannot alter the command line.
     $crawlUrl = route('crawl.crawling', ['id' => $task->id]);
     shell_exec('nohup php crawlprocess.php --url=' . escapeshellarg($crawlUrl) . ' >/dev/null 2>&1 &');

     return redirect()->route('tasks.index');
 }
Пример #2
0
 /**
  * Process one waiting URL of a running crawl task.
  *
  * The `status` field of the JSON payload reports what happened:
  *  - 202: the task does not exist or is not running; nothing was done.
  *  - 201: no waiting URL is left or the document limit is reached; the task
  *         and its remaining waiting URLs were marked finished.
  *  - 200: one waiting URL was fetched and processed.
  *
  * Failures while fetching or parsing a page are logged and the URL is still
  * marked finished, so a broken page cannot block the queue.
  *
  * @param  mixed  $id  Identifier of the task to advance.
  * @return \Illuminate\Http\JsonResponse
  */
 public function crawl($id)
 {
     ignore_user_abort(true);
     set_time_limit(0);

     $status = 202;
     $task = Task::find($id);
     if (is_null($task) || $task->status !== $task::STATUS_RUNNING) {
         return response()->json(['status' => $status]);
     }

     date_default_timezone_set('Asia/Jakarta');
     // Collect HTML parser warnings internally instead of emitting them.
     libxml_use_internal_errors(true);

     $urlLog = UrlLog::where('task_id', $id)
         ->where('status', \App\Model\UrlLog::STATUS_WAITING)
         ->first();
     $status = 201;
     $countDocuments = $task->texts()->count();

     if (!is_null($urlLog) && $countDocuments < $task->max_document) {
         $status = 200;
         $task = $urlLog->task;
         try {
             $this->crawlWaitingUrl($task, $urlLog, $id);
         } catch (\Exception $e) {
             // Best effort: a page that cannot be fetched or parsed is skipped,
             // but the reason is recorded instead of being silently dropped.
             error_log('Crawl failed for ' . $urlLog->url . ': ' . $e->getMessage());
         }
         $urlLog->status = $urlLog::STATUS_FINISH;
         $urlLog->save();
     } else {
         $task->status = $task::STATUS_FINISH;
         $task->save();
         UrlLog::where('task_id', $task->id)
             ->where('status', \App\Model\UrlLog::STATUS_WAITING)
             ->update(['status' => \App\Model\UrlLog::STATUS_FINISH]);
     }

     // Drop the buffered parser errors so they do not accumulate in memory.
     libxml_clear_errors();
     libxml_use_internal_errors(false);

     if ($status === 201 && (is_null($urlLog) || $countDocuments < $task->max_document)) {
         $this->sendNotification($task->id);
     }

     return response()->json(['status' => $status]);
 }

 /**
  * Fetch a single URL and either store its content or queue the links on it.
  *
  * A URL counts as an article page when it contains the task's article
  * pattern and does not contain its pagination pattern. For 'Article' tasks
  * the page must additionally hold exactly one <article> element, which is
  * what gets stored; for 'Raw' tasks the whole document is stored. Any other
  * page of these two task types is scanned for further links. Tasks of any
  * other document type are left alone.
  *
  * @param  mixed  $task    Task the URL belongs to.
  * @param  mixed  $urlLog  Waiting URL entry being processed.
  * @param  mixed  $id      Task identifier used for newly queued URLs.
  * @return void
  */
 private function crawlWaitingUrl($task, $urlLog, $id)
 {
     $html = file_get_contents($urlLog->url);
     if ($html === false || $html === '') {
         // Nothing to parse; DOMDocument::loadHTML() rejects empty input.
         return;
     }

     $dom = new \DOMDocument();
     $dom->loadHTML($html);

     $isArticleUrl = $this->crawlUrlContains($urlLog->url, $task->url_article_crawl)
         && !$this->crawlUrlContains($urlLog->url, $task->url_pagination_crawl);

     if ($task->type_document === 'Article') {
         $articles = $dom->getElementsByTagName('article');
         if ($articles->length == 1 && $isArticleUrl) {
             $this->storeCrawledText($urlLog->task_id, $dom->saveHTML($articles->item(0)));
             return;
         }
     } elseif ($task->type_document === 'Raw') {
         if ($isArticleUrl) {
             $this->storeCrawledText($urlLog->task_id, $dom->saveHTML());
             return;
         }
     } else {
         return;
     }

     $this->queueCrawlLinks($dom, $task, $id);
 }

 /**
  * Persist crawled markup as a TextCrawl row of the given task.
  *
  * @param  mixed   $taskId
  * @param  string  $text
  * @return void
  */
 private function storeCrawledText($taskId, $text)
 {
     $textCrawl = new TextCrawl();
     $textCrawl->task_id = $taskId;
     $textCrawl->text = $text;
     $textCrawl->save();
 }

 /**
  * Queue every not yet known link of the document that matches the task's
  * article or pagination pattern.
  *
  * Links whose part after the article pattern still contains a '/' are
  * skipped. Links that match only the pagination pattern have no such part
  * and are queued.
  *
  * @param  \DOMDocument  $dom
  * @param  mixed         $task
  * @param  mixed         $id    Task identifier stored on the new entries.
  * @return void
  */
 private function queueCrawlLinks(\DOMDocument $dom, $task, $id)
 {
     $articlePattern = strtolower((string) $task->url_article_crawl);

     foreach ($dom->getElementsByTagName('a') as $anchor) {
         // Strip one trailing slash so equivalent URLs are stored once.
         $link = preg_replace('/\\/$/', '', $anchor->getAttribute("href"));

         if (!$this->crawlUrlContains($link, $task->url_article_crawl)
             && !$this->crawlUrlContains($link, $task->url_pagination_crawl)) {
             continue;
         }

         // Text following the article pattern. Split the lowercased link so
         // the split is case-insensitive like the match above, and tolerate
         // links in which the article pattern does not occur at all.
         $remainder = '';
         if ($articlePattern !== '') {
             $parts = explode($articlePattern, strtolower($link));
             $remainder = isset($parts[1]) ? $parts[1] : '';
         }
         if (strpos($remainder, '/') !== false) {
             continue;
         }

         if (UrlLog::where('url', $link)->where('task_id', $id)->count() < 1) {
             $newUrlLog = new UrlLog();
             $newUrlLog->task_id = $id;
             $newUrlLog->url = $link;
             $newUrlLog->status = $newUrlLog::STATUS_WAITING;
             $newUrlLog->save();
         }
     }
 }

 /**
  * Case-insensitive substring test in which an empty pattern matches nothing.
  *
  * Guarding the empty pattern avoids the version-dependent result of
  * strpos() with an empty needle.
  *
  * @param  string  $url
  * @param  string  $pattern
  * @return bool
  */
 private function crawlUrlContains($url, $pattern)
 {
     $pattern = strtolower((string) $pattern);

     return $pattern !== '' && strpos(strtolower((string) $url), $pattern) !== false;
 }