コード例 #1
0
 /**
  * Export the crawled texts of a task as a downloadable plain-text file.
  *
  * Up to `max_document` texts of the task are written one per line-terminated
  * entry into a timestamped file under the relative `crawl/` directory; the
  * file is removed after it has been sent.
  *
  * @param Guard $auth   Injected guard; not used by the method body.
  * @param mixed $taskId Primary key of the task whose texts are exported.
  *
  * @return mixed Download response for the generated file.
  *
  * @throws \RuntimeException When the export directory or file cannot be written.
  */
 public function getTexts(Guard $auth, $taskId)
 {
     $task = Task::findOrFail($taskId);
     $texts = TextCrawl::where('task_id', $taskId)->take($task->max_document)->get();
     $title = "crawl/" . $task->id . "_" . \Carbon\Carbon::now()->format('Y-m-d-His') . "_texts.txt";

     // Build the whole payload first so the file is written exactly once and
     // also exists (empty) when the task has no texts yet; otherwise the
     // download below would fail on a missing file.
     $content = '';
     foreach ($texts as $text) {
         $content .= $text->text . "\n";
     }

     $directory = dirname($title);
     if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
         throw new \RuntimeException('Unable to create export directory: ' . $directory);
     }

     // Overwrite instead of appending so a stale file with the same
     // timestamped name can never leak into this export.
     if (file_put_contents($title, $content) === false) {
         throw new \RuntimeException('Unable to write export file: ' . $title);
     }

     return response()->download($title)->deleteFileAfterSend(true);
 }
コード例 #2
0
 /**
  * Process one step of a crawl task.
  *
  * Each call handles at most one waiting URL of the task: the page is
  * fetched and either stored as a crawled text or scanned for further
  * links to queue. When no waiting URL is left, or the task already holds
  * `max_document` texts, the task and its remaining waiting URLs are
  * marked as finished instead.
  *
  * The JSON response carries a `status` field:
  *  - 202: the task does not exist or is not running; nothing was done.
  *  - 201: the task was marked as finished during this call.
  *  - 200: one waiting URL was processed.
  *
  * @param mixed $id Primary key of the task to advance.
  *
  * @return mixed JSON response of the form {"status": int}.
  */
 public function crawl($id)
 {
     ignore_user_abort(true);
     set_time_limit(0);
     $status = 202;
     $task = Task::find($id);
     if (is_null($task) || $task->status !== $task::STATUS_RUNNING) {
         return response()->json(['status' => $status]);
     }

     date_default_timezone_set('Asia/Jakarta');
     // Real-world HTML is rarely valid; collect libxml errors internally
     // instead of emitting warnings, and remember the prior setting so it
     // can be restored afterwards.
     $previousLibxmlSetting = libxml_use_internal_errors(true);
     $urlLog = UrlLog::where('task_id', $id)->where('status', \App\Model\UrlLog::STATUS_WAITING)->first();
     $status = 201;
     $countDocuments = $task->texts()->count();
     if (!is_null($urlLog) && $countDocuments < $task->max_document) {
         $status = 200;
         $task = $urlLog->task;
         try {
             $this->crawlProcessUrl($urlLog, $task, $id);
         } catch (\Exception $e) {
             // Best effort: a single unreachable or malformed page must not
             // stop the crawl, but the failure is recorded rather than lost.
             error_log('crawl: failed to process ' . $urlLog->url . ': ' . $e->getMessage());
         }
         // The URL is marked finished even on failure so it is not retried.
         $urlLog->status = $urlLog::STATUS_FINISH;
         $urlLog->save();
     } else {
         $task->status = $task::STATUS_FINISH;
         $task->save();
         UrlLog::where('task_id', $task->id)->where('status', \App\Model\UrlLog::STATUS_WAITING)->update(['status' => \App\Model\UrlLog::STATUS_FINISH]);
     }
     libxml_clear_errors();
     libxml_use_internal_errors($previousLibxmlSetting);
     if ($status === 201 && (is_null($urlLog) || $countDocuments < $task->max_document)) {
         $this->sendNotification($task->id);
     }

     return response()->json(['status' => $status]);
 }

 /**
  * Fetch a queued URL and either store its content or queue the links it contains.
  *
  * @param mixed $urlLog Queue entry being processed (reads `url` and `task_id`).
  * @param mixed $task   Task owning the queue entry.
  * @param mixed $id     Task id used for newly queued links.
  *
  * @return void
  *
  * @throws \RuntimeException When the page cannot be fetched or is empty.
  */
 private function crawlProcessUrl($urlLog, $task, $id)
 {
     $html = file_get_contents($urlLog->url);
     // loadHTML() cannot parse a failed (false) or empty download.
     if ($html === false || trim($html) === '') {
         throw new \RuntimeException('Empty or unreadable response from ' . $urlLog->url);
     }
     $dom = new \DOMDocument();
     $dom->loadHTML($html);

     if ($task->type_document === 'Article') {
         $articles = $dom->getElementsByTagName('article');
         // Only pages with exactly one <article> element are stored.
         if ($articles->length == 1 && $this->crawlIsArticleUrl($urlLog->url, $task)) {
             $this->crawlStoreText($urlLog->task_id, $dom->saveHTML($articles->item(0)));
         } else {
             $this->crawlQueueLinks($dom, $task, $id);
         }
     } elseif ($task->type_document === 'Raw') {
         if ($this->crawlIsArticleUrl($urlLog->url, $task)) {
             $this->crawlStoreText($urlLog->task_id, $dom->saveHTML());
         } else {
             $this->crawlQueueLinks($dom, $task, $id);
         }
     }
 }

 /**
  * Tell whether a URL contains the task's article URL but not its pagination URL (case-insensitive).
  *
  * @param string $url  URL to classify.
  * @param mixed  $task Task providing `url_article_crawl` and `url_pagination_crawl`.
  *
  * @return bool
  */
 private function crawlIsArticleUrl($url, $task)
 {
     $url = strtolower($url);

     return strpos($url, strtolower($task->url_article_crawl)) !== false
         && strpos($url, strtolower($task->url_pagination_crawl)) === false;
 }

 /**
  * Persist one crawled text for a task.
  *
  * @param mixed  $taskId Id of the owning task.
  * @param string $text   Markup to store.
  *
  * @return void
  */
 private function crawlStoreText($taskId, $text)
 {
     $textCrawl = new TextCrawl();
     $textCrawl->task_id = $taskId;
     $textCrawl->text = $text;
     $textCrawl->save();
 }

 /**
  * Queue every not-yet-known anchor of the page that matches the task's article or pagination URL.
  *
  * @param \DOMDocument $dom  Parsed page.
  * @param mixed        $task Task providing the URL patterns.
  * @param mixed        $id   Task id the new queue entries belong to.
  *
  * @return void
  */
 private function crawlQueueLinks(\DOMDocument $dom, $task, $id)
 {
     $articleUrl = strtolower($task->url_article_crawl);
     $paginationUrl = strtolower($task->url_pagination_crawl);

     foreach ($dom->getElementsByTagName('a') as $anchor) {
         // Strip a single trailing slash before comparing and storing.
         $link = preg_replace('/\\/$/', '', $anchor->getAttribute("href"));
         $lowerLink = strtolower($link);
         if (strpos($lowerLink, $articleUrl) === false && strpos($lowerLink, $paginationUrl) === false) {
             continue;
         }
         if (!$this->crawlIsDirectChild($link, $articleUrl)) {
             continue;
         }
         if (UrlLog::where('url', $link)->where('task_id', $id)->count() < 1) {
             $newUrlLog = new UrlLog();
             $newUrlLog->task_id = $id;
             $newUrlLog->url = $link;
             $newUrlLog->status = $newUrlLog::STATUS_WAITING;
             $newUrlLog->save();
         }
     }
 }

 /**
  * Check that nothing after the article URL inside the link contains a further "/".
  *
  * The search is case-insensitive, matching the link filter applied before it.
  * A link that does not contain the article URL at all (it matched through the
  * pagination URL only) has no remainder to inspect and is accepted.
  *
  * @param string $link       Candidate link.
  * @param string $articleUrl Lower-cased article URL of the task.
  *
  * @return bool
  */
 private function crawlIsDirectChild($link, $articleUrl)
 {
     $articleUrl = (string) $articleUrl;
     $position = $articleUrl === '' ? false : stripos($link, $articleUrl);
     if ($position === false) {
         return true;
     }
     $remainder = (string) substr($link, $position + strlen($articleUrl));

     return strpos($remainder, '/') === false;
 }