/**
 * Store a newly created crawl task and launch its background crawler.
 *
 * Creates the Task from the request data, records the index URL as the first
 * UrlLog entry of the task, then starts `crawlprocess.php` detached from the
 * request, passing it the URL of the `crawl.crawling` route for this task.
 *
 * @param  Guard  $auth  Guard used to resolve the authenticated user owning the task.
 * @param  \Illuminate\Http\Request  $request  Reads `max_document`, `url_index_crawl`
 *                                             and the fillable task attributes.
 * @return \Illuminate\Http\RedirectResponse  Redirect to `tasks.index` when the task
 *                                            was saved, otherwise back to the form.
 */
public function store(Guard $auth, Request $request)
{
    $task = new Task();

    // Cast before comparing so that a missing, empty or non-numeric value
    // consistently falls back to the default limit on every PHP version.
    if ((int) $request->get('max_document') <= 0) {
        $request->merge(['max_document' => 500]);
    }

    $task->fill($request->all());
    $task->user_id = $auth->user()->id;
    $task->status = Task::STATUS_RUNNING;

    if (!$task->save()) {
        return redirect()->back();
    }

    $urlLog = new UrlLog();
    $urlLog->task_id = $task->id;
    $urlLog->url = $request->get('url_index_crawl');
    // The crawler only picks up entries in the waiting state, so set it
    // explicitly instead of relying on a database default.
    $urlLog->status = $urlLog::STATUS_WAITING;
    $urlLog->save();

    // Escape the URL so no character in it can be interpreted by the shell.
    // The trailing `&` detaches the crawler so this request returns at once.
    $crawlUrl = route('crawl.crawling', ['id' => $task->id]);
    shell_exec('nohup php crawlprocess.php --url=' . escapeshellarg($crawlUrl) . ' >/dev/null 2>&1 &');

    return redirect()->route('tasks.index');
}
/**
 * Process the next waiting URL of a crawl task and report the outcome as JSON.
 *
 * The `status` field of the JSON response is:
 *  - 202 when the task does not exist or is not in the running state;
 *  - 200 when one waiting URL was processed (document stored or links queued);
 *  - 201 when no waiting URL is left or the task's `max_document` limit is
 *    reached; the task and its remaining waiting URLs are then marked finished.
 *
 * @param  mixed  $id  Identifier of the task to crawl.
 * @return mixed  JSON response built by `response()->json()`.
 */
public function crawl($id)
{
    ignore_user_abort(true);
    set_time_limit(0);

    $status = 202;
    $task = Task::find($id);

    if (is_null($task) || $task->status !== $task::STATUS_RUNNING) {
        return response()->json(['status' => $status]);
    }

    date_default_timezone_set('Asia/Jakarta');

    // Real-world HTML is rarely valid; collect libxml parse errors internally
    // instead of emitting warnings, and remember the previous mode to restore it.
    $previousLibxmlMode = libxml_use_internal_errors(true);

    try {
        $urlLog = UrlLog::where('task_id', $id)
            ->where('status', \App\Model\UrlLog::STATUS_WAITING)
            ->first();
        $status = 201;
        $countDocuments = $task->texts()->count();

        if (!is_null($urlLog) && $countDocuments < $task->max_document) {
            $status = 200;

            try {
                $this->processCrawlUrl($task, $urlLog);
            } catch (\Exception $e) {
                // Best effort: a page that cannot be fetched or parsed must not
                // stop the crawl, but the failure is recorded rather than lost.
                error_log(sprintf('Crawl of "%s" failed: %s', $urlLog->url, $e->getMessage()));
            }

            // Mark the URL as handled even on failure so it is not retried forever.
            $urlLog->status = $urlLog::STATUS_FINISH;
            $urlLog->save();
        } else {
            $task->status = $task::STATUS_FINISH;
            $task->save();

            UrlLog::where('task_id', $task->id)
                ->where('status', \App\Model\UrlLog::STATUS_WAITING)
                ->update(['status' => \App\Model\UrlLog::STATUS_FINISH]);
        }
    } finally {
        // Free the buffered parse errors and restore the caller's libxml mode,
        // also when an unexpected error escapes.
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlMode);
    }

    // NOTE(review): status 201 implies "no waiting URL OR limit reached", so this
    // condition only holds when the URL queue is exhausted; no notification is
    // sent when the crawl stops on max_document. Confirm that this is intended.
    if ($status === 201 && (is_null($urlLog) || $countDocuments < $task->max_document)) {
        $this->sendNotification($task->id);
    }

    return response()->json(['status' => $status]);
}

/**
 * Fetch one logged URL and either store it as a document or queue its links.
 *
 * A URL counts as a document when it contains the task's article pattern and
 * not its pagination pattern; for `Article` tasks the page must additionally
 * contain exactly one `<article>` element, which is the only part stored.
 * Task types other than `Article` and `Raw` are ignored.
 *
 * @param  Task  $task
 * @param  UrlLog  $urlLog
 * @return void
 */
private function processCrawlUrl(Task $task, UrlLog $urlLog)
{
    $pageUrl = (string) $urlLog->url;

    // The URL originates from user input: restrict fetching to HTTP(S) so that
    // stream wrappers such as file:// or php:// cannot be used to read local data.
    if (!preg_match('#^https?://#i', $pageUrl)) {
        error_log(sprintf('Crawl skipped unsupported URL "%s"', $pageUrl));

        return;
    }

    $html = file_get_contents($pageUrl);

    // Nothing to parse; DOMDocument::loadHTML() rejects empty input on PHP 8.
    if ($html === false || trim($html) === '') {
        return;
    }

    $dom = new \DOMDocument();
    $dom->loadHTML($html);

    $isDocumentUrl = $this->crawlUrlContains($pageUrl, $task->url_article_crawl)
        && !$this->crawlUrlContains($pageUrl, $task->url_pagination_crawl);

    if ($task->type_document === 'Article') {
        $articles = $dom->getElementsByTagName('article');

        if ($articles->length === 1 && $isDocumentUrl) {
            $this->storeCrawledText($urlLog, $dom->saveHTML($articles->item(0)));
        } else {
            $this->queueCrawlLinks($dom, $task);
        }
    } elseif ($task->type_document === 'Raw') {
        if ($isDocumentUrl) {
            $this->storeCrawledText($urlLog, $dom->saveHTML());
        } else {
            $this->queueCrawlLinks($dom, $task);
        }
    }
}

/**
 * Persist a crawled document for the task the URL belongs to.
 *
 * @param  UrlLog  $urlLog
 * @param  string  $text  Serialized HTML of the document.
 * @return void
 */
private function storeCrawledText(UrlLog $urlLog, $text)
{
    $textCrawl = new TextCrawl();
    $textCrawl->task_id = $urlLog->task_id;
    $textCrawl->text = $text;
    $textCrawl->save();
}

/**
 * Queue every not-yet-logged link of the page that matches the task patterns.
 *
 * @param  \DOMDocument  $dom
 * @param  Task  $task
 * @return void
 */
private function queueCrawlLinks(\DOMDocument $dom, Task $task)
{
    $articlePattern = strtolower((string) $task->url_article_crawl);

    foreach ($dom->getElementsByTagName('a') as $anchor) {
        // Drop a single trailing slash so "/foo" and "/foo/" are logged once.
        $link = preg_replace('/\\/$/', '', $anchor->getAttribute('href'));

        $matchesTask = $this->crawlUrlContains($link, $task->url_article_crawl)
            || $this->crawlUrlContains($link, $task->url_pagination_crawl);
        if (!$matchesTask) {
            continue;
        }

        // Part of the link following the article pattern. Both sides are
        // lower-cased so the split agrees with the case-insensitive match above;
        // links matching only the pagination pattern have no such part.
        $suffix = '';
        if ($articlePattern !== '') {
            $parts = explode($articlePattern, strtolower($link));
            $suffix = isset($parts[1]) ? $parts[1] : '';
        }

        // Skip links nested deeper than one path segment below the pattern.
        if (strpos($suffix, '/') !== false) {
            continue;
        }

        if (UrlLog::where('url', $link)->where('task_id', $task->id)->exists()) {
            continue;
        }

        $newUrlLog = new UrlLog();
        $newUrlLog->task_id = $task->id;
        $newUrlLog->url = $link;
        $newUrlLog->status = $newUrlLog::STATUS_WAITING;
        $newUrlLog->save();
    }
}

/**
 * Case-insensitive "contains" check that treats an empty pattern as no match.
 *
 * An empty needle makes strpos() warn on PHP 7 and match everything on PHP 8,
 * so an unset pattern is handled explicitly here.
 *
 * @param  string  $url
 * @param  string|null  $pattern
 * @return bool
 */
private function crawlUrlContains($url, $pattern)
{
    $pattern = (string) $pattern;

    return $pattern !== '' && stripos((string) $url, $pattern) !== false;
}