/**
 * Fetches and cleans the reference pages for every query.
 *
 * For each query the urls are tried in order until the wanted number of pages
 * has been fetched and cleaned. A url whose fetch or cleaning reports an error
 * is skipped and, while the spare-url budget lasts, one more url is allowed in
 * its place. The handled urls and the collected errors are passed to
 * Storekeeper::storeReferencies() and Storekeeper::storeErrors().
 *
 * @param array      $data       query text => list of reference urls
 * @param mixed      $project_id project identifier, passed through to Reference and Storekeeper
 * @param array|null $last       query text => number of references wanted for that query, or null
 *
 * @return array two elements: [query text => list of stored file names,
 *               query text => number of successfully handled references]
 */
public function getHtmlFiles($data, $project_id, $last)
{
    $reference = new Reference();

    // Substring that Reference::getHtml() / removeGarbage() results are checked for to detect a failure.
    $error_marker = "ERROR: Can not";

    // File names of successfully handled pages, grouped by query.
    $files = [];
    // Error messages; kept apart from $files so that a query named "errors" cannot collide with them.
    $errors = [];
    // Urls that were handled successfully, grouped by query.
    $refs_handled = [];
    // Number of successfully handled urls per query.
    $ref_counts = [];
    // Maximum number of spare urls that may be taken as replacements.
    // The budget is shared by all queries of this call: it is not reset per query.
    $max_extra_urls = 2;

    foreach ($data as $query => $urls_array) {
        $ref_count = 0;

        // Fall back to 10 when no limit is known for this query
        // ($last is null or has no entry for the query).
        if (!is_null($last) && isset($last[$query])) {
            $max_ref_count = $last[$query];
        } else {
            $max_ref_count = 10;
        }

        foreach ($urls_array as $url) {
            // Enough pages collected: the remaining urls would be ignored anyway.
            if (!($ref_count < $max_ref_count)) {
                break;
            }

            $failure = null;

            // Fetch the html.
            $html_content = $reference->getHtml($url, $query, $ref_count, $project_id);

            // strpos() returns 0 when the marker starts the string, so compare against false explicitly.
            if (strpos($html_content, $error_marker) !== false) {
                $failure = $html_content;
            } else {
                $filename = $query . "_" . $ref_count . ".html";

                // "Clean" the html (the original note says the result is written to the html_wo_garbage folder).
                $cleaned_content = $reference->removeGarbage($html_content, $project_id, $filename);
                if (strpos($cleaned_content, $error_marker) !== false) {
                    $failure = $cleaned_content;
                }
            }

            if ($failure !== null) {
                $errors[] = $failure;

                // On error take a spare url instead, while the budget lasts.
                if ($max_extra_urls > 0) {
                    $max_ref_count++;
                    $max_extra_urls--;
                }
                continue;
            }

            $files[$query][] = $filename;
            $refs_handled[$query][] = $url;
            $ref_count += 1;
        }

        $ref_counts[$query] = $ref_count;
    }

    $storekeeper = new Storekeeper();
    // Record the urls that will really be processed (reports/Handled_referencies.txt per the original note).
    $storekeeper->storeReferencies($refs_handled, $project_id);
    // Record the errors met while fetching and pre-processing the html documents
    // (reports/errors.txt per the original note).
    $storekeeper->storeErrors($errors, $project_id);

    // File names keyed by query text, plus the per-query counters.
    return [$files, $ref_counts];
}
/**
 * Handles the submitted references form.
 *
 * Steps: authorize the user, normalise and validate the input, resolve or create
 * the project, store the additional queries, turn the textarea contents into url
 * lists, fetch and clean the sample page, and dispatch HandleDataJob.
 *
 * @param Request      $request      current request; its user is authorized against 'client-usage'
 * @param Reference    $reference    used to fetch and clean the sample page
 * @param Lemmatizator $lemmatizator used to "clean" the query texts via splitToWords()
 * @param Storekeeper  $storekeeper  used to initialize a new project when no project_id is given
 *
 * @return mixed the redirect built by redirect()->action(...): back to the form on a validation
 *               or sample-fetch failure, otherwise to the user page
 */
public function handle_referencies(Request $request, Reference $reference, Lemmatizator $lemmatizator, Storekeeper $storekeeper)
{
    $this->authorize('client-usage', $request->user());
    $raw_data = $request->all();

    // Normalise the input before validation.
    if (isset($raw_data['ref'])) {
        $raw_data['ref'] = $this->removeSecondScreenDuplicates($raw_data['ref']);
    }
    if (!empty($raw_data['ref'])) {
        $raw_data['ref'] = $this->removeQueriesWithNoRefs($raw_data['ref']);
    }
    if (isset($raw_data['sample_url'])) {
        $raw_data['sample_url'] = trim($raw_data['sample_url']);
    }

    // Validates that there is at least one non-empty query in the set,
    // that the sample url is valid, and that project_id or a new project name is present.
    $validator = Validator::make($raw_data, [
        'sample_url' => 'required_without:sample_html|url',
        'sample_html' => 'required_without:sample_url|mimes:html,htm,htmls,odt,oth,ott,doc,dot,docx,dotx',
        'ref' => 'required',
        'query' => 'required_without:project_id',
        'project_name' => 'required_without:project_id|max:20|alpha_num_dash_lat',
        'project_id' => 'required_without:project_name',
    ]);

    // Only one of "remote url" / "local file" may be given for the sample.
    $validator->after(function ($validator) {
        if (Input::get('sample_url') != '' && Input::hasFile('sample_html')) {
            $validator->errors()->add('sample_url', 'Choose smth one between remote url or local file');
        }
    });

    // On failure rebuild the form content from ref_dirty, so the fields are refilled
    // with the data the user has already edited.
    if ($validator->fails()) {
        $id = isset($raw_data['project_id']) ? $raw_data['project_id'] : null;

        $ref_dirty = Input::get('ref_dirty');
        if (!is_array($ref_dirty)) {
            // Nothing usable was submitted: redirect with an empty prefill instead of failing in foreach.
            $ref_dirty = [];
        }

        $inputed_responce = [];
        foreach ($ref_dirty as $query => $dirty_refs) {
            if (!$dirty_refs) {
                // One empty entry discards the whole prefill.
                $inputed_responce = [];
                break;
            }
            // Key '0' is mapped back to an empty query text.
            $query = $query == '0' ? "" : $query;
            $inputed_responce[$query] = $dirty_refs;
        }

        return redirect()->action('WebsiteController@referencies_handler', [$id])->withErrors($validator)->withInput(Input::except('ref', 'query', 'ref_dirty'))->with('inputed_responce', $inputed_responce);
    }

    $last = isset($raw_data['last']) ? $raw_data['last'] : null;

    // Get the project_id from the input or create a new project.
    if (isset($raw_data['project_id'])) {
        $project_id = strip_tags($raw_data['project_id']);
    } else {
        // Per the original note, initializeStoring also creates the empty directories
        // that query handling needs.
        $new_project = $storekeeper->initializeStoring($raw_data['project_name'], $raw_data['query']);
        $project_id = $new_project->id;
    }

    // Set the "exact" flag on the project if it is checked.
    if (isset($raw_data['exact'])) {
        $this_project = Project::find($project_id);
        $this_project->exact = true;
        $this_project->save();
    }

    // Split the additional queries; tags are stripped from the text that is actually split.
    if (isset($raw_data['dop_queries'])) {
        $raw_data['dop_queries'] = strip_tags($raw_data['dop_queries']);
        $dop_queries_array = preg_split("/[,;\\.]*[\t\n\f\r]+\\s*/", trim($raw_data['dop_queries']), -1, PREG_SPLIT_NO_EMPTY);
    } else {
        $dop_queries_array = [];
    }

    // Drop additional queries that duplicate a main query. Keys are compared as strings,
    // strictly, so integer keys such as 0 cannot match unrelated text through type juggling.
    $main_queries = array_map('strval', array_keys($raw_data['ref']));
    foreach ($dop_queries_array as $num => $dop_query) {
        $cleaned_dop_query = (string) $this->clean_query($dop_query);
        if (in_array($cleaned_dop_query, $main_queries, true)) {
            unset($dop_queries_array[$num]);
        }
    }

    // Store the remaining additional queries in a file in the project directory.
    if (!empty($dop_queries_array)) {
        foreach ($dop_queries_array as $key => $raw_query) {
            $dop_queries_array[$key] = trim(array_keys($lemmatizator->splitToWords($raw_query))[0]);
        }
        $dop_queries = implode("\n", $dop_queries_array);
        $project_dir = Project::where('id', $project_id)->first()->dir;
        file_put_contents(storage_path("tmp/" . $project_dir . "/reports/dop_queries.txt"), $dop_queries);
    }

    // Build an array of reference urls from the textarea text:
    // the text is split on any white-space and on commas.
    $data = [];
    foreach ($raw_data['ref'] as $ref => $urls_text) {
        // splitToWords is used HERE only to "clean" the query (per the original note:
        // e/yo replacement, stop-word removal and so on). It returns an array whose keys
        // are the query texts and whose values are arrays of their words. Using it keeps
        // the "cleaning" identical at every processing stage.
        $query = array_keys($lemmatizator->splitToWords($ref))[0];

        // Re-key the entry in $last to the cleaned query text.
        if ($query != $ref && !is_null($last)) {
            $last[$query] = $last[$ref];
            unset($last[$ref]);
        }

        $data[$query] = preg_split("/[\\s,]+/", $urls_text);
    }

    // Remove empty strings, if any, keeping the original keys of the remaining urls.
    foreach ($data as $query => $urls_array) {
        foreach ($urls_array as $key => $url) {
            if ($url == "") {
                unset($data[$query][$key]);
            }
        }
    }

    // sample_url may be absent when a file was uploaded, so test it with !empty().
    $has_sample_url = !empty($raw_data['sample_url']);
    if ($has_sample_url || Input::hasFile('sample_html')) {
        if ($has_sample_url) {
            $src_type = 'http';
            // Use the trimmed value, i.e. the same one that passed validation.
            $sample_url = $raw_data['sample_url'];
        } else {
            $src_type = 'local';
            $sample_url = Input::file('sample_html')->getRealPath();
        }

        // Get the sample html from the file or from the web.
        try {
            $sample_content = $reference->getHtml($sample_url, 'sample', 0, $project_id, $src_type);
            $sample_cleaned_content = $reference->removeGarbage($sample_content, $project_id, "sample_0.html");
        } catch (\Exception $e) {
            $mes = $e->getMessage();

            $inputed_responce = [];
            foreach (Input::get('ref') as $query => $refs) {
                $query = $query == '0' ? "" : $query;
                $inputed_responce[$query] = $refs;
            }

            return redirect()->action('WebsiteController@referencies_handler', [$project_id])->with('alert', "Cannot fetch html from the sample url: " . $sample_url . "\n" . $mes)->withInput(Input::except('ref', 'query'))->with('inputed_responce', $inputed_responce);
        }
    } else {
        $sample_cleaned_content = '';
    }

    $this->dispatch(new HandleDataJob($sample_cleaned_content, $data, $project_id, $last));

    return redirect()->action('UserController@show_user', [$request->user()->id])->with('info', "Your Project is in process. Gather report when it'll appear in reports column.");
}