示例#1
0
 /**
  * Runs the analysis pipeline for a project: fetches the reference pages, splits them
  * into zone text files, counts n-gram entries for the references and for the sample
  * page, and passes the results to Storekeeper::storeResultsAdmin().
  *
  * @param mixed      $sample_cleaned_content Not read inside this method; the sample is
  *                                           loaded from the project's tmp directory instead.
  * @param mixed      $data                   Passed through to getHtmlFiles().
  * @param object     $project                Must expose ->id, ->dir and ->queries (each query exposes ->text).
  * @param mixed|null $last                   Passed through to getHtmlFiles().
  *
  * @return void
  */
 public function handleData($sample_cleaned_content, $data, $project, $last = null)
 {
     $project_id = $project->id;
     $project_tmp = "tmp/" . $project->dir;
     $zones = $this->allZones();

     // Fetch the html files, already cleaned from garbage and converted to utf-8.
     $html_res = $this->getHtmlFiles($data, $project_id, $last);
     $html_files_data = $html_res[0];
     $urls_count = $html_res[1];

     // Extract the simple zones content into separate files (references first, then the sample).
     $files_for_parsing = $this->handle_html($html_files_data, $project);
     $content = new Content();
     // The return value is not needed here; the call is kept because the sample is read
     // from html_wo_simple/ right below.
     // NOTE(review): presumably getSimpleZones() is what writes that file — confirm.
     $content->getSimpleZones(storage_path($project_tmp . "/html_wo_garbage/sample_0.html"), $project_id);

     // Build the node lists for each reference file and for the sample.
     $nodes_arrays = $this->parse_files($files_for_parsing, $project);
     $parser = new Parser();
     $sample_body_array = $parser->getBodyNode(storage_path($project_tmp . "/html_wo_simple/sample_0.html"), $project);
     $sample_nodes_array = $parser->parseAsValya($sample_body_array);

     // Build the n-gramms array: query text => result of getNGramms() for its words.
     $analyzer = new Analyzer();
     $lemmatizator = new Lemmatizator();
     $queries_splited = [];
     foreach ($project->queries as $query) {
         $queries_splited = array_merge($queries_splited, $lemmatizator->splitToWords($query->text));
     }
     $n_gramms = [];
     foreach ($queries_splited as $query_text => $words_set) {
         $n_gramms[$query_text] = $analyzer->getNGramms($words_set);
     }

     // Put the text content of the nodes into the zone txt files and arrange the
     // file lists by zone/query.
     $files_arrays_arr = $this->makeZonesFilesArrays($nodes_arrays, $project);
     // File names.
     $files_arrays = $files_arrays_arr[0];
     // Word counts per file for each zone.
     $words = $files_arrays_arr[1];
     $paths = $this->getZonesPaths($zones, $project);

     // Create empty plain/fragment files for the sample, then fill them.
     file_put_contents($paths['fragment'] . "sample_0.txt", "");
     file_put_contents($paths['plain'] . "sample_0.txt", "");
     $this->separatePlainFromFragment($sample_nodes_array, $paths, "sample_0.txt");

     // Initialised up front so both arrays exist even when there are no zones.
     $words_sample = [];
     $sample_files_arrays = [];
     foreach ($zones as $zone) {
         $sample_path = $paths[$zone] . "sample_0.txt";
         $raw_content = file_get_contents($sample_path);
         // Count words before shortening; the charlist makes Cyrillic letters and '*' count as word characters.
         $words_sample[$zone] = str_word_count($raw_content, 0, "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя*");
         $short_content = $lemmatizator->makeTextShort($raw_content);
         file_put_contents($sample_path, $short_content);
         // Release the buffers before the next file is loaded (just in case).
         unset($raw_content);
         unset($short_content);
         $sample_files_arrays[$zone]['sample'][] = 'sample_0.txt';
     }

     // Count entries of each n-gramm in each file/zone.
     $res = $this->count_entries($zones, $n_gramms, $files_arrays, $paths, $urls_count);
     // Results combined over all queries.
     $combined_result = $res[0];
     // Bodies values.
     $bodies = $res[1];

     // N-gramms merged over all queries, used for the sample.
     $n_gramms_sample = $this->makeCommonUnitsSet($n_gramms);
     $res_sample = $this->count_entries($zones, $n_gramms_sample, $sample_files_arrays, $paths, 1);

     // For every unit found in the references take the sample's entry count, 0 if absent.
     $sample_combined_result = [];
     foreach ($combined_result as $zone => $units) {
         // Only the unit keys are needed; iterating keys avoids clobbering the $data parameter.
         foreach (array_keys($units) as $unit) {
             if (isset($res_sample[0][$zone][$unit])) {
                 $sample_combined_result[$zone][$unit] = $res_sample[0][$zone][$unit]['entries'][0];
             } else {
                 $sample_combined_result[$zone][$unit] = 0;
             }
         }
     }

     // Gather everything and hand it over for report generation.
     $storekeeper = new Storekeeper();
     $storekeeper->storeResultsAdmin($combined_result, $bodies, $sample_combined_result, $project, $words, $words_sample);
 }
 /**
  * Handles the submitted references form: validates the input, resolves (or creates)
  * the project, stores additional queries, builds the per-query url lists, fetches the
  * sample page and dispatches a HandleDataJob.
  *
  * @param Request      $request      Current request; its user is authorized for 'client-usage'.
  * @param Reference    $reference    Used to fetch the sample html and remove garbage from it.
  * @param Lemmatizator $lemmatizator Used via splitToWords() to normalise query texts.
  * @param Storekeeper  $storekeeper  Used to initialise storage for a new project.
  *
  * @return mixed Redirect back to the form on validation/fetch failure, otherwise a
  *               redirect to the user's page.
  */
 public function handle_referencies(Request $request, Reference $reference, Lemmatizator $lemmatizator, Storekeeper $storekeeper)
 {
     $this->authorize('client-usage', $request->user());
     $raw_data = $request->all();

     if (isset($raw_data['ref'])) {
         $raw_data['ref'] = $this->removeSecondScreenDuplicates($raw_data['ref']);
     }
     if (!empty($raw_data['ref'])) {
         $raw_data['ref'] = $this->removeQueriesWithNoRefs($raw_data['ref']);
     }
     if (isset($raw_data['sample_url'])) {
         $raw_data['sample_url'] = trim($raw_data['sample_url']);
     }

     // validates if there is any notEmpty queries in the set
     // validates if the sample url is valid
     // validates if project_id or new project name is present
     $validator = Validator::make($raw_data, [
         'sample_url' => 'required_without:sample_html|url',
         'sample_html' => 'required_without:sample_url|mimes:html,htm,htmls,odt,oth,ott,doc,dot,docx,dotx',
         'ref' => 'required',
         'query' => 'required_without:project_id',
         'project_name' => 'required_without:project_id|max:20|alpha_num_dash_lat',
         'project_id' => 'required_without:project_name',
     ]);
     // Only one of remote url / local file may be given for the sample.
     $validator->after(function ($validator) {
         if (Input::get('sample_url') != '' && Input::hasFile('sample_html')) {
             $validator->errors()->add('sample_url', 'Choose smth one between remote url or local file');
         }
     });

     // On failure re-fill the form with the data the user already edited (ref_dirty).
     if ($validator->fails()) {
         $id = isset($raw_data['project_id']) ? $raw_data['project_id'] : null;
         $inputed_responce = [];
         $dirty_input = Input::get('ref_dirty');
         // ref_dirty may be missing from the request; iterate only a real array.
         if (is_array($dirty_input)) {
             foreach ($dirty_input as $query => $dirty_refs) {
                 if (!$dirty_refs) {
                     // One empty entry discards the whole pre-filled set.
                     $inputed_responce = [];
                     break;
                 }
                 $query = $query == '0' ? "" : $query;
                 $inputed_responce[$query] = $dirty_refs;
             }
         }
         return redirect()->action('WebsiteController@referencies_handler', [$id])->withErrors($validator)->withInput(Input::except('ref', 'query', 'ref_dirty'))->with('inputed_responce', $inputed_responce);
     }

     $last = isset($raw_data['last']) ? $raw_data['last'] : null;

     // Get the project_id from the input or initialise a new project.
     if (isset($raw_data['project_id'])) {
         $project_id = strip_tags($raw_data['project_id']);
     } else {
         // Creates the project and the empty dirs needed for query handling.
         $new_project = $storekeeper->initializeStoring($raw_data['project_name'], $raw_data['query']);
         $project_id = $new_project->id;
     }

     // Set the exact flag on the project if it is checked.
     if (isset($raw_data['exact'])) {
         $this_project = Project::find($project_id);
         $this_project->exact = true;
         $this_project->save();
     }

     // Split the additional queries text into separate queries.
     $dop_queries_array = [];
     if (isset($raw_data['dop_queries'])) {
         // The stripped text is what gets split (the original discarded the strip_tags() result).
         $dop_queries_text = strip_tags($raw_data['dop_queries']);
         $dop_queries_array = preg_split("/[,;\\.]*[\t\n\f\r]+\\s*/", trim($dop_queries_text), -1, PREG_SPLIT_NO_EMPTY);
     }

     // Drop additional queries that duplicate the main ones.
     if (!empty($dop_queries_array)) {
         $main_queries = array_keys($raw_data['ref']);
         foreach ($dop_queries_array as $num => $dop_query) {
             if (in_array($this->clean_query($dop_query), $main_queries)) {
                 unset($dop_queries_array[$num]);
             }
         }
     }

     // Store the remaining additional queries to a file in the project dir.
     if (!empty($dop_queries_array)) {
         foreach ($dop_queries_array as $key => $dop_query_raw) {
             $dop_queries_array[$key] = trim(array_keys($lemmatizator->splitToWords($dop_query_raw))[0]);
         }
         $dop_queries = implode("\n", $dop_queries_array);
         $project_dir = Project::where('id', $project_id)->first()->dir;
         file_put_contents(storage_path("tmp/" . $project_dir . "/reports/dop_queries.txt"), $dop_queries);
     }

     // Build an array of urls per query from the textarea text;
     // the text is split on white-space and commas.
     $data = [];
     foreach ($raw_data['ref'] as $ref => $urls_text) {
         // splitToWords() returns an array keyed by the query text; only that key is
         // used here, so the query is normalised the same way as at the other stages.
         $query = array_keys($lemmatizator->splitToWords($ref))[0];
         // Re-key the matching entry of $last to the normalised query, if there is one.
         if ($query != $ref && is_array($last) && array_key_exists($ref, $last)) {
             $last[$query] = $last[$ref];
             unset($last[$ref]);
         }
         // Keep the original keys, drop empty strings.
         $data[$query] = array_filter(preg_split("/[\\s,]+/", $urls_text), function ($url) {
             return $url != "";
         });
     }

     $has_sample_url = !empty($raw_data['sample_url']);
     if ($has_sample_url || Input::hasFile('sample_html')) {
         if ($has_sample_url) {
             $src_type = 'http';
             // Use the trimmed value, the same one that passed validation.
             $sample_url = $raw_data['sample_url'];
         } else {
             $src_type = 'local';
             $sample_url = Input::file('sample_html')->getRealPath();
         }
         // Get the html of the sample from the file or from www.
         try {
             $sample_content = $reference->getHtml($sample_url, 'sample', 0, $project_id, $src_type);
             $sample_cleaned_content = $reference->removeGarbage($sample_content, $project_id, "sample_0.html");
         } catch (\Exception $e) {
             $mes = $e->getMessage();
             $inputed_responce = [];
             $submitted_refs = Input::get('ref');
             if (is_array($submitted_refs)) {
                 foreach ($submitted_refs as $query => $refs) {
                     $query = $query == '0' ? "" : $query;
                     $inputed_responce[$query] = $refs;
                 }
             }
             return redirect()->action('WebsiteController@referencies_handler', [$project_id])->with('alert', "Cannot fetch html from the sample url: " . $sample_url . "\n" . $mes)->withInput(Input::except('ref', 'query'))->with('inputed_responce', $inputed_responce);
         }
     } else {
         $sample_cleaned_content = '';
     }

     $this->dispatch(new HandleDataJob($sample_cleaned_content, $data, $project_id, $last));
     return redirect()->action('UserController@show_user', [$request->user()->id])->with('info', "Your Project is in process. Gather report when it'll appear in reports column.");
 }