Esempio n. 1
0
 /**
  * Comparison callback that orders two strings by word count, most words first.
  *
  * Words are counted with str_word_count(); the character list returned by
  * Analyzer::getCyrChars(["*"]) is passed as the set of extra characters that
  * count as part of a word (presumably Cyrillic letters plus "*" - the helper
  * is defined elsewhere, confirm there).
  *
  * @param string $set_1 First string to compare.
  * @param string $set_2 Second string to compare.
  *
  * @return int -1 when $set_1 has more words, 1 when it has fewer, 0 on a tie.
  */
 public static function sortByWordsCount($set_1, $set_2)
 {
     $words_in_first = str_word_count($set_1, 0, Analyzer::getCyrChars(["*"]));
     $words_in_second = str_word_count($set_2, 0, Analyzer::getCyrChars(["*"]));

     // Descending order: the string with more words sorts first.
     if ($words_in_first > $words_in_second) {
         return -1;
     }
     if ($words_in_first < $words_in_second) {
         return 1;
     }

     return 0;
 }
Esempio n. 2
0
 /**
  * Counts every n-gramm, and the variants derived from it, per query and zone.
  *
  * For each n-gramm the following units are counted with
  * Counter::countUnitInFileSet(), in this order:
  *   1. the n-gramm as is;
  *   2. every variant returned by Analyzer::permutate();
  *   3. every variant returned by Analyzer::getWithStars();
  *   4. every variant returned by Analyzer::getWithStarsInstead().
  * Right after each unit, the same unit expanded by
  * Lemmatizator::getUnitWithForms() is counted and stored under the key
  * "<Lemmatizator::makeUnitNormal() result> - FORMS".
  *
  * A key is counted at most once per query/zone pair; later occurrences reuse
  * the stored value. Every key is also collected, in first-seen order, into the
  * unit list handed to combineCommonUnits().
  *
  * @param array $zones        Zones to process; each value is used as a key into $files_arrays and $paths.
  * @param array $n_gramms     Map of query => n-gramm lists indexed by size; sizes are walked from count() down to 1.
  * @param array $files_arrays File sets indexed as [$zone][$query], passed through to the counter.
  * @param array $paths        Per-zone value passed to the counter as its third argument.
  * @param mixed $urls_count   Passed through unchanged to combineCommonUnits().
  *
  * @return mixed The value returned by combineCommonUnits() for the collected counts.
  */
 public function count_entries($zones, $n_gramms, $files_arrays, $paths, $urls_count)
 {
     $results = [];
     $units_set = [];
     // Keyed mirror of $units_set. Gives an O(1) membership test that compares
     // keys exactly, so numeric-looking units such as "1e3" and "1000" are not
     // merged the way a loose in_array() would merge them.
     $seen_units = [];
     $counter = new Counter();
     $lemmatizator = new Lemmatizator();
     $analyzer = new Analyzer();

     // Appends a result key to $units_set the first time it is seen.
     $register_unit = function ($key) use (&$units_set, &$seen_units) {
         if (!isset($seen_units[$key])) {
             $seen_units[$key] = true;
             $units_set[] = $key;
         }
     };

     // Counts $unit for a query/zone pair unless a value is already stored under $key.
     $count_once = function ($query, $zone, $key, $unit) use (&$results, $counter, $files_arrays, $paths, $register_unit) {
         if (!isset($results[$query][$zone][$key])) {
             $results[$query][$zone][$key] = $counter->countUnitInFileSet(
                 $unit,
                 $files_arrays[$zone][$query],
                 $paths[$zone]
             );
         }
         $register_unit($key);
     };

     // Counts a unit as is, then the same unit expanded with its forms.
     $count_unit_and_forms = function ($query, $zone, $unit) use ($count_once, $lemmatizator) {
         $count_once($query, $zone, $unit, $unit);

         $unit_with_forms = $lemmatizator->getUnitWithForms($unit);
         $unit_normal = $lemmatizator->makeUnitNormal($unit);
         $count_once($query, $zone, $unit_normal . " - FORMS", $unit_with_forms);
     };

     foreach ($zones as $zone) {
         foreach ($n_gramms as $query => $n_gramms_array) {
             // Longest n-gramms first.
             for ($n = count($n_gramms_array); $n > 0; $n--) {
                 // Sizes are expected to be keyed 1..count; skip a missing size
                 // instead of iterating over null.
                 if (!isset($n_gramms_array[$n])) {
                     continue;
                 }

                 foreach ($n_gramms_array[$n] as $n_gramm) {
                     // n-gramm as is, direct order
                     $count_unit_and_forms($query, $zone, $n_gramm);

                     // all permutations of the n-gramm words
                     foreach ($analyzer->permutate(explode(" ", $n_gramm)) as $n_gramm_permutated) {
                         $count_unit_and_forms($query, $zone, $n_gramm_permutated);
                     }

                     // a star added between the words (length = n + 1 star), direct order
                     foreach ($analyzer->getWithStars(explode(" ", $n_gramm)) as $n_gramm_stared) {
                         $count_unit_and_forms($query, $zone, $n_gramm_stared);
                     }

                     // a star put instead of a word (length = n including the star), direct order
                     foreach ($analyzer->getWithStarsInstead(explode(" ", $n_gramm)) as $n_gramm_instead) {
                         $count_unit_and_forms($query, $zone, $n_gramm_instead);
                     }
                 }
             }
         }
     }

     return $this->combineCommonUnits($results, $units_set, $urls_count);
 }