Beispiel #1
0
 function combine_categorical($weight_label = null, $with_confidence = false, $add_confidence = false, $add_distribution = false, $add_count = False)
 {
     /*
        Returns the prediction obtained by combining the votes stored in
        $this->predictions, each vote weighted according to weight_label:
           null:          plurality (1 vote per prediction)
           'confidence':  confidence weighted (confidence as a vote value)
           'probability': probability weighted (probability as a vote value)
        Any non-null weight_label must be one of the values of
        $this->COMBINATION_WEIGHTS.

        Return value:
           - with_confidence true: array(prediction, combined confidence).
           - otherwise, if any add_* flag is true: an associative array with
             "prediction" plus the requested "confidence",
             "distribution"/"distribution_unit" and "count" entries.
           - otherwise: the winning category alone.

        Throws Exception when weight_label is not a known weight or when a
        vote lacks the attribute named by weight_label.
     */
     $use_weights = $weight_label != null;
     # The label does not change between votes, so validate it once up front.
     if ($use_weights && !in_array($weight_label, array_values($this->COMBINATION_WEIGHTS))) {
         throw new Exception("Wrong weight_label value.");
     }

     $mode = array();
     $instances = 0;
     foreach ($this->predictions as $prediction) {
         $weight = 1;
         if ($use_weights) {
             # Votes are objects (read with ->); array_key_exists() on objects
             # is deprecated in PHP 7.4 and a TypeError in PHP 8.
             if (!property_exists($prediction, $weight_label)) {
                 throw new Exception("Not enough data to use the selected prediction method. Try creating your model anew");
             }
             $weight = $prediction->{$weight_label};
         }
         if ($add_count) {
             $instances += $prediction->count;
         }
         $category = strval($prediction->prediction);
         if (array_key_exists($category, $mode)) {
             # Keep the "order" of the first vote seen for this category.
             $mode[$category]["count"] += $weight;
         } else {
             $mode[$category] = array("count" => $weight, "order" => $prediction->order);
         }
     }

     # After sorting, the first key is the winning category.
     uasort($mode, array($this, "sort_mode_items"));
     reset($mode);
     $prediction = key($mode);

     $combined_confidence = null;
     if ($with_confidence or $add_confidence) {
         if (property_exists($this->predictions[0], 'confidence')) {
             $weighted = $this->weighted_confidence($prediction, $weight_label);
             if ($with_confidence) {
                 # Unchanged behaviour: hand back weighted_confidence()'s result as is.
                 return $weighted;
             }
             # NOTE(review): assumes weighted_confidence() returns
             # array(prediction, confidence), matching the with_confidence
             # return shape used below -- confirm against its definition.
             $combined_confidence = $weighted[1];
         } else {
             $combined_distribution = $this->combine_distribution();
             $distribution = $combined_distribution[0];
             $count = $combined_distribution[1];
             $combined_confidence = ws_confidence($prediction, $distribution, 1.96, $count);
         }
     }
     if ($with_confidence) {
         return array($prediction, $combined_confidence);
     }
     if ($add_confidence or $add_distribution or $add_count) {
         $output = array("prediction" => $prediction);
         if ($add_confidence) {
             $output["confidence"] = $combined_confidence;
         }
         if ($add_distribution) {
             $grouped_dis = $this->grouped_distribution();
             $output["distribution"] = $grouped_dis["distribution"];
             $output["distribution_unit"] = $grouped_dis["distribution_unit"];
         }
         if ($add_count) {
             $output["count"] = $instances;
         }
         # Previously this array was built and then discarded.
         return $output;
     }
     return $prediction;
 }
Beispiel #2
0
 public function predict($input_data, $by_name = true, $print_path = false, $out = STDOUT, $with_confidence = false, $missing_strategy = Tree::LAST_PREDICTION, $add_confidence = false, $add_path = false, $add_distribution = false, $add_count = false, $add_median = false, $add_next = false, $add_min = false, $add_max = false, $multiple = null)
 {
     /*
        Makes a prediction based on a number of field values.
        By default the input fields must be keyed by field name but you can
        use `by_name` to input them directly keyed by id.

        Return value:
           - multiple set and the tree is not a regression: a list of
             arrays with 'prediction', 'confidence', 'probability' and
             'count' keys, one per category taken from the predicted
             node's distribution ('all' for every category, or an integer
             for the first N).
           - otherwise, if any add_* flag is true: an object holding
             `prediction` plus the requested attributes (median, min and
             max are only added for regression trees).
           - otherwise: the object returned by the tree's predict(), whether
             or not with_confidence is set.

        When print_path is true the rule path and the output are written to
        $out. The stream is flushed but left open: it belongs to the caller
        and defaults to STDOUT.

        Throws Exception when the PROPORTIONAL strategy is requested for a
        regression tree and $this->regression_ready is false.
     */
     # Checks if this is a regression model, using PROPORTIONAL
     # missing_strategy
     $tree = $this->tree;
     if ($tree != null && $tree->regression && $missing_strategy == Tree::PROPORTIONAL && !$this->regression_ready) {
         throw new Exception("You needed to use proportional missing strategy, \n                         for regressions. Please install them before, using local predictions for the model.");
     }
     # Checks and cleans input_data leaving the fields used in the model
     $input_data = $this->filter_input_data($input_data, $by_name);
     # Strips affixes for numeric values and casts to the final field type
     $input_data = cast($input_data, $this->fields);
     $prediction = $tree->predict($input_data, null, $missing_strategy);
     # Prediction path
     if ($print_path == true) {
         fwrite($out, join(" AND ", $prediction->path) . ' => ' . $prediction->output . "\n");
         # Do not fclose() here: that would close the caller's handle
         # (STDOUT by default) and break any later output.
         fflush($out);
     }

     if ($multiple != null && !$tree->regression) {
         $output = array();
         $total_instances = floatval($prediction->count);
         $take_all = is_string($multiple) && $multiple == 'all';
         foreach ($prediction->distribution as $index => $data) {
             $category = $data[0];
             $instances = $data[1];
             if ($take_all || (is_int($multiple) && $index < $multiple)) {
                 $prediction_dict = array('prediction' => $category, 'confidence' => ws_confidence($category, $prediction->distribution), 'probability' => $instances / $total_instances, 'count' => $instances);
                 array_push($output, $prediction_dict);
             }
         }
         return $output;
     }

     if (!($add_confidence || $add_path || $add_distribution || $add_count || $add_median || $add_next || $add_min || $add_max)) {
         return $prediction;
     }

     $output = (object) array('prediction' => $prediction->output);
     if ($add_confidence) {
         $output->confidence = $prediction->confidence;
     }
     if ($add_path) {
         $output->path = $prediction->path;
     }
     if ($add_distribution) {
         $output->distribution = $prediction->distribution;
         $output->distribution_unit = $prediction->distribution_unit;
     }
     if ($add_count) {
         $output->count = $prediction->count;
     }
     if ($tree->regression && $add_median) {
         $output->median = $prediction->median;
     }
     if ($add_next) {
         # children may be null or empty for a leaf; count(null) is a
         # TypeError in PHP 8, so test for null first.
         $children = $prediction->children;
         $field = ($children != null && count($children) > 0) ? $children[0]->predicate->field : null;
         # $this->fields is read as an object below, so use
         # property_exists(); array_key_exists() rejects objects in PHP 8.
         if ($field != null && property_exists($this->fields, $field)) {
             $field = $this->fields->{$field}->name;
         }
         $output->next = $field;
     }
     if ($tree->regression && $add_min) {
         $output->min = $prediction->min;
     }
     if ($tree->regression && $add_max) {
         $output->max = $prediction->max;
     }
     return $output;
 }
Beispiel #3
0
 public function predict($input_data, $path = null, $missing_strategy = Tree::LAST_PREDICTION)
 {
     /*
        Makes a prediction based on a number of field values.
        The input fields must be keyed by Id. There are two possible
        strategies to predict when the value for the splitting field
        is missing:
        0 - LAST_PREDICTION: the last issued prediction is returned.
        1 - PROPORTIONAL: as we cannot choose between the two branches
           in the tree that stem from this split, we consider both. The
           algorithm goes on until the final leaves are reached and
           all their predictions are used to decide the final prediction.

        $path accumulates the rules followed so far (null starts an empty
        path). Returns a Prediction object.

        For categorical trees under PROPORTIONAL the returned distribution
        is ordered by instance count (descending, ties broken by category)
        and the prediction is its first entry, i.e. the most frequent
        category.
     */
     if ($path == null) {
         $path = array();
     }

     if ($missing_strategy == Tree::PROPORTIONAL) {
         $predict_pro = $this->predict_proportional($input_data, $path);
         $final_distribution = $predict_pro[0];
         $d_min = $predict_pro[1];
         $d_max = $predict_pro[2];
         $last_node = $predict_pro[3];

         if ($this->regression) {
             // singular case
             // when the prediction is the one given in a 1-instance node
             if (count($final_distribution) == 1) {
                 $instances = reset($final_distribution);
                 if ($instances == 1) {
                     return new Prediction($last_node->output, $path, $last_node->confidence, $last_node->distribution, $instances, $last_node->distribution_unit, $last_node->median, $last_node->children, $last_node->min, $last_node->max);
                 }
             }

             # Turn the value => instances map into sorted (value, instances) pairs.
             ksort($final_distribution);
             $distribution = array();
             foreach ($final_distribution as $key => $val) {
                 array_push($distribution, array(floatval($key), $val));
             }
             # The unit is decided before merging, from the unmerged size.
             $distribution_unit = 'counts';
             if (count($distribution) > Tree::BINS_LIMIT) {
                 $distribution_unit = 'bins';
             }
             $distribution = merge_bins($distribution, Tree::BINS_LIMIT);
             $prediction = mean($distribution);
             $total_instances = 0;
             foreach ($distribution as $bin) {
                 $total_instances += $bin[1];
             }
             $confidence = regression_error(unbiased_sample_variance($distribution, $prediction), $total_instances);
             return new Prediction($prediction, $path, $confidence, $distribution, $total_instances, $distribution_unit, dist_median($distribution, $total_instances), $last_node->children, $d_min, $d_max);
         }

         ksort($final_distribution);
         $distribution = array();
         foreach ($final_distribution as $key => $val) {
             array_push($distribution, array($key, $val));
         }
         # Sorting by category name alone made the alphabetically first
         # category the prediction. Order by instances (descending) so the
         # first entry is the most frequent category; ties fall back to the
         # category name to keep the result deterministic.
         usort($distribution, function ($a, $b) {
             if ($a[1] != $b[1]) {
                 return $a[1] > $b[1] ? -1 : 1;
             }
             return strcmp(strval($a[0]), strval($b[0]));
         });
         # NOTE(review): 'categorial' looks like a typo for 'categorical';
         # kept byte-for-byte because callers may compare against it.
         return new Prediction($distribution[0][0], $path, ws_confidence($distribution[0][0], $final_distribution), $distribution, get_instances($distribution), 'categorial', null, $last_node->children, null, null);
     }

     # LAST_PREDICTION: follow the first child whose predicate matches the
     # input; if none matches (or this is a leaf) answer with this node.
     if ($this->children != null) {
         foreach ($this->children as $child) {
             if ($child->predicate->apply($input_data, $this->fields)) {
                 $new_rule = $child->predicate->to_rule($this->fields);
                 array_push($path, $new_rule);
                 return $child->predict($input_data, $path);
             }
         }
     }
     # median, min and max are only passed on when $this->regression is set.
     $is_regression = $this->regression != null;
     return new Prediction($this->output, $path, $this->confidence, $this->distribution, get_instances($this->distribution), $this->distribution_unit, $is_regression ? $this->median : null, $this->children, $is_regression ? $this->min : null, $is_regression ? $this->max : null);
 }
Beispiel #4
0
 public function _predict($input_data, $by_name = true, $print_path = false, $out = STDOUT, $with_confidence = false, $missing_strategy = Tree::LAST_PREDICTION, $add_confidence = false, $add_path = false, $add_distribution = false, $add_count = false, $add_median = false, $add_next = false, $add_min = false, $add_max = false, $add_unused_fields = false, $multiple = null)
 {
     /* Old method, kept for backwards compatibility.
        Makes a prediction based on a number of field values.
        By default the input fields must be keyed by field name but you can
        use `by_name` to input them directly keyed by id.

        input_data: Input data to be predicted
        by_name: Boolean, true if input_data is keyed by names
        print_path: Boolean, if true the rules that lead to the prediction
                    are written to `out`
        out: output handler; it is flushed but left open, since it belongs
             to the caller (STDOUT by default)
        with_confidence: Boolean, if true, the information in the node
                         (prediction, confidence, distribution, count and
                         median) is returned in a list format
        missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                          missing fields
        add_confidence: Boolean, if true adds confidence to the dict output
        add_path: Boolean, if true adds path to the dict output
        add_distribution: Boolean, if true adds distribution info to the
                          dict output
        add_count: Boolean, if true adds the number of instances in the
                   node to the dict output
        add_median: Boolean, if true adds the median of the values in
                    the distribution (for regressions only)
        add_next: Boolean, if true adds the field that determines next
                  split in the tree
        add_min: Boolean, if true adds the minimum value in the prediction's
                 distribution (for regressions only)
        add_max: Boolean, if true adds the maximum value in the prediction's
                 distribution (for regressions only)
        add_unused_fields: Boolean, if true adds the information about the
                           fields in the input_data that are not being used
                           in the model as predictors.
        multiple: For categorical fields, it will return the categories
                  in the distribution of the predicted node as a
                  list of arrays:
                    array(array('prediction' => 'Iris-setosa',
                      'confidence'=> 0.9154
                      'probability'=> 0.97
                      'count'=> 97),
                     array('prediction'=> 'Iris-virginica',
                      'confidence'=> 0.0103
                      'probability'=> 0.03,
                      'count'=> 3))
                  The value of this argument can either be an integer
                  (maximum number of categories to be returned), or the
                  literal 'all', that will cause the entire distribution
                  in the node to be returned.

        Any add_* flag takes precedence over with_confidence: when one is
        set (and `multiple` does not apply) the dict-style object is returned.

        Throws Exception when the PROPORTIONAL strategy is requested for a
        regression tree and $this->regression_ready is false.
     */
     # Checks if this is a regression model, using PROPORTIONAL
     # missing_strategy
     $tree = $this->tree;
     if ($tree != null && $tree->regression && $missing_strategy == Tree::PROPORTIONAL && !$this->regression_ready) {
         throw new Exception("You needed to use proportional missing strategy, \n                         for regressions. Please install them before, using local predictions for the model.");
     }
     # Checks and cleans input_data leaving the fields used in the model.
     # With add_unused_fields the filter returns array(data, unused_fields).
     $unused_fields = null;
     $new_data = $this->filter_input_data($input_data, $by_name, $add_unused_fields);
     if ($add_unused_fields) {
         $input_data = $new_data[0];
         $unused_fields = $new_data[1];
     } else {
         $input_data = $new_data;
     }
     # Strips affixes for numeric values and casts to the final field type
     $input_data = cast($input_data, $this->fields);
     $prediction = $tree->predict($input_data, null, $missing_strategy);
     # Prediction path
     if ($print_path == true) {
         fwrite($out, join(" AND ", $prediction->path) . ' => ' . $prediction->output . "\n");
         # Do not fclose() here: that would close the caller's handle
         # (STDOUT by default) and break any later output.
         fflush($out);
     }

     if ($multiple != null && !$tree->regression) {
         $output = array();
         $total_instances = floatval($prediction->count);
         $take_all = is_string($multiple) && $multiple == 'all';
         foreach ($prediction->distribution as $index => $data) {
             $category = $data[0];
             $instances = $data[1];
             if ($take_all || (is_int($multiple) && $index < $multiple)) {
                 $prediction_dict = array('prediction' => $category, 'confidence' => ws_confidence($category, $prediction->distribution), 'probability' => $instances / $total_instances, 'count' => $instances);
                 array_push($output, $prediction_dict);
             }
         }
         return $output;
     }

     if (!($add_confidence || $add_path || $add_distribution || $add_count || $add_median || $add_next || $add_min || $add_max || $add_unused_fields)) {
         if ($with_confidence == true) {
             return array($prediction->output, $prediction->confidence, $prediction->distribution, $prediction->count, $prediction->median);
         }
         return $prediction;
     }

     $output = (object) array('prediction' => $prediction->output);
     if ($add_confidence) {
         $output->confidence = $prediction->confidence;
     }
     if ($add_path) {
         $output->path = $prediction->path;
     }
     if ($add_distribution) {
         $output->distribution = $prediction->distribution;
         $output->distribution_unit = $prediction->distribution_unit;
     }
     if ($add_count) {
         $output->count = $prediction->count;
     }
     if ($tree->regression && $add_median) {
         $output->median = $prediction->median;
     }
     if ($add_next) {
         # children may be null or empty for a leaf; count(null) is a
         # TypeError in PHP 8, so test for null first.
         $children = $prediction->children;
         $field = ($children != null && count($children) > 0) ? $children[0]->predicate->field : null;
         # $this->fields is read as an object below, so use
         # property_exists(); array_key_exists() rejects objects in PHP 8.
         if ($field != null && property_exists($this->fields, $field)) {
             $field = $this->fields->{$field}->name;
         }
         $output->next = $field;
     }
     if ($tree->regression && $add_min) {
         $output->min = $prediction->min;
     }
     if ($tree->regression && $add_max) {
         $output->max = $prediction->max;
     }
     if ($add_unused_fields) {
         $output->unused_fields = $unused_fields;
     }
     return $output;
 }