function combine_categorical($weight_label = null, $with_confidence = false, $add_confidence = false, $add_distribution = false, $add_count = false) {
    /*
       Returns the prediction combining votes by using the given weight.

       weight_label can be set as:
          null:          plurality (1 vote per prediction)
          'confidence':  confidence weighted (confidence as a vote value)
          'probability': probability weighted (probability as a vote value)

       Return value:
          - with_confidence: array(prediction, combined_confidence)
          - any of add_confidence / add_distribution / add_count:
            associative array with a "prediction" key plus the requested
            "confidence", "distribution" + "distribution_unit" and "count"
            entries
          - otherwise: the winning category alone

       Throws an Exception when weight_label is not one of the values in
       $this->COMBINATION_WEIGHTS, or when a prediction lacks the attribute
       named by weight_label.
    */

    // The label does not change between votes, so validate it only once.
    if ($weight_label != null && !in_array($weight_label, array_values($this->COMBINATION_WEIGHTS))) {
        throw new Exception("Wrong weight_label value.");
    }

    $mode = array();
    $weight = 1; // plurality: every prediction counts as one vote
    $instances = 0;

    foreach ($this->predictions as $prediction) {
        if ($weight_label != null) {
            // Predictions are objects: array_key_exists() on an object is
            // deprecated in PHP 7.4 and a TypeError in PHP 8.
            if (!property_exists($prediction, $weight_label)) {
                throw new Exception("Not enough data to use the selected prediction method. Try creating your model anew");
            }
            $weight = $prediction->{$weight_label};
        }

        if ($add_count) {
            $instances += $prediction->count;
        }

        $category = strval($prediction->prediction);
        if (array_key_exists($category, $mode)) {
            // Keep the order recorded for the first vote of this category.
            $mode[$category]["count"] += $weight;
        } else {
            $mode[$category] = array("count" => $weight, "order" => $prediction->order);
        }
    }

    // The winner is whichever category sort_mode_items() ranks first.
    uasort($mode, array($this, "sort_mode_items"));
    reset($mode);
    $prediction = key($mode);

    $combined_confidence = null;
    if ($with_confidence or $add_confidence) {
        if (property_exists($this->predictions[0], 'confidence')) {
            $weighted = $this->weighted_confidence($prediction, $weight_label);
            if ($with_confidence) {
                return $weighted;
            }
            // NOTE(review): assumes weighted_confidence() returns
            // array(prediction, confidence), the same shape returned below
            // for with_confidence -- confirm against its definition.
            $prediction = $weighted[0];
            $combined_confidence = $weighted[1];
        } else {
            // The votes carry no confidence: derive it from the combined
            // distribution instead.
            $combined_distribution = $this->combine_distribution();
            $distribution = $combined_distribution[0];
            $count = $combined_distribution[1];
            $combined_confidence = ws_confidence($prediction, $distribution, 1.96, $count);
        }
    }

    if ($with_confidence) {
        return array($prediction, $combined_confidence);
    }

    if ($add_confidence or $add_distribution or $add_count) {
        $output = array("prediction" => $prediction);
        if ($add_confidence) {
            $output["confidence"] = $combined_confidence;
        }
        if ($add_distribution) {
            $grouped_dis = $this->grouped_distribution();
            $output["distribution"] = $grouped_dis["distribution"];
            $output["distribution_unit"] = $grouped_dis["distribution_unit"];
        }
        if ($add_count) {
            $output["count"] = $instances;
        }
        // Previously this array was built and then thrown away.
        return $output;
    }

    return $prediction;
}
public function predict($input_data, $by_name = true, $print_path = false, $out = STDOUT, $with_confidence = false, $missing_strategy = Tree::LAST_PREDICTION, $add_confidence = false, $add_path = false, $add_distribution = false, $add_count = false, $add_median = false, $add_next = false, $add_min = false, $add_max = false, $multiple = null) {
    /*
       Makes a prediction based on a number of field values.

       By default the input fields must be keyed by field name but you can
       use `by_name` to input them directly keyed by id.

       print_path:      if true, the rules leading to the prediction are
                        written to `out`
       out:             stream used for print_path; it is flushed but never
                        closed here
       with_confidence: accepted for signature compatibility; it does not
                        change the result of this method
       multiple:        for non-regression trees, an integer (maximum number
                        of categories) or the literal 'all'; the result is
                        then a list of arrays with 'prediction',
                        'confidence', 'probability' and 'count' keys
       add_*:           when any is set (and `multiple` does not apply) the
                        result is an object holding `prediction` plus the
                        requested attributes; median, min and max are only
                        added for regression trees

       With none of the above the object returned by the tree's predict()
       is handed back unchanged.

       Throws an Exception for regression trees when the PROPORTIONAL
       missing strategy is requested and $this->regression_ready is false.
    */

    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    $tree = $this->tree;
    if ($tree != null && $tree->regression && $missing_strategy == Tree::PROPORTIONAL && !$this->regression_ready) {
        throw new Exception("You needed to use proportional missing strategy, \n for regressions. Please install them before, using local predictions for the model.");
    }

    # Checks and cleans input_data leaving the fields used in the model
    $input_data = $this->filter_input_data($input_data, $by_name);

    # Strips affixes for numeric values and casts to the final field type
    $input_data = cast($input_data, $this->fields);

    $prediction = $tree->predict($input_data, null, $missing_strategy);

    # Prediction path
    if ($print_path == true) {
        fwrite($out, join(" AND ", $prediction->path) . ' => ' . $prediction->output . "\n");
        // The stream belongs to the caller (STDOUT by default): closing it
        // here would make every later write to it fail, so only flush.
        fflush($out);
    }

    $output = $prediction;

    if ($multiple != null && !$tree->regression) {
        $output = array();
        $total_instances = floatval($prediction->count);
        foreach ($prediction->distribution as $index => $data) {
            $category = $data[0];
            $instances = $data[1];
            if ((is_string($multiple) && $multiple == 'all') || (is_int($multiple) && $index < $multiple)) {
                array_push($output, array(
                    'prediction' => $category,
                    'confidence' => ws_confidence($category, $prediction->distribution),
                    'probability' => $instances / $total_instances,
                    'count' => $instances
                ));
            }
        }
    } elseif ($add_confidence || $add_path || $add_distribution || $add_count || $add_median || $add_next || $add_min || $add_max) {
        $output = (object) array('prediction' => $prediction->output);
        if ($add_confidence) {
            $output->confidence = $prediction->confidence;
        }
        if ($add_path) {
            $output->path = $prediction->path;
        }
        if ($add_distribution) {
            $output->distribution = $prediction->distribution;
            $output->distribution_unit = $prediction->distribution_unit;
        }
        if ($add_count) {
            $output->count = $prediction->count;
        }
        if ($tree->regression && $add_median) {
            $output->median = $prediction->median;
        }
        if ($add_next) {
            // children may be null for a leaf; count(null) is a TypeError
            // in PHP 8, so test for emptiness instead.
            $field = empty($prediction->children) ? null : $prediction->children[0]->predicate->field;
            // $this->fields is an object: array_key_exists() no longer
            // accepts objects in PHP 8.
            if ($field != null && isset($this->fields->{$field})) {
                $field = $this->fields->{$field}->name;
            }
            $output->next = $field;
        }
        if ($tree->regression && $add_min) {
            $output->min = $prediction->min;
        }
        if ($tree->regression && $add_max) {
            $output->max = $prediction->max;
        }
    }

    return $output;
}
public function predict($input_data, $path = null, $missing_strategy = Tree::LAST_PREDICTION) {
    /*
       Makes a prediction based on a number of field values.

       The input fields must be keyed by Id. There are two possible
       strategies to predict when the value for the splitting field
       is missing:

          0 - LAST_PREDICTION: the last issued prediction is returned.
          1 - PROPORTIONAL: as we cannot choose between the two branches
              in the tree that stem from this split, we consider both.
              The algorithm goes on until the final leaves are reached
              and all their predictions are used to decide the final
              prediction.

       Returns a Prediction object. `path` accumulates the rules of the
       predicates matched while descending the tree.
    */
    if ($path == null) {
        $path = array();
    }

    if ($missing_strategy != Tree::PROPORTIONAL) {
        if ($this->children != null) {
            // Follow the first child whose predicate accepts the input.
            foreach ($this->children as $child) {
                if ($child->predicate->apply($input_data, $this->fields)) {
                    $new_rule = $child->predicate->to_rule($this->fields);
                    array_push($path, $new_rule);
                    return $child->predict($input_data, $path);
                }
            }
        }
        // No child matched (or this is a leaf): this node is the answer.
        return new Prediction(
            $this->output,
            $path,
            $this->confidence,
            $this->distribution,
            get_instances($this->distribution),
            $this->distribution_unit,
            $this->regression == null ? null : $this->median,
            $this->children,
            $this->regression == null ? null : $this->min,
            $this->regression == null ? null : $this->max
        );
    }

    $predict_pro = $this->predict_proportional($input_data, $path);
    $final_distribution = $predict_pro[0];
    $d_min = $predict_pro[1];
    $d_max = $predict_pro[2];
    $last_node = $predict_pro[3];

    if ($this->regression) {
        // singular case
        // when the prediction is the one given in a 1-instance node
        if (count($final_distribution) == 1) {
            $instances = reset($final_distribution);
            if ($instances == 1) {
                return new Prediction(
                    $last_node->output,
                    $path,
                    $last_node->confidence,
                    $last_node->distribution,
                    $instances,
                    $last_node->distribution_unit,
                    $last_node->median,
                    $last_node->children,
                    $last_node->min,
                    $last_node->max
                );
            }
        }

        ksort($final_distribution);
        $distribution = array();
        foreach ($final_distribution as $key => $val) {
            array_push($distribution, array(floatval($key), $val));
        }

        // The unit is decided on the size before merging.
        $distribution_unit = 'counts';
        if (count($distribution) > Tree::BINS_LIMIT) {
            $distribution_unit = 'bins';
        }
        $distribution = merge_bins($distribution, Tree::BINS_LIMIT);

        $prediction = mean($distribution);
        $total_instances = 0;
        foreach ($distribution as $bin) {
            $total_instances += $bin[1];
        }
        $confidence = regression_error(unbiased_sample_variance($distribution, $prediction), $total_instances);

        return new Prediction(
            $prediction,
            $path,
            $confidence,
            $distribution,
            $total_instances,
            $distribution_unit,
            dist_median($distribution, $total_instances),
            $last_node->children,
            $d_min,
            $d_max
        );
    }

    // Categorical case: the prediction is the category with the most
    // instances. Sorting by key alone (as before) returned the
    // alphabetically first category regardless of its count.
    ksort($final_distribution);
    $distribution = array();
    foreach ($final_distribution as $key => $val) {
        array_push($distribution, array($key, $val));
    }
    usort($distribution, function ($left, $right) {
        if ($left[1] != $right[1]) {
            // larger counts first
            return ($left[1] < $right[1]) ? 1 : -1;
        }
        // ties are resolved by category name to stay deterministic
        return strcmp(strval($left[0]), strval($right[0]));
    });

    return new Prediction(
        $distribution[0][0],
        $path,
        ws_confidence($distribution[0][0], $final_distribution),
        $distribution,
        get_instances($distribution),
        'categorial',
        null,
        $last_node->children,
        null,
        null
    );
}
public function _predict($input_data, $by_name = true, $print_path = false, $out = STDOUT, $with_confidence = false, $missing_strategy = Tree::LAST_PREDICTION, $add_confidence = false, $add_path = false, $add_distribution = false, $add_count = false, $add_median = false, $add_next = false, $add_min = false, $add_max = false, $add_unused_fields = false, $multiple = null) {
    /*
       Old method, kept for backward compatibility.

       Makes a prediction based on a number of field values.

       By default the input fields must be keyed by field name but you can
       use `by_name` to input them directly keyed by id.

       input_data: Input data to be predicted
       by_name: Boolean, true if input_data is keyed by names
       print_path: Boolean, if true the rules that lead to the prediction
                   are printed
       out: output handler; it is flushed but never closed here
       with_confidence: Boolean, if true, all the information in the node
                        (prediction, confidence, distribution, count and
                        median) is returned in a list format
       missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy for
                         missing fields
       add_confidence: Boolean, if true adds confidence to the dict output
       add_path: Boolean, if true adds path to the dict output
       add_distribution: Boolean, if true adds distribution info to the
                         dict output
       add_count: Boolean, if true adds the number of instances in the
                  node to the dict output
       add_median: Boolean, if true adds the median of the values in
                   the distribution (for regressions only)
       add_next: Boolean, if true adds the field that determines next
                 split in the tree
       add_min: Boolean, if true adds the minimum value in the prediction's
                distribution (for regressions only)
       add_max: Boolean, if true adds the maximum value in the prediction's
                distribution (for regressions only)
       add_unused_fields: Boolean, if true adds the information about the
                          fields in the input_data that are not being used
                          in the model as predictors.
       multiple: For categorical fields, it will return the categories
                 in the distribution of the predicted node as a
                 list of arrays:
                   array(array('prediction' => 'Iris-setosa',
                               'confidence' => 0.9154,
                               'probability' => 0.97,
                               'count' => 97),
                         array('prediction' => 'Iris-virginica',
                               'confidence' => 0.0103,
                               'probability' => 0.03,
                               'count' => 3))
                 The value of this argument can either be an integer
                 (maximum number of categories to be returned), or the
                 literal 'all', that will cause the entire distribution
                 in the node to be returned.

       Throws an Exception for regression trees when the PROPORTIONAL
       missing strategy is requested and $this->regression_ready is false.
    */

    # Checks if this is a regression model, using PROPORTIONAL
    # missing_strategy
    $tree = $this->tree;
    if ($tree != null && $tree->regression && $missing_strategy == Tree::PROPORTIONAL && !$this->regression_ready) {
        throw new Exception("You needed to use proportional missing strategy, \n for regressions. Please install them before, using local predictions for the model.");
    }

    # Checks and cleans input_data leaving the fields used in the model
    $new_data = $this->filter_input_data($input_data, $by_name, $add_unused_fields);
    if ($add_unused_fields) {
        // With add_unused_fields the filter result is read as a pair:
        // cleaned input first, unused fields second.
        $input_data = $new_data[0];
        $unused_fields = $new_data[1];
    } else {
        $input_data = $new_data;
    }

    # Strips affixes for numeric values and casts to the final field type
    $input_data = cast($input_data, $this->fields);

    $prediction = $tree->predict($input_data, null, $missing_strategy);

    # Prediction path
    if ($print_path == true) {
        fwrite($out, join(" AND ", $prediction->path) . ' => ' . $prediction->output . "\n");
        // The stream belongs to the caller (STDOUT by default): closing it
        // here would make every later write to it fail, so only flush.
        fflush($out);
    }

    $output = $prediction;
    if ($with_confidence == true) {
        $output = array(
            $prediction->output,
            $prediction->confidence,
            $prediction->distribution,
            $prediction->count,
            $prediction->median
        );
    }

    if ($multiple != null && !$tree->regression) {
        $output = array();
        $total_instances = floatval($prediction->count);
        foreach ($prediction->distribution as $index => $data) {
            $category = $data[0];
            $instances = $data[1];
            if ((is_string($multiple) && $multiple == 'all') || (is_int($multiple) && $index < $multiple)) {
                array_push($output, array(
                    'prediction' => $category,
                    'confidence' => ws_confidence($category, $prediction->distribution),
                    'probability' => $instances / $total_instances,
                    'count' => $instances
                ));
            }
        }
    } elseif ($add_confidence || $add_path || $add_distribution || $add_count || $add_median || $add_next || $add_min || $add_max || $add_unused_fields) {
        $output = (object) array('prediction' => $prediction->output);
        if ($add_confidence) {
            $output->confidence = $prediction->confidence;
        }
        if ($add_path) {
            $output->path = $prediction->path;
        }
        if ($add_distribution) {
            $output->distribution = $prediction->distribution;
            $output->distribution_unit = $prediction->distribution_unit;
        }
        if ($add_count) {
            $output->count = $prediction->count;
        }
        if ($tree->regression && $add_median) {
            $output->median = $prediction->median;
        }
        if ($add_next) {
            // children may be null for a leaf; count(null) is a TypeError
            // in PHP 8, so test for emptiness instead.
            $field = empty($prediction->children) ? null : $prediction->children[0]->predicate->field;
            // $this->fields is an object: array_key_exists() no longer
            // accepts objects in PHP 8.
            if ($field != null && isset($this->fields->{$field})) {
                $field = $this->fields->{$field}->name;
            }
            $output->next = $field;
        }
        if ($tree->regression && $add_min) {
            $output->min = $prediction->min;
        }
        if ($tree->regression && $add_max) {
            $output->max = $prediction->max;
        }
        if ($add_unused_fields) {
            $output->unused_fields = $unused_fields;
        }
    }

    return $output;
}