<?php

namespace headstart;

require_once "autoload.inc.php";

use headstart\persistence;
use headstart\library;

// Manual check of the SQLite persistence layer: load the preprocessing
// configuration, open the configured database and print whatever
// getLastVersion() returns for the visualization id "vis_id2".
$INI_DIR = __DIR__ . "/../../preprocessing/conf/";
$ini_array = library\Toolkit::loadIni($INI_DIR);

$connection_settings = $ini_array["connection"];
$persistence = new persistence\SQLitePersistence($connection_settings["sqlite_db"]);

// Disabled calls, kept so the other persistence operations can be tried by hand.
//$persistence->createTables();
//$persistence->createVisualization("test_id", "My Test Vis", "{[some json string;]}");

echo $persistence->getLastVersion("vis_id2");

//$persistence->writeRevision("test_id", "{[some other json string;]}");
//echo $persistence->getLastVersion("test_id");
/**
 * Assigns a name and a URI to every cluster in the clustering output, writes
 * the annotated rows to the naming output file and stores them as a new
 * visualization.
 *
 * Steps:
 *  1. Append the first column of the configured stop-word file to
 *     $this->stop_words (the property grows on every call).
 *  2. Read the clustering CSV; the first row is treated as the header, every
 *     other row contributes its title and abstract to the text of its cluster.
 *  3. Request concepts for all cluster texts via executeCurl() and pick a name
 *     per cluster by matching n-grams against those concepts, falling back to
 *     the first returned concept and finally to "Miscellaneous".
 *  4. Append "area_uri" and "area" columns to every row, write the CSV and
 *     persist the rows as JSON through SQLitePersistence::createVisualization().
 *
 * @param string $working_dir Path prefix that is concatenated directly with the
 *                            configured file names (presumably ends with a
 *                            directory separator - confirm with callers).
 *
 * @throws \Exception If the clustering output contains no rows, if a data row
 *                    has no value in the cluster id column, or if the result
 *                    cannot be encoded as JSON.
 */
public function performNaming($working_dir)
{
    $ini = $this->ini_array["naming"];
    $ini_general = $this->ini_array["general"];
    $ini_output = $this->ini_array["output"];
    $ini_connection = $this->ini_array["connection"];

    //Output of scaling and clustering script
    $clusters_path = $working_dir . $ini_output["output_scaling_clustering"];

    //Output file
    $output_path = $working_dir . $ini_output["output_naming"];

    //Output directories for the full API responses
    $full_zemanta_dir = $working_dir . "full_responses/zemanta/";
    $full_calais_dir = $working_dir . "full_responses/calais/";

    //Column positions are configuration constants, so resolve them once
    //instead of once per row.
    $line_cluster_id = intval($ini["line_cluster_id"]);
    $line_title = intval($ini["line_title"]);
    $line_abstract = intval($ini["line_abstract"]);

    $cluster_text_file = library\Toolkit::openFileForReading($clusters_path);
    $stop_words_file = library\Toolkit::openFileForReading($ini_general["preprocessing_dir"] . $ini["stop_words"]);

    while (($line = fgetcsv($stop_words_file, null, "\t")) !== false) {
        $this->stop_words[] = $line[0];
    }
    fclose($stop_words_file);

    $output = array();          //all CSV rows, header first
    $cluster = array();         //cluster id => titles and abstracts, one document per line
    $cluster_titles = array();  //cluster id => titles only, one per line

    while (($line = fgetcsv($cluster_text_file, null)) !== false) {
        $output[] = $line;

        //The first row is the header: keep it for the output, but do not
        //treat it as a document.
        if (count($output) === 1) {
            continue;
        }

        if (!isset($line[$line_cluster_id])) {
            fclose($cluster_text_file);
            throw new \Exception("Error in line: " . $line[0]);
        }

        $cluster_key = $line[$line_cluster_id];
        $title = $line[$line_title];
        $abstract = $line[$line_abstract];

        if (!isset($cluster[$cluster_key])) {
            $cluster[$cluster_key] = $title . ". " . $abstract;
            $cluster_titles[$cluster_key] = $title . ".";
        } else {
            $cluster[$cluster_key] .= "\n" . $title . ". " . $abstract;
            $cluster_titles[$cluster_key] .= "\n" . $title . ".";
        }
    }
    fclose($cluster_text_file);

    //Without a header row there is nothing to annotate; fail with a clear
    //message instead of a type error further down.
    if (count($output) === 0) {
        throw new \Exception("Clustering output is empty: " . $clusters_path);
    }

    $topics = $this->executeCurl($cluster);

    $cluster_names = array();
    $used_uris = array();   //set of URIs already handed out (uri => true)

    foreach ($cluster as $id => $text) {
        library\Toolkit::info($text);

        $categories = array("topics" => array(), "topics_title" => array());
        //"topics_title" is the key that is filled and read below, so it must
        //exist even when no 1-gram passes the threshold.
        $categories_one = array("topics" => array(), "topics_title" => array());

        //get 1-grams
        $response_object_one = $this->getNgrams($text, 1);
        $this->processNgrams($response_object_one, "topics_title", $categories_one, $ini["threshold_single_words"]);

        //get 2-, 3-, and 4-grams
        for ($n = 4; $n >= 2; $n--) {
            $response_object = $this->getNgrams($text, $n);
            $response_object_title = $this->getNgrams($cluster_titles[$id], $n);

            arsort($response_object);

            $this->processNgrams($response_object_title, "topics_title", $categories, $ini["threshold_title_ngrams"]);
            $this->processNgrams($response_object, "topics", $categories, $ini["threshold_title_abstract_ngrams"]);
        }

        library\Toolkit::info($id . ": " . print_r($categories, true));

        //A service may have returned nothing for this cluster; treat that as
        //"no concepts" rather than reading an undefined index.
        $cluster_names_calais = isset($topics["calais"][$id]) ? $topics["calais"][$id] : null;
        $cluster_names_zemanta = isset($topics["zemanta"][$id]) ? $topics["zemanta"][$id] : null;

        //Search for 4-, 3-, and 2-title-grams in Calais concepts
        $cluster_name = $this->compareConcepts($cluster_names_calais, $categories, "topics_title");

        //If that fails, search for 4-, 3-, and 2-title-grams in Zemanta concepts
        if ($cluster_name == "") {
            $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics_title");
        }

        //If that fails, search for 4-, 3-, and 2-grams in Zemanta concepts
        if ($cluster_name == "") {
            $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics");
        }

        //If that fails, search for 1-grams in Zemanta concepts
        if ($cluster_name == "") {
            $forbidden_names = $ini["forbidden_names"];
            $filtered_array = array_filter(
                $categories_one["topics_title"],
                function ($item) use ($forbidden_names) {
                    return !in_array($item, $forbidden_names);
                }
            );

            library\Toolkit::info("Filtered Array: " . print_r($filtered_array, true));

            if (isset($cluster_names_zemanta["topics_format"])) {
                //"topics_format" and "topics" are read in parallel: the n-th
                //formatted concept selects the n-th entry of "topics".
                $position = 0;
                foreach ($cluster_names_zemanta["topics_format"] as $name) {
                    if (array_search($name, $filtered_array) !== false) {
                        $cluster_name = $cluster_names_zemanta["topics"][$position];
                        break;
                    }
                    $position++;
                }
            }
        }

        //If everything above fails, name the cluster after the most important concept
        //returned by (1) Zemanta or (2) Calais. Finally, name the cluster
        //"Miscellaneous"
        if ($cluster_name == "") {
            if (isset($cluster_names_zemanta["topics"][0])) {
                $cluster_name = $cluster_names_zemanta["topics"][0];
            } elseif (isset($cluster_names_calais["topics"][0])) {
                $cluster_name = $cluster_names_calais["topics"][0];
            } else {
                $cluster_name = "Miscellaneous";
            }
        }

        //Derive a URI and append "-1", "-2", ... until it is not in use yet.
        //Checking against the full set of used URIs guarantees uniqueness
        //regardless of the order in which earlier URIs were created.
        $base_uri = library\Toolkit::generateUriFromString($cluster_name);
        $cluster_uri = $base_uri;
        $suffix = 1;
        while (isset($used_uris[$cluster_uri])) {
            $cluster_uri = $base_uri . "-" . $suffix;
            $suffix++;
        }
        $used_uris[$cluster_uri] = true;

        $cluster_names[$id] = array("name" => $cluster_name, "uri" => $cluster_uri);

        library\Toolkit::info("*** CLUSTER NAME: " . $cluster_name . "\n");

        //Write full response for later consultation
        $this->getFullResponseZemanta($text, $cluster_uri, $full_zemanta_dir);
        $this->getFullResponseCalais($text, $cluster_uri, $full_calais_dir);
    }

    //add areas to output array
    array_push($output[0], "area_uri", "area");

    $size = sizeof($output);
    library\Toolkit::info($size . "\n");

    for ($counter = 1; $counter < $size; $counter++) {
        $row_cluster_id = $output[$counter][$line_cluster_id];
        array_push(
            $output[$counter],
            $cluster_names[$row_cluster_id]["uri"],
            $cluster_names[$row_cluster_id]["name"]
        );
        library\Toolkit::info("{$counter}\n");
    }

    $output_handle = library\Toolkit::openOrCreateFile($output_path);
    foreach ($output as $line) {
        fputcsv($output_handle, $line);
    }
    fclose($output_handle);

    //Persist the same rows as a list of header => value maps.
    $header = array_shift($output);
    $json_array = array();
    foreach ($output as $data_row) {
        $json_array[] = array_combine($header, $data_row);
    }

    $json = json_encode($json_array);
    if ($json === false) {
        //json_encode() returns false on failure (e.g. malformed UTF-8);
        //storing that value would silently corrupt the visualization.
        throw new \Exception("Could not encode naming output as JSON (json_last_error code " . json_last_error() . ")");
    }

    $persistence = new persistence\SQLitePersistence($ini_connection["sqlite_db"]);
    $persistence->createVisualization($ini_output["unique_id"], $ini_output["title"], $json);
}