Exemplo n.º 1
0
<?php

namespace headstart;

require_once "autoload.inc.php";
use headstart\persistence;
use headstart\library;
// Locate the preprocessing configuration directory relative to this script
// and load the INI settings from it.
$INI_DIR = __DIR__ . "/../../preprocessing/conf/";
$ini_array = library\Toolkit::loadIni($INI_DIR);

// Open the SQLite-backed persistence layer using the database configured
// under "connection" -> "sqlite_db".
$sqlite_db = $ini_array["connection"]["sqlite_db"];
$persistence = new persistence\SQLitePersistence($sqlite_db);

// Manual set-up calls, kept disabled:
//$persistence->createTables();
//$persistence->createVisualization("test_id", "My Test Vis", "{[some json string;]}");

// Print whatever getLastVersion() returns for the visualization "vis_id2".
echo $persistence->getLastVersion("vis_id2");

// Manual revision round-trip, kept disabled:
//$persistence->writeRevision("test_id", "{[some other json string;]}");
//echo $persistence->getLastVersion("test_id");
Exemplo n.º 2
0
 /**
  * Assigns a name and a unique URI to every cluster and persists the result.
  *
  * Steps, in order:
  *  1. Loads the stop word list into $this->stop_words.
  *  2. Reads the clustering CSV (first row is treated as the header) and
  *     concatenates titles and abstracts per cluster id.
  *  3. Requests topics for all clusters through executeCurl().
  *  4. For each cluster, tries to match n-grams against the returned concepts
  *     and falls back to the first returned concept or "Miscellaneous".
  *  5. Appends the "area_uri" and "area" columns to every row, writes the
  *     output CSV and stores the rows as JSON through SQLitePersistence.
  *
  * @param string $working_dir Path prefix that is concatenated as-is with the
  *                            configured file names and with the
  *                            "full_responses/..." sub directories.
  *
  * @throws \Exception If the input file has no rows, if a data row has no
  *                    cluster id column, or if the rows cannot be JSON encoded.
  */
 public function performNaming($working_dir)
 {
     $ini = $this->ini_array["naming"];
     $ini_general = $this->ini_array["general"];
     $ini_output = $this->ini_array["output"];
     $ini_connection = $this->ini_array["connection"];

     //Output of scaling and clustering script
     $CLUSTERS = $working_dir . $ini_output["output_scaling_clustering"];
     //Output file
     $OUTPUT_FILE = $working_dir . $ini_output["output_naming"];
     //Output directories for the full API responses
     $FULL_ZEMANTA = $working_dir . "full_responses/zemanta/";
     $FULL_CALAIS = $working_dir . "full_responses/calais/";

     // Full text (titles + abstracts) per cluster id.
     $cluster = array();
     // Titles only, per cluster id.
     $cluster_titles = array();
     // All CSV rows including the header row at index 0.
     $output = array();

     $cluster_text_file = library\Toolkit::openFileForReading($CLUSTERS);
     $stop_words_file = library\Toolkit::openFileForReading($ini_general["preprocessing_dir"] . $ini["stop_words"]);

     // NOTE(review): entries are appended to the instance property, so calling
     // this method twice on the same object presumably duplicates them - confirm.
     while (($line = fgetcsv($stop_words_file, null, "\t")) !== false) {
         $this->stop_words[] = $line[0];
     }
     fclose($stop_words_file);

     // The column positions are configuration values; resolve them once. They
     // are needed again after the read loop when the area columns are appended.
     $line_cluster_id = intval($ini["line_cluster_id"]);
     $line_title = intval($ini["line_title"]);
     $line_abstract = intval($ini["line_abstract"]);

     $is_header = true;
     while (($line = fgetcsv($cluster_text_file, null)) !== false) {
         $output[] = $line;
         if ($is_header) {
             $is_header = false;
             continue;
         }
         if (!isset($line[$line_cluster_id])) {
             // Release the stream before leaving the method.
             fclose($cluster_text_file);
             throw new \Exception("Error in line: " . $line[0]);
         }
         $current_id = $line[$line_cluster_id];
         $title = $line[$line_title];
         $abstract = $line[$line_abstract];
         if (!isset($cluster[$current_id])) {
             $cluster[$current_id] = $title . ". " . $abstract;
             $cluster_titles[$current_id] = $title . ".";
         } else {
             $cluster[$current_id] .= "\n" . $title . ". " . $abstract;
             $cluster_titles[$current_id] .= "\n" . $title . ".";
         }
     }
     fclose($cluster_text_file);

     if (empty($output)) {
         // Without a header row there is nothing to extend or to persist.
         throw new \Exception("No rows found in: " . $CLUSTERS);
     }

     $topics = $this->executeCurl($cluster);
     $cluster_names = array();
     // URIs that are already assigned, used as a set (uri => true).
     $used_uris = array();

     foreach ($cluster as $id => $text) {
         library\Toolkit::info($text);
         $categories = array("topics" => array(), "topics_title" => array());
         $categories_one = array("topics" => array());

         //get 1-grams
         $response_object_one = $this->getNgrams($text, 1);
         $this->processNgrams($response_object_one, "topics_title", $categories_one, $ini["threshold_single_words"]);

         //get 2-, 3-, and 4-grams
         for ($n = 4; $n >= 2; $n--) {
             $response_object = $this->getNgrams($text, $n);
             $response_object_title = $this->getNgrams($cluster_titles[$id], $n);
             arsort($response_object);
             $this->processNgrams($response_object_title, "topics_title", $categories, $ini["threshold_title_ngrams"]);
             $this->processNgrams($response_object, "topics", $categories, $ini["threshold_title_abstract_ngrams"]);
         }
         library\Toolkit::info($id . ": " . print_r($categories, true));

         $cluster_names_calais = $topics["calais"][$id];
         $cluster_names_zemanta = $topics["zemanta"][$id];

         // The loose comparison with "" is kept on purpose: the return type of
         // compareConcepts() is not visible here and may be null/false on a miss.

         //Search for 4-, 3-, and 2-title-grams in Calais concepts
         $cluster_name = $this->compareConcepts($cluster_names_calais, $categories, "topics_title");

         //If that fails, search for 4-, 3-, and 2-title-grams in Zemanta concepts
         if ($cluster_name == "") {
             $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics_title");
         }

         //If that fails, search for 4-, 3-, and 2-grams in Zemanta concepts
         if ($cluster_name == "") {
             $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics");
         }

         //If that fails, search for 1-grams in Zemanta concepts
         if ($cluster_name == "") {
             // "topics_title" is presumably filled in by processNgrams(); it is
             // not part of the initial array, so fall back to an empty list.
             $single_words = isset($categories_one["topics_title"])
                 ? $categories_one["topics_title"]
                 : array();
             $filtered_array = array_filter($single_words, function ($item) {
                 return !in_array($item, $this->ini_array["naming"]["forbidden_names"]);
             });
             library\Toolkit::info("Filtered Array: " . print_r($filtered_array, true));

             $formatted_topics = isset($cluster_names_zemanta["topics_format"])
                 ? $cluster_names_zemanta["topics_format"]
                 : array();
             // "topics_format" and "topics" are read as parallel lists: the
             // position of the matching formatted name selects the topic.
             $count_new = 0;
             foreach ($formatted_topics as $name) {
                 if (array_search($name, $filtered_array) !== false) {
                     $cluster_name = $cluster_names_zemanta["topics"][$count_new];
                     break;
                 }
                 $count_new++;
             }
         }

         //If everything above fails, name the cluster after the most important concept
         //returned by (1) Zemanta or (2) Calais. Finally, name the cluster
         //"Miscellaneous"
         if ($cluster_name == "") {
             if (isset($cluster_names_zemanta["topics"][0])) {
                 $cluster_name = $cluster_names_zemanta["topics"][0];
             } elseif (isset($cluster_names_calais["topics"][0])) {
                 $cluster_name = $cluster_names_calais["topics"][0];
             } else {
                 $cluster_name = "Miscellaneous";
             }
         }

         // Derive a URI and append "-1", "-2", ... until it is unused. Checking
         // every candidate against the full set guarantees uniqueness even when
         // a suffixed URI was already produced by a differently named cluster.
         $base_uri = library\Toolkit::generateUriFromString($cluster_name);
         $cluster_id = $base_uri;
         $suffix = 1;
         while (isset($used_uris[$cluster_id])) {
             $cluster_id = $base_uri . "-" . $suffix;
             $suffix++;
         }
         $used_uris[$cluster_id] = true;

         $cluster_names[$id] = array("name" => $cluster_name, "uri" => $cluster_id);
         library\Toolkit::info("*** CLUSTER NAME: " . $cluster_name . "\n");

         //Write full response for later consultation
         $this->getFullResponseZemanta($text, $cluster_id, $FULL_ZEMANTA);
         $this->getFullResponseCalais($text, $cluster_id, $FULL_CALAIS);
     }

     //add areas to output array
     array_push($output[0], "area_uri", "area");
     $size = count($output);
     library\Toolkit::info($size . "\n");
     for ($counter = 1; $counter < $size; $counter++) {
         $area = $cluster_names[$output[$counter][$line_cluster_id]];
         array_push($output[$counter], $area["uri"], $area["name"]);
         library\Toolkit::info("{$counter}\n");
     }

     $output_handle = library\Toolkit::openOrCreateFile($OUTPUT_FILE);
     foreach ($output as $line) {
         fputcsv($output_handle, $line);
     }
     fclose($output_handle);

     // Convert the rows into a list of header => value maps for persistence.
     $header = array_shift($output);
     $json_array = array();
     foreach ($output as $row) {
         $json_array[] = array_combine($header, $row);
     }
     $json = json_encode($json_array);
     if ($json === false) {
         // Do not store an empty visualization when encoding fails.
         throw new \Exception("Could not encode output as JSON, error code: " . json_last_error());
     }

     $UNIQUE_ID = $ini_output["unique_id"];
     $TITLE = $ini_output["title"];
     $persistence = new persistence\SQLitePersistence($ini_connection["sqlite_db"]);
     $persistence->createVisualization($UNIQUE_ID, $TITLE, $json);
 }