Пример #1
0
/**
 * Tags each space-separated word of a text with the Stanford POS tagger.
 *
 * The tagger library, model and jar are all resolved relative to this file's
 * directory. The caller supplies the text to tag (for example a post's
 * content).
 *
 * @param string $the_text Text to tag; it is split on single spaces.
 * @return mixed Whatever \StanfordNLP\POSTagger::tag() returns for the words.
 */
function tag_the_content($the_text)
{
    // Directory of this file; every tagger resource lives beneath it.
    $dir = __DIR__;
    // require_once instead of include: the autoload script must run only once
    // even when this function is called repeatedly, and a missing library
    // should fail here rather than later as an unknown-class error.
    require_once $dir . '/PHP-Stanford-NLP/autoload.php';
    $taggerRoot = $dir . '/PHP-Stanford-NLP/stanford-postagger-2015-04-20';
    $pos = new \StanfordNLP\POSTagger(
        $taggerRoot . '/models/english-left3words-distsim.tagger',
        $taggerRoot . '/stanford-postagger.jar'
    );
    return $pos->tag(explode(' ', $the_text));
}
Пример #2
0
<input type="submit" onclick="setValue();">
</form>

<script>
/**
 * Copies the global `final_transcript` into the `my_text` field of the form
 * named "form", then submits that form.
 */
function setValue() {
	var transcriptField = document.form.my_text;
	transcriptField.value = final_transcript;
	document.forms["form"].submit();
}
</script>

<?php 
// The sentence the user stated, as posted in the form's my_text field.
$mySentence = $_POST['my_text'];
// Topic of the query; starts out empty.
$query = "";
// Part-of-speech tagger; the model and jar paths are relative to the working directory.
$pos = new \StanfordNLP\POSTagger('./models/english-left3words-distsim.tagger', './stanford-postagger.jar');
// Used for tagging the topic as either a person, organization or location
$ner = new \StanfordNLP\NERTagger('./classifiers/english.all.3class.distsim.crf.ser.gz', './stanford-ner.jar');
// cURL handle; what it is used for is not visible in this fragment.
$curl = curl_init();
// Used to navigate through loops
$i = 0;
// Used to check if a stated sentence is valid, defaults as invalid
$valid = false;
//EXTRACT SPOKEN WORDS
if (substr($mySentence, 0, 8) === "What is ") {
    $valid = true;
    $position = strpos($mySentence, "is ");
    $query = substr($mySentence, $position + strlen("is "));
    $query = ucwords($query);
    //echo $query;
    $_SESSION["query"] = $query;
Пример #3
0
 /**
  * Extracts candidate location terms and postal codes from a text.
  *
  * The text is stripped of a few markup fragments, split into sentences on '.'
  * and into words on ' ', and sent to the Stanford POS and NER taggers. Streaks
  * of consecutive nouns, adjectives/numbers and LOCATION-tagged words are
  * registered as terms. The terms are then narrowed (locations if any were
  * found, otherwise nouns after prepositions if any, otherwise all nouns), and
  * Canadian, US and Dutch postal codes found by regex are added or flagged.
  *
  * Side effects: records timings in $this->times, sets $this->termTypeUsed,
  * echoes diagnostics, and declares the global function splitIntoWords() the
  * first time it runs.
  *
  * @param string $text Raw text to analyse.
  * @return TermArray|false The collected terms, or false when a tagger returns
  *                         nothing or the two taggers' tokens do not line up.
  */
 function run($text)
 {
     // The following appear in some Wikipedia texts and mess up the parsing:
     $textsToRemove = ["<br>", "</table>", "</dl>", "</ref>", "<ns>", "</ns>", "<id>", "</id>", "<small>", "<revision>", "<comment>", "</comment>", "<model>", "</model>", "<parentid>", "</parentid>"];
     $text = str_replace($textsToRemove, "", $text);
     // needed for absolute paths to Stanford models
     $currentDir = getcwd();
     $termArray = new TermArray();
     if (!function_exists("splitIntoWords")) {
         // The POS and NER taggers need an array of arrays, where each sentence is
         // its own array.
         function splitIntoWords($sentence)
         {
             return explode(' ', $sentence);
         }
     }
     $text_arrays = array_map("splitIntoWords", explode('.', $text));

     // Send the text to the POS tagger:
     $pos = new \StanfordNLP\POSTagger($currentDir . '/stanford-postagger-2015-04-20/models/english-left3words-distsim.tagger', $currentDir . '/stanford-postagger-2015-04-20/stanford-postagger.jar');
     $startTime = microtime(true);
     $resultPOS = $pos->batchTag($text_arrays)[0];
     $this->times["Run the POS tagger"] = microtime(true) - $startTime;
     if (printOutput()) {
         echo "<br>POS results:<br>";
         var_dump($resultPOS);
         echo "<br>";
     }
     if (!$resultPOS) {
         echo "<br>ERROR: POS tagging failed<br>";
         return false;
     }

     // Send the text to the NER:
     $ner = new \StanfordNLP\NERTagger($currentDir . '/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz', $currentDir . '/stanford-ner-2015-04-20/stanford-ner.jar');
     $startTime = microtime(true);
     $resultNER = $ner->batchTag($text_arrays)[0];
     $this->times["Run the NER tagger"] = microtime(true) - $startTime;
     if (printOutput()) {
         echo "<br>NER results:<br>";
         var_dump($resultNER);
         echo "<br><br>";
     }
     if (!$resultNER) {
         echo "<br>ERROR: NER tagging failed<br>";
         return false;
     }

     // Later code assumes that $resultPOS and $resultNER are indexed identically.
     // I have only seen these errors returned when the text contains something like
     // "<br>" which is handled differently by each tagger
     $tokenCount = count($resultNER);
     if (count($resultPOS) != $tokenCount) {
         echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
         return false;
     }
     // $words[$i] is the token at position $i, as agreed on by both taggers.
     $words = [];
     for ($i = 0; $i < $tokenCount; $i++) {
         if ($resultPOS[$i][0] != $resultNER[$i][0]) {
             echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
             return false;
         }
         $words[$i] = $resultNER[$i][0];
     }

     // The next bunch of code loops through the text to find all terms
     $startTime = microtime(true);
     $currentStreak = 0;              // length of the current run of term-like words
     $streakContainsLocation = false;
     $streakContainsNoun = false;
     $isAfterPreposition = false;
     for ($i = 0; $i < $tokenCount; $i++) {
         $token = $resultPOS[$i][0];
         $posTag = $resultPOS[$i][1];
         $isNoun = strncmp($posTag, "NN", 2) == 0;
         // adjective (so "first avenue" would catch the first)
         $isAdjectiveOrNumber = (strcmp($posTag, "CD") == 0 || strcmp($posTag, "JJ") == 0);
         $isLocation = strcmp($resultNER[$i][1], "LOCATION") == 0;

         if ($isNoun || $isAdjectiveOrNumber || $isLocation) {
             $currentStreak++;
             if ($isLocation) {
                 $streakContainsLocation = true;
             }
             if ($isNoun) {
                 $streakContainsNoun = true;
             }
         } else {
             $streakContainsLocation = false;
             $streakContainsNoun = false;
             $currentStreak = 0;
             $isConjunction = strcmp($posTag, "CC") == 0;
             if (!$isConjunction) {
                 // reset $isAfterPreposition only if this is not after a conjunction
                 // that way a text like "near Waterloo and Guelph" will tag both Waterloo
                 // and Guelph as after a preposition
                 $isAfterPreposition = (strcmp($posTag, "IN") == 0 || strcmp($posTag, "TO") == 0);
                 if (strcmp($token, "for") == 0) {
                     // TODO: make this case insensitive?
                     $isAfterPreposition = false;
                 }
             }
         }

         if ($streakContainsLocation || $streakContainsNoun) {
             // Register the current word, then every longer phrase that ends at
             // this word and starts earlier in the streak.
             $phrase = $token;
             $subStreakContainsNoun = $isNoun;
             $subStreakContainsLocation = $isLocation;
             if ($isNoun || $isLocation) {
                 $newTerm = $termArray->addTerm($phrase, $i);
                 $newTerm->isNoun = $isNoun;
                 $newTerm->isLocation = $isLocation;
                 $newTerm->isAfterPreposition = $isAfterPreposition;
             }
             for ($j = 1; $j < $currentStreak; $j++) {
                 $phrase = $resultPOS[$i - $j][0] . ' ' . $phrase;
                 $subStreakContainsNoun = ($subStreakContainsNoun || strncmp($resultPOS[$i - $j][1], "NN", 2) == 0);
                 $subStreakContainsLocation = ($subStreakContainsLocation || strcmp($resultNER[$i - $j][1], "LOCATION") == 0);
                 if ($subStreakContainsLocation || $subStreakContainsNoun) {
                     $newTerm = $termArray->addTerm($phrase, $i - $j);
                     $newTerm->isNoun = $subStreakContainsNoun;
                     $newTerm->isLocation = $subStreakContainsLocation;
                     $newTerm->isAfterPreposition = $isAfterPreposition;
                 }
             }
         }
     }
     $this->times["Loop through text to find locations"] = microtime(true) - $startTime;

     // Now we remove some terms from the array:
     $startTime = microtime(true);
     if ($termArray->terms) {
         if ($termArray->countLocations() > 0) {
             if (printOutput()) {
                 echo "<br>This text contains words tagged as locations, so we will only consider those words.<br>";
             }
             $termArray->removeNouns(false);
             $this->termTypeUsed = "Locations";
         } else {
             if (printOutput()) {
                 echo "<br>This text does not contain words tagged as locations, so we must only use nouns.<br>";
             }
             if ($termArray->countNounsAfterPrepositions() > 0) {
                 if (printOutput()) {
                     echo "Some nouns occured after prepositions, so we will only use those.<br>";
                 }
                 $termArray->removeNouns(true);
                 $this->termTypeUsed = "NounsAfterPrep";
             } else {
                 $this->termTypeUsed = "Nouns";
             }
         }
     } else {
         echo "Warning: no nouns or locations found in text.";
         $this->termTypeUsed = "None";
     }
     $this->times["Filter terms that are found"] = microtime(true) - $startTime;

     // the rest of this code deals with postal codes
     $startTime = microtime(true);
     $CanadaPostCodes = [];
     $USZipCodes = [];
     $DutchPostCodes = [];
     preg_match_all('/\\b[a-zA-Z][0-9][a-zA-Z][\\s]?[0-9][a-zA-Z][0-9]\\b/', $text, $CanadaPostCodes);
     preg_match_all('/\\b[0-9]{5}([\\s\\-][0-9]{4})?\\b/', $text, $USZipCodes);
     preg_match_all('/\\b[0-9]{4}[\\s]?[a-zA-Z]{2}\\b/', $text, $DutchPostCodes);
     $this->times["Find postal codes in the text"] = microtime(true) - $startTime;

     // Flags or adds a term for every matched postcode of one country. One
     // closure replaces three copies of the same loop that differed only in
     // the country code.
     $tagPostcodes = function (array $postcodes, $country) use ($termArray, $words) {
         foreach ($postcodes as $postcode) {
             if (isset($termArray->terms[$postcode])) {
                 // if the postcode already got in another way, we don't add it again
                 foreach ($termArray->terms[$postcode] as $term) {
                     $term->isPostcode = true;
                     $term->postcodeCountry = $country;
                 }
                 continue;
             }
             // all post codes have 1 or 2 words
             $postcodeWords = explode(" ", $postcode);
             // Strict matching: with loose comparison the numeric strings
             // "02134" and "2134" would be treated as the same word.
             $firstWordPositions = array_keys($words, $postcodeWords[0], true);
             if (count($postcodeWords) == 1) {
                 $positions = $firstWordPositions;
             } else {
                 $positions = [];
                 foreach ($firstWordPositions as $firstWordPosition) {
                     // The first word may be the very last token, so check that
                     // a following token exists before comparing it.
                     if (isset($words[$firstWordPosition + 1])
                         && $words[$firstWordPosition + 1] === $postcodeWords[1]) {
                         $positions[] = $firstWordPosition;
                     }
                 }
                 if (!$positions) {
                     // Keep the original placeholder position when the two-word
                     // postcode cannot be located in the token list.
                     $positions = [-10];
                 }
             }
             foreach ($positions as $postcodePosition) {
                 $newTerm = $termArray->addTerm($postcode, $postcodePosition);
                 $newTerm->isPostcode = true;
                 $newTerm->postcodeCountry = $country;
             }
         }
     };

     $startTime = microtime(true);
     $matchesByCountry = [
         "ca" => $CanadaPostCodes,
         "us" => $USZipCodes,
         "nl" => $DutchPostCodes,
     ];
     foreach ($matchesByCountry as $country => $matches) {
         // Index 0 holds the full-pattern matches; it can be missing if the
         // regex engine reported an error.
         $tagPostcodes(isset($matches[0]) ? $matches[0] : [], $country);
     }
     $this->times["Update metadata for post codes"] = microtime(true) - $startTime;
     return $termArray;
 }