コード例 #1
0
ファイル: TestTerms.php プロジェクト: spotzi/Geotagger
// Scenario: the single words and the overlapping two-word phrases of
// "New York City". Each entry is [phrase, position passed to addTerm()].
$termArray = new TermArray();
$phrasesWithPositions = [
    ["New", 0],
    ["York", 1],
    ["City", 2],
    ["New York", 0],
    ["York City", 1],
];
foreach ($phrasesWithPositions as $phraseAndPosition) {
    $termArray->addTerm($phraseAndPosition[0], $phraseAndPosition[1]);
}
$weightHelperResult = $termArray->weightHelper($termArray->getAllTerms());
echo arrayToString($weightHelperResult);
// Disabled debug output, kept for reference:
//var_dump($termArray->findGroups());
echo "<br>Weights:<br>";
var_dump($termArray->getWeights());
echo "<br>Alternate weights:<br>";
var_dump($termArray->getAlternateWeights());
echo "<br><br>", "Test 10: The, New, York, City, New York, York City, New York City, The New, The New York, The New York City<br>Expected: <br>Actual: ";

// Test 10: every contiguous phrase of "The New York City", registered on a
// fresh TermArray. The position argument is the index of the phrase's first word.
$termArray = new TermArray();
$phrasesWithPositions = [
    ["The", 0],
    ["New", 1],
    ["York", 2],
    ["City", 3],
    ["The New", 0],
    ["New York", 1],
    ["York City", 2],
    ["New York City", 1],
    ["The New York", 0],
    ["The New York City", 0],
];
foreach ($phrasesWithPositions as $phraseAndPosition) {
    $termArray->addTerm($phraseAndPosition[0], $phraseAndPosition[1]);
}
$weightHelperResult = $termArray->weightHelper($termArray->getAllTerms());
echo arrayToString($weightHelperResult);
echo "<br>Weights:<br>";
var_dump($termArray->getWeights());
echo "<br><br>";
コード例 #2
0
ファイル: TextParser.php プロジェクト: spotzi/Geotagger
 /**
  * Tags a text with the Stanford POS and NER taggers and collects candidate terms.
  *
  * Steps, in order:
  *  1. Remove a fixed list of markup fragments from the text.
  *  2. Split the text on '.' and ' ' and run the POS and NER taggers on it.
  *  3. Walk the tagged words; for every streak of nouns / adjectives / numbers /
  *     locations, add each phrase that ends at the current word and contains a
  *     noun or a location to a TermArray (term position = index of the phrase's
  *     first word).
  *  4. Narrow the terms down via TermArray::removeNouns() depending on whether
  *     locations or nouns-after-prepositions were found.
  *  5. Find Canadian, US and Dutch postal codes with regular expressions and
  *     flag existing terms / add new terms for them.
  *
  * Side effects: the duration of each step is stored in $this->times, the kind
  * of terms that was kept is stored in $this->termTypeUsed, and diagnostics are
  * echoed as HTML.
  *
  * @param string $text Text to analyse.
  * @return TermArray|false The collected terms, or false when a tagger returns
  *                         nothing or the two taggers tokenize the text differently.
  */
 function run($text)
 {
     // The following appear in some Wikipedia texts and mess up the parsing:
     $textsToRemove = ["<br>", "</table>", "</dl>", "</ref>", "<ns>", "</ns>", "<id>", "</id>", "<small>", "<revision>", "<comment>", "</comment>", "<model>", "</model>", "<parentid>", "</parentid>"];
     $text = str_replace($textsToRemove, "", $text);
     // needed for absolute paths to Stanford models
     $currentDir = getcwd();
     $termArray = new TermArray();
     if (!function_exists("splitIntoWords")) {
         // The POS and NER taggers need an array of arrays, where each sentence is
         // its own array.
         function splitIntoWords($sentence)
         {
             return explode(' ', $sentence);
         }
     }
     $text_arrays = array_map("splitIntoWords", explode('.', $text));

     // Send the text to the POS tagger:
     $pos = new \StanfordNLP\POSTagger($currentDir . '/stanford-postagger-2015-04-20/models/english-left3words-distsim.tagger', $currentDir . '/stanford-postagger-2015-04-20/stanford-postagger.jar');
     $startTime = microtime(true);
     $resultPOS = $pos->batchTag($text_arrays)[0];
     $this->times["Run the POS tagger"] = microtime(true) - $startTime;
     if (printOutput()) {
         echo "<br>POS results:<br>";
         var_dump($resultPOS);
         echo "<br>";
     }
     if (!$resultPOS) {
         echo "<br>ERROR: POS tagging failed<br>";
         return false;
     }

     // Send the text to the NER:
     $ner = new \StanfordNLP\NERTagger($currentDir . '/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz', $currentDir . '/stanford-ner-2015-04-20/stanford-ner.jar');
     $startTime = microtime(true);
     $resultNER = $ner->batchTag($text_arrays)[0];
     $this->times["Run the NER tagger"] = microtime(true) - $startTime;
     if (printOutput()) {
         echo "<br>NER results:<br>";
         var_dump($resultNER);
         echo "<br><br>";
     }
     if (!$resultNER) {
         echo "<br>ERROR: NER tagging failed<br>";
         return false;
     }

     // Later code assumes that $resultPOS and $resultNER are indexed identically.
     // I have only seen these errors returned when the text contains something like
     // "<br>" which is handled differently by each tagger
     if (count($resultPOS) != count($resultNER)) {
         echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
         return false;
     }
     $words = [];
     for ($i = 0, $size = count($resultNER); $i < $size; $i++) {
         // Strict comparison: with "!=" two different numeric-looking tokens
         // (e.g. "1e3" and "1000") would be treated as the same word.
         if ($resultPOS[$i][0] !== $resultNER[$i][0]) {
             echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
             return false;
         }
         $words[$i] = $resultNER[$i][0];
     }

     // The next bunch of code loops through the text to find all terms
     $startTime = microtime(true);
     $currentStreak = 0;              // number of consecutive term-worthy words ending at $i
     $streakContainsLocation = false;
     $streakContainsNoun = false;
     $isAfterPreposition = false;
     $isAfterConjunction = false;
     for ($i = 0, $size = count($resultPOS); $i < $size; $i++) {
         $word = $resultPOS[$i][0];
         $posTag = $resultPOS[$i][1];
         $isNoun = strncmp($posTag, "NN", 2) == 0;
         // adjective (so "first avenue" would catch the first)
         $isAdjectiveOrNumber = (strcmp($posTag, "CD") == 0 || strcmp($posTag, "JJ") == 0);
         $isLocation = strcmp($resultNER[$i][1], "LOCATION") == 0;
         if ($isNoun || $isAdjectiveOrNumber || $isLocation) {
             $currentStreak++;
             if ($isLocation) {
                 $streakContainsLocation = true;
             }
             if ($isNoun) {
                 $streakContainsNoun = true;
             }
         } else {
             $streakContainsLocation = false;
             $streakContainsNoun = false;
             $currentStreak = 0;
             $isAfterConjunction = strcmp($posTag, "CC") == 0;
             if (!$isAfterConjunction) {
                 // reset $isAfterPreposition only if this is not after a conjunction
                 // that way a text like "near Waterloo and Guelph" will tag both Waterloo
                 // and Guelph as after a preposition
                 $isAfterPreposition = (strcmp($posTag, "IN") == 0 || strcmp($posTag, "TO") == 0);
                 if (strcmp($word, "for") == 0) {
                     // TODO: make this case insensitive?
                     $isAfterPreposition = false;
                 }
             }
         }
         if ($streakContainsLocation || $streakContainsNoun) {
             // Build every phrase that ends at word $i, growing it leftwards one
             // word at a time, and register those containing a noun or location.
             $phrase = $word;
             $subStreakContainsNoun = $isNoun;
             $subStreakContainsLocation = $isLocation;
             if ($isNoun || $isLocation) {
                 $newTerm = $termArray->addTerm($phrase, $i);
                 $newTerm->isNoun = $isNoun;
                 $newTerm->isLocation = $isLocation;
                 $newTerm->isAfterPreposition = $isAfterPreposition;
             }
             for ($j = 1; $j < $currentStreak; $j++) {
                 $phrase = $resultPOS[$i - $j][0] . ' ' . $phrase;
                 $subStreakContainsNoun = ($subStreakContainsNoun || strncmp($resultPOS[$i - $j][1], "NN", 2) == 0);
                 $subStreakContainsLocation = ($subStreakContainsLocation || strcmp($resultNER[$i - $j][1], "LOCATION") == 0);
                 if ($subStreakContainsLocation || $subStreakContainsNoun) {
                     $newTerm = $termArray->addTerm($phrase, $i - $j);
                     $newTerm->isNoun = $subStreakContainsNoun;
                     $newTerm->isLocation = $subStreakContainsLocation;
                     $newTerm->isAfterPreposition = $isAfterPreposition;
                 }
             }
         }
     }
     $this->times["Loop through text to find locations"] = microtime(true) - $startTime;

     // Now we remove some terms from the array:
     $startTime = microtime(true);
     if ($termArray->terms) {
         if ($termArray->countLocations() > 0) {
             if (printOutput()) {
                 echo "<br>This text contains words tagged as locations, so we will only consider those words.<br>";
             }
             $termArray->removeNouns(false);
             $this->termTypeUsed = "Locations";
         } else {
             if (printOutput()) {
                 echo "<br>This text does not contain words tagged as locations, so we must only use nouns.<br>";
             }
             if ($termArray->countNounsAfterPrepositions() > 0) {
                 if (printOutput()) {
                     echo "Some nouns occured after prepositions, so we will only use those.<br>";
                 }
                 $termArray->removeNouns(true);
                 $this->termTypeUsed = "NounsAfterPrep";
             } else {
                 $this->termTypeUsed = "Nouns";
             }
         }
     } else {
         echo "Warning: no nouns or locations found in text.";
         $this->termTypeUsed = "None";
     }
     $this->times["Filter terms that are found"] = microtime(true) - $startTime;

     // the rest of this code deals with postal codes
     $startTime = microtime(true);
     $CanadaPostCodes = [];
     $USZipCodes = [];
     $DutchPostCodes = [];
     preg_match_all('/\\b[a-zA-Z][0-9][a-zA-Z][\\s]?[0-9][a-zA-Z][0-9]\\b/', $text, $CanadaPostCodes);
     preg_match_all('/\\b[0-9]{5}([\\s\\-][0-9]{4})?\\b/', $text, $USZipCodes);
     preg_match_all('/\\b[0-9]{4}[\\s]?[a-zA-Z]{2}\\b/', $text, $DutchPostCodes);
     $this->times["Find postal codes in the text"] = microtime(true) - $startTime;

     $startTime = microtime(true);
     // Flags or adds the terms for one country's postal codes. The same logic
     // used to be copy-pasted per country.
     $tagPostcodes = function (array $postcodes, $country) use ($termArray, $words) {
         foreach ($postcodes as $postcode) {
             if (isset($termArray->terms[$postcode])) {
                 // if the postcode already got in another way, we don't add it again
                 foreach ($termArray->terms[$postcode] as $term) {
                     $term->isPostcode = true;
                     $term->postcodeCountry = $country;
                 }
                 continue;
             }
             // all post codes have 1 or 2 words
             $postcodeWords = explode(" ", $postcode);
             // Strict matching: a loose comparison treats numeric strings as
             // numbers, so ZIP "02134" would also match the word "2134".
             $firstWordPositions = array_keys($words, $postcodeWords[0], true);
             if (count($postcodeWords) == 1) {
                 $positions = $firstWordPositions;
             } else {
                 // Keep every position where the second half directly follows
                 // the first half (previously only the first hit was kept
                 // because later hits were appended to a misspelled variable).
                 $positions = [];
                 foreach ($firstWordPositions as $firstWordPosition) {
                     $nextPosition = $firstWordPosition + 1;
                     if (isset($words[$nextPosition]) && $words[$nextPosition] === $postcodeWords[1]) {
                         $positions[] = $firstWordPosition;
                     }
                 }
                 if (!$positions) {
                     // Placeholder position for a two-word code that could not
                     // be located in the tagged words.
                     $positions = [-10];
                 }
             }
             foreach ($positions as $postcodePosition) {
                 $newTerm = $termArray->addTerm($postcode, $postcodePosition);
                 $newTerm->isPostcode = true;
                 $newTerm->postcodeCountry = $country;
             }
         }
     };
     // Index 0 holds the full matches; guard in case preg_match_all() failed.
     $tagPostcodes(isset($CanadaPostCodes[0]) ? $CanadaPostCodes[0] : [], "ca");
     $tagPostcodes(isset($USZipCodes[0]) ? $USZipCodes[0] : [], "us");
     $tagPostcodes(isset($DutchPostCodes[0]) ? $DutchPostCodes[0] : [], "nl");
     $this->times["Update metadata for post codes"] = microtime(true) - $startTime;
     return $termArray;
 }