// Fixture for the test whose header is echoed just before this point:
// each single word plus the overlapping two-word phrases, as [phrase, start position].
$termArray = new TermArray();
$fixtureTerms = [
    ["New", 0],
    ["York", 1],
    ["City", 2],
    ["New York", 0],
    ["York City", 1],
];
foreach ($fixtureTerms as $fixtureTerm) {
    $termArray->addTerm($fixtureTerm[0], $fixtureTerm[1]);
}
$weightHelperResult = $termArray->weightHelper($termArray->getAllTerms());
echo arrayToString($weightHelperResult);
//var_dump($termArray->findGroups());

// Dump both weightings so they can be compared by eye.
echo "<br>Weights:<br>";
var_dump($termArray->getWeights());
echo "<br>Alternate weights:<br>";
var_dump($termArray->getAlternateWeights());
echo "<br><br>";

// Test 10: a four-word phrase together with every contiguous sub-phrase
// that the test lists, again as [phrase, start position].
echo "Test 10: The, New, York, City, New York, York City, New York City, The New, The New York, The New York City<br>Expected: <br>Actual: ";
$termArray = new TermArray();
$fixtureTerms = [
    ["The", 0],
    ["New", 1],
    ["York", 2],
    ["City", 3],
    ["The New", 0],
    ["New York", 1],
    ["York City", 2],
    ["New York City", 1],
    ["The New York", 0],
    ["The New York City", 0],
];
foreach ($fixtureTerms as $fixtureTerm) {
    $termArray->addTerm($fixtureTerm[0], $fixtureTerm[1]);
}
$weightHelperResult = $termArray->weightHelper($termArray->getAllTerms());
echo arrayToString($weightHelperResult);
echo "<br>Weights:<br>";
var_dump($termArray->getWeights());
echo "<br><br>";
/**
 * Extracts candidate terms (locations, nouns and postal codes) from a text.
 *
 * Steps:
 *  1. Strip a fixed list of markup tags that confuse the taggers.
 *  2. Split the text on '.' and ' ' and run it through the Stanford POS and
 *     NER taggers.
 *  3. Walk the tagged words; inside every run of nouns, adjectives/numbers
 *     and LOCATION-tagged words, add each phrase ending at the current word
 *     that contains at least one noun or location.
 *  4. Filter the terms via TermArray::removeNouns() depending on whether any
 *     locations, or nouns following a preposition, were found.
 *  5. Find Canadian, US and Dutch post codes by regex and flag matching
 *     terms, or add new terms for them.
 *
 * Side effects: stores timings in $this->times, sets $this->termTypeUsed,
 * echoes diagnostics (when printOutput() is true) and error/warning messages,
 * and declares the function splitIntoWords() if it does not exist yet.
 *
 * @param string $text Raw text to analyse.
 *
 * @return TermArray|false The collected terms, or false when a tagger
 *                         returned nothing or the two taggers produced
 *                         differently indexed results.
 */
function run($text)
{
    // The following appear in some Wikipedia texts and mess up the parsing:
    $textsToRemove = ["<br>", "</table>", "</dl>", "</ref>", "<ns>", "</ns>", "<id>", "</id>", "<small>", "<revision>", "<comment>", "</comment>", "<model>", "</model>", "<parentid>", "</parentid>"];
    $text = str_replace($textsToRemove, "", $text);

    $currentDir = getcwd(); // needed for absolute paths to Stanford models
    $termArray = new TermArray();

    if (!function_exists("splitIntoWords")) {
        // The POS and NER taggers need an array of arrays, where each sentence is
        // its own array.
        function splitIntoWords($sentence)
        {
            return explode(' ', $sentence);
        }
    }
    $text_arrays = array_map("splitIntoWords", explode('.', $text));

    // Send the text to the POS tagger:
    $pos = new \StanfordNLP\POSTagger(
        $currentDir . '/stanford-postagger-2015-04-20/models/english-left3words-distsim.tagger',
        $currentDir . '/stanford-postagger-2015-04-20/stanford-postagger.jar'
    );
    $startTime = microtime(true);
    $resultPOS = $pos->batchTag($text_arrays)[0];
    $this->times["Run the POS tagger"] = microtime(true) - $startTime;
    if (printOutput()) {
        echo "<br>POS results:<br>";
        var_dump($resultPOS);
        echo "<br>";
    }
    if (!$resultPOS) {
        echo "<br>ERROR: POS tagging failed<br>";
        return false;
    }

    // Send the text to the NER:
    $ner = new \StanfordNLP\NERTagger(
        $currentDir . '/stanford-ner-2015-04-20/classifiers/english.all.3class.distsim.crf.ser.gz',
        $currentDir . '/stanford-ner-2015-04-20/stanford-ner.jar'
    );
    $startTime = microtime(true);
    $resultNER = $ner->batchTag($text_arrays)[0];
    $this->times["Run the NER tagger"] = microtime(true) - $startTime;
    if (printOutput()) {
        echo "<br>NER results:<br>";
        var_dump($resultNER);
        echo "<br><br>";
    }
    if (!$resultNER) {
        echo "<br>ERROR: NER tagging failed<br>";
        return false;
    }

    // Later code assumes that $resultPOS and $resultNER are indexed identically.
    // I have only seen these errors returned when the text contains something like
    // "<br>" which is handled differently by each tagger
    if (count($resultPOS) != count($resultNER)) {
        echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
        return false;
    }
    $words = [];
    for ($i = 0, $size = count($resultNER); $i < $size; $i++) {
        if ($resultPOS[$i][0] != $resultNER[$i][0]) {
            echo "<br>ERROR: POS and NER tagging are not indexed the same!<br>";
            return false;
        }
        $words[$i] = $resultNER[$i][0];
    }

    // The next bunch of code loops through the text to find all terms
    $startTime = microtime(true);
    $currentStreak = 0;               // length of the current run of noun/adjective/number/location words
    $streakContainsLocation = false;
    $streakContainsNoun = false;
    $isAfterPreposition = false;
    $isAfterConjunction = false;
    for ($i = 0, $size = count($resultPOS); $i < $size; $i++) {
        $isNoun = strncmp($resultPOS[$i][1], "NN", 2) == 0;
        $isAdjectiveOrNumber = (strcmp($resultPOS[$i][1], "CD") == 0
            or strcmp($resultPOS[$i][1], "JJ") == 0); // adjective (so "first avenue" would catch the first)
        $isLocation = strcmp($resultNER[$i][1], "LOCATION") == 0;

        if ($isNoun or $isAdjectiveOrNumber or $isLocation) {
            $currentStreak++;
            if ($isLocation) {
                $streakContainsLocation = true;
            }
            if ($isNoun) {
                $streakContainsNoun = true;
            }
        } else {
            // Any other word ends the current streak.
            $streakContainsLocation = false;
            $streakContainsNoun = false;
            $currentStreak = 0;
            $isAfterConjunction = strcmp($resultPOS[$i][1], "CC") == 0;
            if (!$isAfterConjunction) {
                // reset $isAfterPreposition only if this is not after a conjunction
                // that way a text like "near Waterloo and Guelph" will tag both Waterloo
                // and Guelph as after a preposition
                $isAfterPreposition = (strcmp($resultPOS[$i][1], "IN") == 0
                    or strcmp($resultPOS[$i][1], "TO") == 0);
                if (strcmp($resultPOS[$i][0], "for") == 0) { // TODO: make this case insensitive?
                    $isAfterPreposition = false;
                }
            }
        }

        if ($streakContainsLocation or $streakContainsNoun) {
            // Start with the current word on its own, then grow the phrase
            // backwards one word at a time through the current streak.
            $phrase = $resultPOS[$i][0];
            $subStreakContainsNoun = $isNoun;
            $subStreakContainsLocation = $isLocation;
            if ($isNoun or $isLocation) {
                $newTerm = $termArray->addTerm($phrase, $i);
                $newTerm->isNoun = $isNoun;
                $newTerm->isLocation = $isLocation;
                $newTerm->isAfterPreposition = $isAfterPreposition;
            }
            for ($j = 1; $j < $currentStreak; $j++) {
                $phrase = $resultPOS[$i - $j][0] . ' ' . $phrase;
                $subStreakContainsNoun = ($subStreakContainsNoun
                    or strncmp($resultPOS[$i - $j][1], "NN", 2) == 0);
                $subStreakContainsLocation = ($subStreakContainsLocation
                    or strcmp($resultNER[$i - $j][1], "LOCATION") == 0);
                if ($subStreakContainsLocation or $subStreakContainsNoun) {
                    $newTerm = $termArray->addTerm($phrase, $i - $j);
                    $newTerm->isNoun = $subStreakContainsNoun;
                    $newTerm->isLocation = $subStreakContainsLocation;
                    $newTerm->isAfterPreposition = $isAfterPreposition;
                }
            }
        }
    }
    $this->times["Loop through text to find locations"] = microtime(true) - $startTime;

    // Now we remove some terms from the array:
    $startTime = microtime(true);
    if ($termArray->terms) {
        if ($termArray->countLocations() > 0) {
            if (printOutput()) {
                echo "<br>This text contains words tagged as locations, so we will only consider those words.<br>";
            }
            $termArray->removeNouns(false);
            $this->termTypeUsed = "Locations";
        } else {
            if (printOutput()) {
                echo "<br>This text does not contain words tagged as locations, so we must only use nouns.<br>";
            }
            if ($termArray->countNounsAfterPrepositions() > 0) {
                if (printOutput()) {
                    echo "Some nouns occured after prepositions, so we will only use those.<br>";
                }
                $termArray->removeNouns(true);
                $this->termTypeUsed = "NounsAfterPrep";
            } else {
                $this->termTypeUsed = "Nouns";
            }
        }
    } else {
        echo "Warning: no nouns or locations found in text.";
        $this->termTypeUsed = "None";
    }
    $this->times["Filter terms that are found"] = microtime(true) - $startTime;

    // the rest of this code deals with postal codes
    $startTime = microtime(true);
    $CanadaPostCodes = [];
    $USZipCodes = [];
    $DutchPostCodes = [];
    preg_match_all('/\\b[a-zA-Z][0-9][a-zA-Z][\\s]?[0-9][a-zA-Z][0-9]\\b/', $text, $CanadaPostCodes);
    preg_match_all('/\\b[0-9]{5}([\\s\\-][0-9]{4})?\\b/', $text, $USZipCodes);
    preg_match_all('/\\b[0-9]{4}[\\s]?[a-zA-Z]{2}\\b/', $text, $DutchPostCodes);
    $this->times["Find postal codes in the text"] = microtime(true) - $startTime;

    $startTime = microtime(true);
    // Shared handling for all three countries: flag terms that already exist
    // for a post code, otherwise add a term at every position in $words where
    // the post code occurs.
    $registerPostcodes = function ($postcodes, $country) use ($termArray, $words) {
        foreach ($postcodes as $postcode) {
            if (isset($termArray->terms[$postcode])) {
                // if the postcode already got in another way, we don't add it again
                foreach ($termArray->terms[$postcode] as $term) {
                    $term->isPostcode = true;
                    $term->postcodeCountry = $country;
                }
                continue;
            }

            $postcodeWords = explode(" ", $postcode); // all post codes have 1 or 2 words
            // Strict matching so numeric strings such as "02139" and "2139"
            // are not treated as the same word.
            $firstWordPositions = array_keys($words, $postcodeWords[0], true);
            if (count($postcodeWords) == 1) {
                $positions = $firstWordPositions;
            } else {
                $positions = [];
                foreach ($firstWordPositions as $firstWordPosition) {
                    $nextIndex = $firstWordPosition + 1;
                    // isset() guards against the first word being the last token.
                    if (isset($words[$nextIndex]) && $words[$nextIndex] === $postcodeWords[1]) {
                        $positions[] = $firstWordPosition;
                    }
                }
                if (!$positions) {
                    // The regex matched the post code, but it could not be located
                    // in the tagged words; keep the placeholder position -10.
                    $positions = [-10];
                }
            }

            foreach ($positions as $postcodePosition) {
                $newTerm = $termArray->addTerm($postcode, $postcodePosition);
                $newTerm->isPostcode = true;
                $newTerm->postcodeCountry = $country;
            }
        }
    };
    $registerPostcodes($CanadaPostCodes[0], "ca");
    $registerPostcodes($USZipCodes[0], "us");
    $registerPostcodes($DutchPostCodes[0], "nl");
    $this->times["Update metadata for post codes"] = microtime(true) - $startTime;

    return $termArray;
}