/**
 * {@inheritdoc}
 *
 * Consumes every consecutive character found in self::CHARS, emits a single
 * WhitespaceToken for the consumed run and hands control back to TextState.
 */
protected function call(Lexer $lexer)
{
    // Keep advancing for as long as the upcoming character is listed in self::CHARS.
    for (;;) {
        if (!in_array($lexer->peek(), self::CHARS, true)) {
            break;
        }

        $lexer->next();
    }

    $lexer->emit(new WhitespaceToken());

    return new TextState();
}
/**
 * Runs the lexer over a sentence containing abbreviations ("Mr.", "T.V.")
 * and compares the string form of the produced tokens with the expected list.
 */
public function testAbbreviations()
{
    $tokens = (new Lexer())->run('Hello Mr. Jones, please turn on the T.V.');

    $this->assertEquals(
        [
            'T_CAPITALIZED_WORD<"Hello">',
            'T_CAPITALIZED_WORD<"Mr">',
            'T_PERIOD<".">',
            'T_CAPITALIZED_WORD<"Jones,">',
            'T_WORD<"please">',
            'T_WORD<"turn">',
            'T_WORD<"on">',
            'T_WORD<"the">',
            'T_CAPITALIZED_WORD<"T">',
            'T_PERIOD<".">',
            'T_CAPITALIZED_WORD<"V">',
            'T_PERIOD<".">',
            'T_EOF',
        ],
        $this->getTokensString($tokens)
    );
}
/**
 * {@inheritdoc}
 *
 * Looks at the upcoming character and decides what to do with it:
 *  - null: emits an EOFToken and returns without a follow-up state;
 *  - '.', '?' or '!': consumes the character, emits the matching token and
 *    keeps looping in this state;
 *  - a character from QuotedStringState::CHARS: returns a QuotedStringState;
 *  - a character from WhitespaceState::CHARS: returns a WhitespaceState;
 *  - anything else: returns a WordState.
 */
protected function call(Lexer $lexer)
{
    // Single-character punctuation and the token class emitted for each one.
    $punctuation = [
        '.' => PeriodToken::class,
        '?' => QuestionMarkToken::class,
        '!' => ExclamationPointToken::class,
    ];

    for (;;) {
        $char = $lexer->peek();

        if (null === $char) {
            $lexer->emit(new EOFToken());

            return;
        }

        foreach ($punctuation as $symbol => $tokenClass) {
            if ($symbol === $char) {
                // Consume the character first, then emit its token.
                $lexer->next();
                $lexer->emit(new $tokenClass());

                continue 2;
            }
        }

        if (in_array($char, QuotedStringState::CHARS, true)) {
            return new QuotedStringState();
        }

        if (in_array($char, WhitespaceState::CHARS, true)) {
            return new WhitespaceState();
        }

        return new WordState();
    }
}
/**
 * {@inheritdoc}
 *
 * Reads the opening character, then keeps consuming until the same character
 * is read again. Emits a QuotedStringToken and returns a TextState.
 *
 * @throws StateException When the input ends before the closing character is found.
 */
protected function call(Lexer $lexer)
{
    // The first character consumed is the delimiter that must be matched again.
    $delimiter = $lexer->next();

    do {
        $current = $lexer->next();

        if (null === $current) {
            throw new StateException('Failed to find end of quote. Reached end of input. Read: ' . $lexer->getTokenValue());
        }
    } while ($current !== $delimiter);

    $lexer->emit(new QuotedStringToken());

    return new TextState();
}
/**
 * Splits the given text into sentences.
 *
 * The abbreviations from getAbbreviations() are handed to the probability
 * calculator, the text is run through the lexer, the resulting tokens are
 * passed to the calculator and the sentence builder assembles the result.
 *
 * @param string $text
 *
 * @return string[]
 */
public function split($text)
{
    $calculator = $this->probabilityCalculator;
    $calculator->setAbbreviations($this->getAbbreviations());

    $probabilities = $calculator->calculate($this->lexer->run($text));

    return $this->sentenceBuilder->build($probabilities);
}
/**
 * Lexes the input, calculates probabilities with the rules loaded from
 * rules.ini and the given abbreviations, and asserts that the
 * "<token name> <probability>" strings of all tokens other than words,
 * capitalized words, whitespace and EOF equal the expected result.
 *
 * @param string $input
 * @param array  $expectedResult
 * @param array  $abbreviations
 */
private function runCalculateTest($input, array $expectedResult, array $abbreviations)
{
    $tokens = (new Lexer())->run($input);

    $calculator = new ProbabilityCalculator(
        IniConfiguration::loadFile(__DIR__ . '/../../rules/rules.ini')->getRules()
    );
    $calculator->setAbbreviations(new Abbreviations($abbreviations));

    // Token classes that are left out of the comparison.
    $ignoredClasses = [
        WordToken::class,
        CapitalizedWordToken::class,
        WhitespaceToken::class,
        EOFToken::class,
    ];

    $actual = [];
    foreach ($calculator->calculate($tokens) as $entry) {
        $token = $entry->getToken();

        foreach ($ignoredClasses as $ignoredClass) {
            if ($token instanceof $ignoredClass) {
                continue 2;
            }
        }

        $actual[] = $token->getName() . ' ' . $entry->getProbability();
    }

    $this->assertEquals($expectedResult, $actual);
}
/**
 * {@inheritdoc}
 *
 * Consumes characters until the upcoming one is listed in getNonWordChars(),
 * then emits a CapitalizedWordToken when the collected value starts with an
 * uppercase (or titlecase) letter and a WordToken otherwise. Always returns
 * a TextState.
 *
 * The capitalization check is Unicode-aware, so words such as "École" or
 * "Über" are recognised as capitalized; a byte-wise ctype_upper() check only
 * sees the first byte of a multi-byte character.
 */
protected function call(Lexer $lexer)
{
    $nonWordChars = $this->getNonWordChars();

    // NOTE(review): this loop only ends once peek() yields a value contained
    // in getNonWordChars(); presumably that list covers end of input — confirm.
    while (!in_array($lexer->peek(), $nonWordChars, true)) {
        $lexer->next();
    }

    $value = (string) $lexer->getTokenValue();

    // \p{Lu} = uppercase letter, \p{Lt} = titlecase letter; the "u" modifier
    // makes PCRE treat the subject as UTF-8.
    $startsUppercase = preg_match('/^[\p{Lu}\p{Lt}]/u', $value);

    if (false === $startsUppercase) {
        // preg_match() failed (e.g. the value is not valid UTF-8): fall back
        // to the byte-wise check on the first byte.
        $startsUppercase = ctype_upper(substr($value, 0, 1)) ? 1 : 0;
    }

    if (1 === $startsUppercase) {
        $lexer->emit(new CapitalizedWordToken());
    } else {
        $lexer->emit(new WordToken());
    }

    return new TextState();
}