/**
 * Feeds a fixed sample through Filter_Hyphenate::atTransition() and asserts
 * that the result equals the expected text in $standard.
 *
 * The sample mixes backslash markers (\s, \nd ... \nd*, \add ... \add*) with
 * Latin, CJK and Hebrew text. The character sets passed to the filter are read
 * from $this->firstset and $this->secondset, which are set outside this method.
 *
 * NOTE(review): in plain rendering $standard looks identical to $input;
 * presumably it differs only by invisible soft hyphen characters - confirm in
 * an editor that shows them.
 * NOTE(review): both nowdoc literals appear collapsed onto a single line in
 * this view; confirm the original line breaks before editing the fixtures.
 */
public function testOne() { $input = <<<'EOD' \s \nd UNkulunkulu\nd* u\add ba\add*xwayisa ngokulunga okungokwabantu 文字ab化け \s Ukulunga okuku\nd Kristu\nd* אבabגד kuyinzuzo אבגד ab EOD; $output = Filter_Hyphenate::atTransition($this->firstset, $this->secondset, $input); $standard = <<<'EOD' \s \nd UNkulunkulu\nd* u\add ba\add*xwayisa ngokulunga okungokwabantu 文字ab化け \s Ukulunga okuku\nd Kristu\nd* אבabגד kuyinzuzo אבגד ab EOD; $this->assertEquals($standard, $output); }
// Script: copies a Bible into "<name>-hyphenated", running every chapter
// through Filter_Hyphenate::atTransition() on the way.

// Database singletons used below.
$database_logs = Database_Logs::getInstance();
$database_books = Database_Books::getInstance();
$database_config_bible = Database_Config_Bible::getInstance();
$database_bibles = Database_Bibles::getInstance();
$database_users = Database_Users::getInstance();

// Command line: argument 1 is the Bible to read,
// argument 2 is the user who is granted access to the result.
$inputBible = Filter_Cli::argument(@$argv, 1);
$user = Filter_Cli::argument(@$argv, 2);
$outputBible = "{$inputBible}-hyphenated";

$database_logs->log("Reading Bible {$inputBible}, adding soft hyphens, putting it into Bible {$outputBible}");

// Turn the two configured character sets into arrays of single characters.
// The /u modifier makes the pattern match whole UTF-8 characters instead of bytes.
$firstSetText = $database_config_bible->getHyphenationFirstSet($inputBible);
preg_match_all('/./u', $firstSetText, $matches);
$firstset = $matches[0];

$secondSetText = $database_config_bible->getHyphenationSecondSet($inputBible);
preg_match_all('/./u', $secondSetText, $matches);
$secondset = $matches[0];

// Start from a clean output Bible and give the user access to it.
$database_bibles->deleteBible($outputBible);
$database_bibles->createBible($outputBible);
$database_users->grantAccess2Bible($user, $outputBible);

// Hyphenate chapter by chapter, logging each book's English name as it starts.
$books = $database_bibles->getBooks($inputBible);
foreach ($books as $book) {
    $bookName = $database_books->getEnglishFromId($book);
    $database_logs->log($bookName);

    $chapters = $database_bibles->getChapters($inputBible, $book);
    foreach ($chapters as $chapter) {
        $data = $database_bibles->getChapter($inputBible, $book, $chapter);
        $data = Filter_Hyphenate::atTransition($firstset, $secondset, $data);
        $database_bibles->storeChapter($outputBible, $book, $chapter, $data);
    }
}

$database_logs->log("The Bible has been hyphenated");
/**
 * This filter inserts soft hyphens in $text.
 *
 * The text is split on "\n" and each line is processed character by character.
 * At the transition from any character in $firstset to any character in
 * $secondset, a soft hyphen is placed in front of the second character,
 * unless Filter_Hyphenate::nearWhiteSpace() reports the position as being
 * near white space.
 *
 * A USFM marker - from a backslash up to and including the next space or
 * asterisk - is skipped: nothing is inserted inside it, and the state carried
 * from the character before the marker is left untouched.
 *
 * A line that cannot be split into UTF-8 characters is passed through unchanged.
 *
 * $firstset: array of characters.
 * $secondset: array of characters.
 * $text: A string of text to operate on.
 * Returns: The hyphenated $text. If either set is not an array or is empty,
 * $text is returned as it was given. Otherwise, if $text is not a string,
 * an empty string is returned.
 */
public static function atTransition($firstset, $secondset, $text) {
    // Verify the input. The order of these checks decides what is returned
    // for invalid combinations, so it is kept as is.
    if (!is_array($firstset) || count($firstset) == 0) {
        return $text;
    }
    if (!is_array($secondset) || count($secondset) == 0) {
        return $text;
    }
    if (!is_string($text)) {
        return "";
    }

    // Split the text up into lines and go through each one.
    $lines = explode("\n", $text);
    foreach ($lines as $index => $line) {
        // Nothing to do for an empty line.
        if ($line === "") {
            continue;
        }

        // Split the line up into an array of UTF-8 characters.
        // The empty pattern with the /u modifier splits between every character.
        $characters = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if ($characters === false) {
            // The line could not be split (e.g. invalid UTF-8):
            // leave it as it is rather than losing its content.
            continue;
        }

        // Processor flags.
        $previousCharacterWasRelevant = false;
        $isUsfm = false;

        // Process each character. $character always holds the original
        // character; insertions are written to $characters[$key].
        foreach ($characters as $key => $character) {
            // A backslash starts a USFM marker.
            if ($character === "\\") {
                $isUsfm = true;
            }

            if ($isUsfm) {
                // A space or an asterisk ends the marker. The terminating
                // character itself still belongs to the marker.
                if ($character === " " || $character === "*") {
                    $isUsfm = false;
                }
                continue;
            }

            // Check whether to insert the soft hyphen here.
            // nearWhiteSpace() receives the array including earlier insertions.
            if ($previousCharacterWasRelevant
                && in_array($character, $secondset, true)
                && !Filter_Hyphenate::nearWhiteSpace($characters, $key)
            ) {
                $characters[$key] = Filter_Character::softHyphen() . $character;
            }

            // Flag for next iteration. This tests the original character, not
            // the one with a soft hyphen prefixed, so that a character which
            // just received a hyphen still counts as a member of $firstset.
            $previousCharacterWasRelevant = in_array($character, $firstset, true);
        }

        // Re-assemble the line from the separate (updated) characters.
        $lines[$index] = implode("", $characters);
    }

    // Assemble the hyphenated text from the separate lines.
    return implode("\n", $lines);
}