/**
 * Work out the character encoding of a GEDCOM file from its header record.
 *
 * Detection proceeds in this order:
 *  1. If the record begins with one of the magic strings supplied by
 *     EncodingHelper::utf16MagicStrings(), the associated encoding is returned.
 *  2. Otherwise the value of the header's "1 CHAR" line is extracted, together
 *     with the "2 TYPE" line that directly follows it (if any), giving an
 *     upper-cased name of the form "CHAR" or "CHAR/TYPE". That name is looked up
 *     in EncodingHelper::characterSetsEncodings().
 *  3. If nothing matches, ASCII is assumed.
 *
 * Every outcome is reported through the logger: an exact match is logged as
 * detected; a recognised alias is logged as invalid and the canonical name as
 * assumed; a missing or unknown name is logged as an error and ASCII as assumed.
 *
 * @param string $gedcom_record The raw (not yet normalized) header record.
 *
 * @return EncodingInterface
 */
private function detectEncodingFromHeader(string $gedcom_record) : EncodingInterface
{
    $encoding_helper = new EncodingHelper();

    // UTF encodings are unambiguous
    foreach ($encoding_helper->utf16MagicStrings() as $magic_string => $encoding) {
        // Byte-wise prefix test. strncmp() has no offset argument, so it is
        // well-defined even when the record is empty.
        if (strncmp($gedcom_record, $magic_string, strlen($magic_string)) === 0) {
            $this->logger->info(self::CHARSET_DETECTED, [$encoding::ENCODING_NAME]);

            return $encoding;
        }
    }

    // Use a very loose interpretation of GEDCOM, as this data is not yet normalized.
    preg_match('/^\\s*0+\\s*HEAD(?:ER)?[^\\r\\n]*' . '(?:[\\r\\n]\\s*0*[1-9] [^\\r\\n]*)*' . '(?:[\\r\\n]\\s*0*1 CHAR(?:ACTER)? (?P<CHAR>[^\\r\\n]*))' . '(?:[\\r\\n]\\s*0*2 TYPE (?P<TYPE>[^\\r\\n]*))?' . '/', $gedcom_record, $match);

    // When the pattern does not match, both values fall back to ''.
    $char = trim(strtoupper($match['CHAR'] ?? ''));
    $type = trim(strtoupper($match['TYPE'] ?? ''));

    if ($type !== '') {
        $char .= '/' . $type;
    }

    foreach ($encoding_helper->characterSetsEncodings() as $character_sets_encoding) {
        list($character_sets, $encoding) = $character_sets_encoding;

        // Strict comparison: the header value comes from an untrusted file and
        // must match a known name exactly, without PHP's loose-comparison rules.
        if (in_array($char, $character_sets, true)) {
            if ($char === $encoding::ENCODING_NAME) {
                $this->logger->info(self::CHARSET_DETECTED, [$char]);
            } else {
                // A known alias: accept it, but report the non-canonical name.
                $this->logger->error(self::CHARSET_INVALID, [$char]);
                $this->logger->notice(self::CHARSET_ASSUMED, [$encoding::ENCODING_NAME]);
            }

            return $encoding;
        }
    }

    if ($char === '') {
        $this->logger->error(self::CHARSET_MISSING);
    } else {
        $this->logger->error(self::CHARSET_INVALID, [$char]);
    }

    // Nothing usable was found, so fall back to ASCII.
    $this->logger->notice(self::CHARSET_ASSUMED, [AsciiEncoding::ENCODING_NAME]);

    return new AsciiEncoding();
}
/**
 * Check that characterSetsEncodings() produces the expected
 * [character-set names, encoding] pairs, in order, and nothing afterwards.
 *
 * @covers \Fisharebest\LibGedcom\Encodings\EncodingHelper
 */
public function testCharacterSetsEncodings()
{
    $helper = new EncodingHelper();
    $pairs  = $helper->characterSetsEncodings();

    // The pairs the generator is expected to yield, in sequence.
    $expected_pairs = [
        [['ANSEL'], new AnselEncoding()],
        [['ASCII'], new AsciiEncoding()],
        [['UTF-8', 'UNICODE'], new Utf8Encoding()],
        [['IBMPC', 'IBM', 'IBM-PC', 'OEM'], new Cp437Encoding()],
        [['MSDOS', 'IBM DOS', 'MS-DOS'], new Cp850Encoding()],
        [['WINDOWS-1250'], new Cp1250Encoding()],
        [['WINDOWS-1251'], new Cp1251Encoding()],
        [['ANSI', 'WINDOWS', 'IBM WINDOWS', 'IBM_WINDOWS', 'CP1252', 'ISO-8859-1', 'ISO8859-1', 'ISO8859', 'LATIN1'], new Cp1252Encoding()],
        [['MACINTOSH', 'ASCII/MACOS ROMAN'], new MacintoshEncoding()],
    ];

    foreach ($expected_pairs as $expected_pair) {
        $this->assertEquals($expected_pair, $pairs->current());
        $pairs->next();
    }

    // After the last expected pair there must be no further value.
    $this->assertNull($pairs->current());
}