/**
 * Check every entry returned by EncodingHelper::utf16MagicStrings():
 * the key must be a string, the value must be a UTF encoding object, and
 * converting the key with that encoding must yield either an empty string
 * or the text "0 HEAD".
 *
 * @covers \Fisharebest\LibGedcom\Encodings\EncodingHelper::utf16MagicStrings
 */
public function testUtf16MagicStrings()
{
    $encoding_helper = new EncodingHelper();

    foreach ($encoding_helper->utf16MagicStrings() as $key => $value) {
        // The key is fed to toUtf8() below, so it has to be a string.
        $this->assertTrue(is_string($key));
        $this->assertInstanceOf(AbstractEncodingUtf::class, $value);

        // Decode the magic string with the encoding it is mapped to.
        $convert = $value->toUtf8($key);
        $this->assertRegExp('/^(|0 HEAD)$/', $convert);
    }
}
/**
 * Choose the encoding object to use for a block of raw GEDCOM data.
 *
 * Detection is attempted in this order:
 *  1. The data starts with one of the magic strings from
 *     EncodingHelper::utf16MagicStrings() - the mapped encoding is returned.
 *  2. The header contains a "1 CHAR" line (optionally followed by a "2 TYPE"
 *     line) whose upper-cased, trimmed value is listed in
 *     EncodingHelper::characterSetsEncodings() - the mapped encoding is
 *     returned.  If the value is not the encoding's own ENCODING_NAME, it is
 *     logged as invalid and the encoding is logged as assumed.
 *  3. Otherwise the missing/invalid value is logged and ASCII is assumed.
 *
 * @param string $gedcom_record Raw (not yet normalized) data from the start of the file
 *
 * @return EncodingInterface
 */
private function detectEncodingFromHeader(string $gedcom_record) : EncodingInterface
{
    $encoding_helper = new EncodingHelper();

    // UTF encodings are unambiguous
    foreach ($encoding_helper->utf16MagicStrings() as $magic_string => $encoding) {
        // Plain prefix test.  strncmp() is used rather than substr_compare()
        // because older PHP versions raise a warning from substr_compare()
        // when the offset equals the string length, i.e. for an empty record.
        if (strncmp($gedcom_record, $magic_string, strlen($magic_string)) === 0) {
            $this->logger->info(self::CHARSET_DETECTED, [$encoding::ENCODING_NAME]);

            return $encoding;
        }
    }

    // Use a very loose interpretation of GEDCOM, as this data is not yet normalized.
    // Pattern parts, in order: the level-0 HEAD line; any number of level 1-9
    // lines; the "1 CHAR" line; an optional "2 TYPE" line directly after it.
    preg_match(
        '/^\\s*0+\\s*HEAD(?:ER)?[^\\r\\n]*' .
        '(?:[\\r\\n]\\s*0*[1-9] [^\\r\\n]*)*' .
        '(?:[\\r\\n]\\s*0*1 CHAR(?:ACTER)? (?P<CHAR>[^\\r\\n]*))' .
        '(?:[\\r\\n]\\s*0*2 TYPE (?P<TYPE>[^\\r\\n]*))?' .
        '/',
        $gedcom_record,
        $match
    );

    // Groups that did not take part in the match are absent, hence the defaults.
    $char = trim(strtoupper($match['CHAR'] ?? ''));
    $type = trim(strtoupper($match['TYPE'] ?? ''));

    // A TYPE value is appended to the character set name as "CHAR/TYPE".
    if ($type !== '') {
        $char .= '/' . $type;
    }

    foreach ($encoding_helper->characterSetsEncodings() as $character_sets_encoding) {
        list($character_sets, $encoding) = $character_sets_encoding;

        if (in_array($char, $character_sets)) {
            if ($char === $encoding::ENCODING_NAME) {
                $this->logger->info(self::CHARSET_DETECTED, [$char]);
            } else {
                // Listed under this encoding, but not its own name.
                $this->logger->error(self::CHARSET_INVALID, [$char]);
                $this->logger->notice(self::CHARSET_ASSUMED, [$encoding::ENCODING_NAME]);
            }

            return $encoding;
        }
    }

    // Nothing matched: report why, then fall back to ASCII.
    if ($char === '') {
        $this->logger->error(self::CHARSET_MISSING);
    } else {
        $this->logger->error(self::CHARSET_INVALID, [$char]);
    }

    $this->logger->notice(self::CHARSET_ASSUMED, [AsciiEncoding::ENCODING_NAME]);

    return new AsciiEncoding();
}