/**
 * Every .eml fixture in the example_messages directory must be reduced to
 * the text "Email Reply" by the extractor.
 */
public function testExampleMessages()
{
    $examples_dir = dirname(dirname(dirname(__DIR__))) . '/example_messages';

    foreach (new DirectoryIterator($examples_dir) as $entry) {
        // Only regular files with the .eml extension are fixtures.
        $is_fixture = !$entry->isDot()
            && !$entry->isDir()
            && $entry->isFile()
            && $entry->getExtension() === 'eml';

        if (!$is_fixture) {
            continue;
        }

        $this->assertEquals('Email Reply', EmailReplyExtractor::extractReply($entry->getPathname()));
    }
}
/**
 * Run every .eml fixture in the example_messages directory through the
 * extractor and compare the result with the text expected for that file.
 */
public function testExampleMessages()
{
    $examples_dir = dirname(dirname(dirname(__DIR__))) . '/example_messages';

    foreach (new DirectoryIterator($examples_dir) as $entry) {
        // Skip everything that is not a regular .eml file.
        if ($entry->isDot() || $entry->isDir() || !$entry->isFile() || $entry->getExtension() !== 'eml') {
            continue;
        }

        $expected = $this->setExpectedTextByFileName($entry->getFilename());
        $this->assertEquals($expected, EmailReplyExtractor::extractReplyEml($entry->getPathname()));

        // The comparison must also hold once line breaks are rendered as <br /> tags.
        $this->assertEquals(
            nl2br($expected),
            nl2br(EmailReplyExtractor::extractReplyEml($entry->getPathname()))
        );
    }
}
/**
 * Convert an HTML fragment into a plain-text approximation.
 *
 * Processing order matters and is kept as a fixed pipeline:
 *  1. slashes, carriage returns, script/style blocks and inline style attributes are removed;
 *  2. a fixed list of HTML entities is replaced by ASCII-ish equivalents, every other entity is dropped;
 *  3. structural tags (p, div, headings, lists, tables, br, hr, img, a, blockquote) are
 *     rewritten as text markers;
 *  4. all remaining tags are stripped and runs of blank lines are collapsed.
 *
 * @param string $html
 * @return string
 */
public static function toPlainText($html)
{
    $plain = (string) $html;

    // strip slashes
    $plain = (string) trim(stripslashes($plain));

    // strip unnecessary characters
    $plain = (string) preg_replace(
        [
            "/\r/",
            "/<script[^>]*>.*?<\\/script>/si",
            "/<style[^>]*>.*?<\\/style>/is",
            "/style=\".*?\"/",
        ],
        '',
        $plain
    );

    // entities to convert (this is not a definite list); key = replacement, value = entities it stands for
    $entities = [
        ' ' => ['&nbsp;', '&#160;'],
        '"' => ['&quot;', '&rdquo;', '&ldquo;', '&#8220;', '&#8221;', '&#147;', '&#148;'],
        '\'' => ['&apos;', '&rsquo;', '&lsquo;', '&#8216;', '&#8217;'],
        '>' => ['&gt;'],
        '<' => ['&lt;'],
        '&' => ['&amp;', '&#38;'],
        '(c)' => ['&copy;', '&#169;'],
        '(R)' => ['&reg;', '&#174;'],
        '(tm)' => ['&trade;', '&#8482;', '&#153;'],
        '--' => ['&mdash;', '&#151;', '&#8212;'],
        '-' => ['&ndash;', '&minus;', '&#8211;', '&#8722;'],
        '*' => ['&bull;', '&#149;', '&#8226;'],
        "\xC2\xA3" => ['&pound;', '&#163;'], // UTF-8 pound sign, written as bytes to survive re-encoding
        'EUR' => ['&euro;', '&#8364;'],
    ];

    // convert specified entities
    foreach ($entities as $character => $entity) {
        $plain = (string) str_replace($entity, $character, $plain);
    }

    // strip other not previously converted entities
    $plain = (string) preg_replace(['/&[^&;]+;/si'], '', $plain);

    // <p> converts to 2 newlines
    $plain = (string) preg_replace('/<p[^>]*>/i', "\n\n", $plain);

    // new line after <div>
    $plain = (string) preg_replace('/<div[^>]*>/i', "\n", $plain);

    // <h1>-<h6>: uppercase the heading text and surround it with blank lines
    $plain = (string) preg_replace_callback(
        '/<h[123456][^>]*>(.*?)<\\/h[123456]>/i',
        function ($matches) {
            return "\n\n" . mb_strtoupper($matches[1]) . "\n\n";
        },
        $plain
    );

    // <b> <strong>: keep the inner text only
    $plain = (string) preg_replace_callback(
        ['/<b[^>]*>(.*?)<\\/b>/i', '/<strong[^>]*>(.*?)<\\/strong>/i'],
        function ($matches) {
            return $matches[1];
        },
        $plain
    );

    // <i> <em>: wrap the inner text in underscores
    $plain = (string) preg_replace(['/<i[^>]*>(.*?)<\\/i>/i', '/<em[^>]*>(.*?)<\\/em>/i'], '_\\1_', $plain);

    // <ul> <ol> <table>: opening and closing tags convert to 2 newlines
    $plain = (string) preg_replace(
        ['/(<ul[^>]*>|<\\/ul>)/i', '/(<ol[^>]*>|<\\/ol>)/i', '/(<table[^>]*>|<\\/table>)/i'],
        "\n\n",
        $plain
    );

    // <br> <tr>: convert to a single newline
    $plain = (string) preg_replace(['/<br[^>]*>/i', '/(<tr[^>]*>|<\\/tr>)/i'], "\n", $plain);

    // <img>: keep the source URL as a text marker
    $plain = (string) preg_replace(['/<img\\s+[^>]*src="([^"]*)"[^>]*>/i'], "[Image: \\1]", $plain);

    // <hr> converts to a dashed separator line
    $plain = (string) preg_replace('/<hr[^>]*>/i', "\n-------------------------\n", $plain);

    // <td>: one tab-indented line per cell
    $plain = (string) preg_replace('/<td[^>]*>(.*?)<\\/td>/i', "\t\\1\n", $plain);

    // <th>: uppercase the cell text (capture group 1, not the whole tag) and indent with two tabs
    $plain = (string) preg_replace_callback(
        '/<th[^>]*>(.*?)<\\/th>/i',
        function ($matches) {
            return "\t\t" . mb_strtoupper($matches[1]) . "\n";
        },
        $plain
    );

    // <li>with content</li>
    $plain = (string) preg_replace('/<li[^>]*>(.*?)<\\/li>/i', "* \\1\n", $plain);

    // <li> without a closing tag
    $plain = (string) preg_replace('/<li[^>]*>/i', "\n* ", $plain);

    // <a href="$url">$text</a>: append the target for http(s) and mailto links, otherwise keep the text only
    $plain = (string) preg_replace_callback(
        '/<a [^>]*href="([^"]+)"[^>]*>(.*?)<\\/a>/i',
        function ($matches) {
            $url = $matches[1];
            $text = $matches[2];

            if (EmailReplyExtractor::strStartsWith($url, 'http://') || EmailReplyExtractor::strStartsWith($url, 'https://')) {
                return "{$text} [{$url}]";
            }

            if (EmailReplyExtractor::strStartsWith($url, 'mailto:')) {
                // 7 === strlen('mailto:')
                return $text . ' [' . substr($url, 7) . ']';
            }

            return $text;
        },
        $plain
    );

    // <blockquote>: prefix every line of the quoted content with "> "
    $plain = (string) preg_replace_callback(
        '/<blockquote[^>]*>(.*?)<\\/blockquote>/is',
        function ($matches) {
            $content = isset($matches[1]) ? $matches[1] : '';

            $quoted = array_map(
                function ($line) {
                    return '> ' . $line;
                },
                explode("\n", $content)
            );

            return "\n\n" . implode("\n", $quoted) . "\n\n";
        },
        $plain
    );

    // remove unnecessary title tag
    $plain = (string) preg_replace('/<title[^>]*>(.*?)<\\/title>/i', '', $plain);

    // strip other tags
    $plain = (string) strip_tags($plain);

    // clean up unnecessary newlines
    $plain = (string) preg_replace("/\n\\s+\n/", "\n\n", $plain);
    $plain = (string) preg_replace("/[\n]{3,}/", "\n\n", $plain);

    return trim($plain);
}
/**
 * A Hotmail "received" header must be detected as EmailReplyExtractor::HOTMAIL.
 */
public function testDetectHotmail()
{
    $headers = [
        'received' => "from BAY174-W36 ([65.54.190.187]) by bay0-omc3-s10.bay0.hotmail.com with Microsoft SMTPSVC(6.0.3790.4675);\n Wed, 22 Jan 2014 18:02:13 -0800",
    ];

    $detected_mailer = EmailReplyExtractor::detectMailer($headers);

    $this->assertEquals(EmailReplyExtractor::HOTMAIL, $detected_mailer);
}
/**
 * The given "message-id" header must be detected as EmailReplyExtractor::MAIL_RU_MAIL.
 */
public function testDetectMailRu()
{
    $headers = [
        'message-id' => '<*****@*****.**>',
    ];

    $detected_mailer = EmailReplyExtractor::detectMailer($headers);

    $this->assertEquals(EmailReplyExtractor::MAIL_RU_MAIL, $detected_mailer);
}