parseString() public method

Parses an HTML string and converts it to Markdown.
public parseString ( string $html ) : string
$html string
Returns: string, the Markdown-formatted text
コード例 #1
0
 /**
  * Transforms an HTML value into normalized Markdown.
  *
  * The HTML is first cleaned up (non-breaking spaces, leading indentation,
  * purification, whitespace around tags, stray <br> tags, markup inside
  * headings, duplicated bullets), then handed to the converter's
  * parseString(), and finally the resulting Markdown is tidied (bullet
  * style, blank lines before lists, trailing whitespace, excessive newlines).
  *
  * @inheritdoc
  *
  * @param mixed $value null, a scalar (cast to string) or an HTML string
  *
  * @return string|null the trimmed Markdown, or null when $value is null
  *
  * @throws TransformationFailedException when $value is neither null nor scalar
  */
 public function transform($value)
 {
     if ($value === null) {
         return $value;
     }
     if (is_scalar($value)) {
         $value = (string) $value;
     }
     if (!is_string($value)) {
         throw new TransformationFailedException(sprintf('Expected a string to transform, got %s instead', json_encode($value)));
     }

     // Replace non-breaking spaces, which otherwise end up as a question mark
     // after markdownifying. Both the HTML entity and the raw UTF-8 encoded
     // U+00A0 character are handled; the latter is written as an explicit byte
     // escape so it cannot be silently turned into a plain space by an editor.
     $value = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $value);
     // remove leading spaces/tabs on every line
     $value = preg_replace('/^[ \\t]+/m', '', $value);
     // purify the html first
     $value = $this->purifier->purify($value);

     // Ordered [search, replace] pairs; each one is applied to the result of the previous one.
     $replacements = [
         // remove whitespace/newlines between tags and before closing tags
         [['/>\\s+</', '/\\s+<\\//'], ['><', '</']],
         // remove whitespace/newlines around <br> tags
         [['/\\s+<br\\/?>/', '/<br\\/?>\\s+/'], '<br>'],
         // turn a newline into <br> when it is not directly between two tags
         ['/([^>])\\n([^<])/', '\\1<br>\\2'],
         // drop a <br> right after an opening <p>/<li>
         ['/(<(p|li)>)<br\\s?\\/?>/i', '\\1'],
         // drop a <br> right before a closing </p>/</li>
         ['/<br\\s?\\/?>(<\\/(p|li)>)/i', '\\1'],
     ];
     foreach ($replacements as list($search, $replace)) {
         $value = preg_replace($search, $replace, $value);
     }

     // strip tags in headings (h1..h6), turning <br> into a space first
     foreach (range(1, 6) as $headingSize) {
         $value = preg_replace_callback('/(<h' . $headingSize . '>)(.*)(<\\/h' . $headingSize . '>)/iU', function ($matches) {
             if (count($matches) !== 4) {
                 return $matches[0];
             }
             return $matches[1] . trim(strip_tags(str_replace('<br>', ' ', $matches[2]))) . $matches[3];
         }, $value);
     }

     // Remove any double bullets: a literal "*" or "-" at the start of a list item.
     // The character class deliberately contains only those two characters; a "|"
     // inside a class is a literal pipe, not an alternation.
     $value = preg_replace('/(<li>\\s*)[*\\-]/i', '\\1', $value);

     // Convert to markdown. Errors raised by the converter are deliberately
     // suppressed here (best-effort conversion).
     $value = @$this->converter->parseString($value);

     // Fix different types of bullets. What this does is check each line if it starts with any of "-ו○",
     // not followed by another bullet, and normalizes it to "* text".
     $value = preg_replace('/^[\\-ו○]\\s*([^\\-ו○])/mu', '* $1', $value);
     // Now make sure there's a newline before 2 consecutive lines that start with a bullet.
     // This could lead to superfluous newlines, but they will be corrected later on.
     $value = preg_replace('/(\\n\\* [^\\n]+){2,}/', "\n\$0", "\n" . $value);
     // remove trailing spaces/tabs
     $value = preg_replace('/[ \\t]+$/m', '', $value);
     // remove excessive newlines
     $value = preg_replace('/\\n{3,}/m', "\n\n", $value);

     return trim($value);
 }
コード例 #2
0
 /**
  * Transforms an HTML value into Markdown.
  *
  * The HTML is normalized (non-breaking spaces, leading indentation,
  * purification, whitespace around tags, stray <br> tags, "•" bullets),
  * converted with the converter's parseString(), and then stripped of
  * trailing whitespace and excessive blank lines. The result is not trimmed.
  *
  * @inheritdoc
  *
  * @param mixed $value null, a scalar (cast to string) or an HTML string
  *
  * @return string|null the Markdown, or null when $value is null
  *
  * @throws TransformationFailedException when $value is neither null nor scalar
  */
 public function transform($value)
 {
     if ($value === null) {
         return $value;
     }
     if (is_scalar($value)) {
         $value = (string) $value;
     }
     if (!is_string($value)) {
         throw new TransformationFailedException(sprintf('Expected a string to transform, got %s instead', json_encode($value)));
     }

     // Replace non-breaking spaces, which otherwise end up as a question mark
     // after markdownifying. The raw U+00A0 character is written as an explicit
     // UTF-8 byte escape so it stays distinguishable from a plain space.
     $value = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $value);
     // remove leading spaces/tabs on every line
     $value = preg_replace('/^[ \\t]+/m', '', $value);
     // purify to remove really obscure html
     $value = $this->purifier->purify($value);
     // remove whitespace/newlines between tags: this can cause trailing
     // whitespace after markdownifying
     $value = preg_replace(['/>\\s+</', '/\\s+<\\//'], ['><', '</'], $value);
     // also remove whitespace/newlines around <br> tags
     $value = preg_replace(['/\\s+<br\\/?>/', '/<br\\/?>\\s+/'], '<br>', $value);
     // replace newlines with <br> if the newline is not between 2 tags
     $value = preg_replace('/([^>])\\n([^<])/', '\\1<br>\\2', $value);
     // remove <br>'s at the beginning of a paragraph or list item
     $value = preg_replace('/(<(p|li)>)<br\\s?\\/?>/i', '\\1', $value);
     // remove <br>'s at the end of a paragraph or list item
     $value = preg_replace('/<br\\s?\\/?>(<\\/(p|li)>)/i', '\\1', $value);
     // replace •-bullets; a plain string replacement is enough for a literal character
     $value = str_replace('•', '*', $value);

     // Convert to markdown. Errors raised by the converter are deliberately
     // suppressed here (best-effort conversion).
     $value = @$this->converter->parseString($value);

     // remove trailing spaces/tabs
     $value = preg_replace('/[ \\t]+$/m', '', $value);
     // remove excessive newlines
     $value = preg_replace('/\\n{3,}/m', "\n\n", $value);

     return $value;
 }
コード例 #3
0
 public function testResetState()
 {
     // Broken (unclosed) tags cause properties (such as indents) to run onto
     // subsequent strings, so the same converter instance is used for both
     // conversions, and the broken input is converted first.
     $sharedConverter = new Converter();

     // The unclosed <blockquote> is expected to produce a ">" in the output.
     $quotedMarkdown = $sharedConverter->parseString('Test blockquote <blockquote>Here it is');
     $this->assertContains('>', $quotedMarkdown);

     // The second conversion must not carry over ">" from the first one.
     $plainMarkdown = $sharedConverter->parseString('Test<br /><br />Linebreaks');
     $this->assertNotContains('>', $plainMarkdown);
 }