Beispiel #1
0
 /**
  * Splits the element's content into paragraphs.
  *
  * A paragraph boundary is a run of two or more line breaks, optionally
  * separated by whitespace. Every non-empty, trimmed chunk is appended to the
  * element as a T_Text_Paragraph child and the element's own content is then
  * set to null. If the content cannot be split at all, the element is left
  * untouched.
  *
  * @param T_Text_Parseable $element  element whose content is split
  */
 protected function parse(T_Text_Parseable $element)
 {
     $break = '(?:(?:\\r\\n|\\n|\\x0b|\\r(?!\\n)|\\f|\\x85)\\s*){2,}';
     /* Matches 2+ line breaks (LF, CR, CRLF, VT, FF, NEL) with optional
        whitespace between them. We can't use the more compact '\R' here as:
          (a) it is only supported by PCRE 7.0+
          (b) a lone '\r' is allowed, but '\r\n' must not be interpreted as
              a double line break.
          (c) whitespace needs to be allowed between line breaks */
     $content = $element->getContent();
     /* The 'u' modifier makes '\x85' mean the NEL character (U+0085). In byte
        mode it would match the raw byte 0x85, which is also the trailing byte
        of multi-byte UTF-8 characters such as 'Å' (0xC3 0x85), so a line
        ending in such a character could be split in the middle of it. */
     $paragraphs = preg_split('/' . $break . '/u', $content);
     if ($paragraphs === false) {
         /* UTF-8 mode fails on content that is not valid UTF-8: retry with
            the byte-oriented pattern so such content is still split. */
         $paragraphs = preg_split('/' . $break . '/', $content);
     }
     if ($paragraphs === false) {
         return;
         /* no change: never discard content that could not be split */
     }
     foreach ($paragraphs as $p) {
         $p = trim($p);
         if (strlen($p) > 0) {
             $element->addChild(new T_Text_Paragraph($p));
         }
     }
     $element->setContent(null);
 }
Beispiel #2
0
 /**
  * Extracts pipe-delimited tables from the element's content.
  *
  * Each run of lines starting with '|' (optionally interleaved with '+---'
  * style rule lines) becomes a T_Text_Table child; the text before, between
  * and after tables is kept as T_Text_Plain children. If no table is found
  * the element is left untouched, otherwise its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     $divider = '(?:' . $lf . '[ \\t]*[\\+-]+[ \\t]*)';
     $pattern = '!'
         . $lf . '*' . $divider . '?'
         . $lf . '[ \\t]*'
         . '(\\|.+(?:' . $lf . '[ \\t]*\\|.+|' . $divider . ')*)'
         . '[ \\t]*' . $lf    // closing LF removes ambiguity about where the table ends
         . '!u';
     $content = $element->getContent();
     $matches = array();
     $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($found < 1) {
         /* nothing to do: no tables in this content */
         return;
     }
     /* Offsets reported by preg_match_all are byte offsets, so all slicing
        below deliberately uses substr/strlen rather than the mb_* family. */
     $cursor = 0;
     foreach ($matches[0] as $index => $whole) {
         list($matched, $start) = $whole;
         /* plain text preceding this table */
         if ($start > $cursor) {
             $before = substr($content, $cursor, $start - $cursor);
             $element->addChild(new T_Text_Plain($before));
         }
         /* the table itself, filled from the first sub-pattern (rows only) */
         $tableNode = new T_Text_Table();
         $element->addChild($tableNode);
         $this->populateTable($matches[1][$index][0], $tableNode);
         /* continue after the full match */
         $cursor = $start + strlen($matched);
     }
     /* plain text following the last table */
     if ($cursor < strlen($content)) {
         $element->addChild(new T_Text_Plain(substr($content, $cursor)));
     }
     /* content now lives in the children */
     $element->setContent(null);
 }
Beispiel #3
0
 /**
  * Extracts bracketed links of the form "[url text]" from the content.
  *
  * The URL must start with http://, https://, ftp://, mailto: or '/'. URLs
  * starting with '/' become T_Text_InternalLink children, all others become
  * T_Text_ExternalLink children; surrounding text is kept as T_Text_Plain
  * children. If no link is found the element is left untouched, otherwise
  * its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     /* atomic group: once a prefix has matched it is never backtracked into */
     $url_prefix = '(?>http:\\/\\/|https:\\/\\/|ftp:\\/\\/|mailto:|\\/)';
     $pattern = '/\\[(' . $url_prefix . '[^\\s]+)\\s([^\\]]+)\\]/u';
     $content = $element->getContent();
     $matches = null;
     $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($found < 1) {
         /* nothing to do: no links in this content */
         return;
     }
     /* Offsets reported by preg_match_all are byte offsets, so all slicing
        below deliberately uses substr/strlen rather than the mb_* family. */
     $cursor = 0;
     foreach ($matches[0] as $index => $whole) {
         list($matched, $start) = $whole;
         /* plain text preceding this link */
         if ($start > $cursor) {
             $before = substr($content, $cursor, $start - $cursor);
             $element->addChild(new T_Text_Plain($before));
         }
         /* the link: a leading slash marks a site-internal target */
         $url = $matches[1][$index][0];
         $text = $matches[2][$index][0];
         $isInternal = (strncmp($url, '/', 1) === 0);
         $link = $isInternal
             ? new T_Text_InternalLink($text, $url)
             : new T_Text_ExternalLink($text, $url);
         $element->addChild($link);
         /* continue after the full match */
         $cursor = $start + strlen($matched);
     }
     /* plain text following the last link */
     if ($cursor < strlen($content)) {
         $element->addChild(new T_Text_Plain(substr($content, $cursor)));
     }
     /* content now lives in the children */
     $element->setContent(null);
 }
 /**
  * Extracts superscript ("^x", "^{xyz}") and subscript ("_x", "_{xyz}")
  * markers from the content.
  *
  * The marker is followed either by a single character (not space, tab, '{',
  * '^' or '-') or by a brace-wrapped run containing no '^' or '-'. Each hit
  * becomes a T_Text_Superscript or T_Text_Subscript child with any wrapping
  * braces removed; surrounding text is kept as T_Text_Plain children. If
  * nothing matches the element is left untouched, otherwise its content is
  * set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $content = $element->getContent();
     /* the '+?' keeps the brace-wrapped alternative lazy */
     $pattern = '/(\\_|\\^)([^ \\t\\{\\^-]|\\{[^\\^-]+?\\})/u';
     $matches = null;
     $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($found < 1) {
         /* nothing to do: no super/subscript text */
         return;
     }
     /* Offsets reported by preg_match_all are byte offsets, so all slicing
        of $content below uses substr/strlen rather than the mb_* family. */
     $cursor = 0;
     foreach ($matches[0] as $index => $whole) {
         list($matched, $start) = $whole;
         /* plain text preceding this marker */
         if ($start > $cursor) {
             $before = substr($content, $cursor, $start - $cursor);
             $element->addChild(new T_Text_Plain($before));
         }
         /* script body: strip the surrounding curly brackets when present */
         $body = $matches[2][$index][0];
         if (strncmp($body, '{', 1) === 0) {
             $body = mb_substr($body, 1, mb_strlen($body) - 2);
         }
         /* '^' raises the text, anything else ('_') lowers it */
         $marker = $matches[1][$index][0];
         $script = ($marker === '^')
             ? new T_Text_Superscript($body)
             : new T_Text_Subscript($body);
         $element->addChild($script);
         /* continue after the full match */
         $cursor = $start + strlen($matched);
     }
     /* plain text following the last marker */
     if ($cursor < strlen($content)) {
         $element->addChild(new T_Text_Plain(substr($content, $cursor)));
     }
     /* content now lives in the children */
     $element->setContent(null);
 }
Beispiel #5
0
 /**
  * Parses embedded resources of the form "!url optional text!" out of the
  * content.
  *
  * A resource must start a line (optionally after whitespace) and its URL
  * must begin with http://, https:// or '/'. Each hit becomes a
  * T_Text_Resource child; surrounding text is kept as T_Text_Plain children.
  * If nothing matches the element is left untouched, otherwise its content
  * is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     $url_prefix = '(?>http:\\/\\/|https:\\/\\/|\\/)';
     /* use atomic grouping as minor performance incentive */
     $regex = '/' . $lf . '\\s*\\!' . '(' . $url_prefix . '[^\\s]+)' . '(\\s[^\\!]+)?' . '\\!/u';
     /* closing exclamation mark */
     $content = $element->getContent();
     $matches = null;
     $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($num < 1) {
         return;
         /* no change, as no embedded links */
     }
     $offset = 0;
     /* Note that the offset produced from preg_match_all is in bytes, not
        unicode characters. Therefore, in the following section we do NOT use
        the mb_* functions to assess length, as we are working in bytes not
        characters. */
     for ($i = 0; $i < $num; $i++) {
         /* pre content */
         if ($offset < $matches[0][$i][1]) {
             $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
             $element->addChild(new T_Text_Plain($pre));
         }
         /* link */
         $url = $matches[1][$i][0];
         /* The description group is optional. Depending on the PHP version an
            unmatched group is padded either with a bare '' or with the pair
            array('', -1), so isset() alone cannot tell "absent" apart from
            "present". A group that did match is never empty (it needs one
            whitespace character plus at least one more), so an empty capture
            always means "no description". Note that a matched description
            still carries its leading whitespace character. */
         $text = null;
         if (isset($matches[2][$i][0]) && $matches[2][$i][0] !== '') {
             $text = $matches[2][$i][0];
         }
         $link = new T_Text_Resource($text, $url);
         $element->addChild($link);
         /* update offset */
         $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
     }
     /* post content */
     if ($offset < strlen($content)) {
         $post = substr($content, $offset);
         $element->addChild(new T_Text_Plain($post));
     }
     /* reset original content */
     $element->setContent(null);
 }
Beispiel #6
0
 /**
  * Extracts "**emphasised**" spans from the content.
  *
  * Each span becomes a T_Text_Emph child holding the text between the double
  * asterisks; surrounding text is kept as T_Text_Plain children. If nothing
  * matches the element is left untouched, otherwise its content is set to
  * null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $content = $element->getContent();
     /* cheap pre-check: skip the regex entirely when there is no double
        asterisk anywhere in the text */
     if (strpos($content, '**') === false) {
         return;
     }
     /* '.+?' is lazy so each span ends at the nearest closing '**' */
     $pattern = '/\\*\\*(.+?)\\*\\*/u';
     $matches = null;
     $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($found < 1) {
         /* nothing to do: no emphasised text */
         return;
     }
     /* Offsets reported by preg_match_all are byte offsets, so all slicing
        below deliberately uses substr/strlen rather than the mb_* family. */
     $cursor = 0;
     foreach ($matches[0] as $index => $whole) {
         list($matched, $start) = $whole;
         /* plain text preceding this span */
         if ($start > $cursor) {
             $before = substr($content, $cursor, $start - $cursor);
             $element->addChild(new T_Text_Plain($before));
         }
         /* the emphasised text, without its asterisks */
         $element->addChild(new T_Text_Emph($matches[1][$index][0]));
         /* continue after the full match */
         $cursor = $start + strlen($matched);
     }
     /* plain text following the last span */
     if ($cursor < strlen($content)) {
         $element->addChild(new T_Text_Plain(substr($content, $cursor)));
     }
     /* content now lives in the children */
     $element->setContent(null);
 }
Beispiel #7
0
 /**
  * Parses a piece of text to capture any dividers.
  *
  * A divider is a line consisting only of four or more dashes, optionally
  * surrounded by spaces or tabs. Each one becomes a T_Text_Divider child;
  * surrounding text is kept as T_Text_Plain children. If no divider is found
  * (or the content cannot be matched as UTF-8) the element is left untouched,
  * otherwise its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $content = $element->getContent();
     /* for performance reasons, we avoid executing the regex if there is no
        quad-dash in the text. */
     if (strpos($content, '----') === false) {
         return;
     }
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     /* The 'u' modifier makes '\x85' mean the NEL character (U+0085). In byte
        mode it would match the raw byte 0x85, which is also the trailing byte
        of multi-byte UTF-8 characters such as 'Å' (0xC3 0x85); a divider
        directly after such a character would then swallow half of it. */
     $regex = '/' . $lf . '[ \\t]*[-]{4,}[ \\t]*' . $lf . '/u';
     $matches = null;
     $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($num < 1) {
         return;
         /* no dividers (this also covers a false return on a regex error) */
     }
     $offset = 0;
     /* Note that the offset produced from preg_match_all is in bytes, not
        unicode characters. Therefore, in the following section we do NOT use
        the mb_* functions to assess length, as we are working in bytes not
        characters. */
     for ($i = 0; $i < $num; $i++) {
         /* pre content */
         if ($offset < $matches[0][$i][1]) {
             $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
             $element->addChild(new T_Text_Plain($pre));
         }
         /* divider */
         $element->addChild(new T_Text_Divider(null));
         /* update offset */
         $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
     }
     /* post content */
     if ($offset < strlen($content)) {
         $post = substr($content, $offset);
         $element->addChild(new T_Text_Plain($post));
     }
     /* reset original content */
     $element->setContent(null);
 }
Beispiel #8
0
 /**
  * Parses a piece of text into a number of quotations.
  *
  * A quotation is opened by a line holding only '""' and closed by a line
  * starting with '""'; whatever follows the closing delimiter on that same
  * line is taken as the citation. Each hit becomes a T_Text_Quote child built
  * from the trimmed citation and quote text; surrounding text is kept as
  * T_Text_Plain children. If nothing matches the element is left untouched,
  * otherwise its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $delimit = preg_quote('""');
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     /* The citation is "everything up to the next line break". This must be
        spelt as a real character class: placing the $lf group between
        square brackets would turn its punctuation into literal members, so
        the citation could not contain any of ( ) ? : | ^ $ and a quote cited
        as e.g. "Smith (2005)" would not be recognised at all. */
     $cite = '([^\\r\\n\\x0b\\f\\x85]*)';
     $regex = '/' . $lf . $delimit . '\\s*' . $lf . '(.+?)' . $lf . $delimit . $cite . $lf . '/su';
     /* line feed at end */
     // note the trailing 's': this is PCRE's "dotall" mode, which makes the
     // 'dot' in the middle match newlines so a quote can span several lines
     $content = $element->getContent();
     $matches = null;
     $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($num < 1) {
         return;
         /* no change, as no quotes */
     }
     $offset = 0;
     /* Note that the offset produced from preg_match_all is in bytes, not
        unicode characters. Therefore, in the following section we do NOT use
        the mb_* functions to assess length, as we are working in bytes not
        characters. */
     for ($i = 0; $i < $num; $i++) {
         /* pre content */
         if ($offset < $matches[0][$i][1]) {
             $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
             $element->addChild(new T_Text_Plain($pre));
         }
         /* quote */
         $quote = mb_trim($matches[1][$i][0]);
         $cite = mb_trim($matches[2][$i][0]);
         $element->addChild(new T_Text_Quote($cite, $quote));
         /* update offset */
         $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
     }
     /* post content */
     if ($offset < strlen($content)) {
         $post = substr($content, $offset);
         $element->addChild(new T_Text_Plain($post));
     }
     /* reset original content */
     $element->setContent(null);
 }
Beispiel #9
0
 /**
  * Extracts "== Header ==" style headers from the content.
  *
  * A header is a line whose text is wrapped in the same number (2 to 7) of
  * '=' characters on both sides. Each hit becomes a T_Text_Header child
  * whose level is the number of '=' characters minus one; surrounding text
  * is kept as T_Text_Plain children. If nothing matches the element is left
  * untouched, otherwise its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     /* '\1' forces the closing run of '=' to equal the opening run */
     $pattern = '/' . $lf . '\\s*(\\={2,7})(.+)\\1\\s*' . $lf . '/u';
     $content = $element->getContent();
     $matches = array();
     $found = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($found < 1) {
         /* nothing to do: no headers in this content */
         return;
     }
     /* Offsets reported by preg_match_all are byte offsets, so all slicing
        below deliberately uses substr/strlen rather than the mb_* family. */
     $cursor = 0;
     foreach ($matches[0] as $index => $whole) {
         list($matched, $start) = $whole;
         /* plain text preceding this header */
         if ($start > $cursor) {
             $before = substr($content, $cursor, $start - $cursor);
             $element->addChild(new T_Text_Plain($before));
         }
         /* the header: '==' is level 1, '===' level 2, and so on */
         $depth = strlen($matches[1][$index][0]) - 1;
         $title = mb_trim($matches[2][$index][0]);
         $element->addChild(new T_Text_Header($depth, $title));
         /* continue after the full match */
         $cursor = $start + strlen($matched);
     }
     /* plain text following the last header */
     if ($cursor < strlen($content)) {
         $element->addChild(new T_Text_Plain(substr($content, $cursor)));
     }
     /* content now lives in the children */
     $element->setContent(null);
 }
Beispiel #10
0
 /**
  * Parses a piece of text into a number of (possibly nested) lists.
  *
  * A list item is a line that starts, after optional whitespace, with one of
  * the delimiter characters in $this->types. Consecutive item lines, and any
  * blank lines between them, form one list block. The amount of leading
  * whitespace on an item decides its nesting level: deeper indentation opens
  * a nested T_Text_List inside the last item of the enclosing list. Text
  * around the blocks is kept as T_Text_Plain children. If nothing matches
  * the element is left untouched, otherwise its content is set to null.
  *
  * @param T_Text_Parseable $element  element whose content is scanned
  */
 protected function parse(T_Text_Parseable $element)
 {
     $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
     $types = '[';
     foreach ($this->types as $t) {
         /* pass the delimiter so a '/' list type cannot end the regex early */
         $types .= preg_quote($t, '/');
     }
     $types .= ']';
     /* types to look for e.g. [\*\-] */
     $li = $lf . '[\\s]*' . $types . '[^\\*].*';
     /* line break followed by optional whitespace, then the delimiter, then a
        character that is not an asterisk, then the rest of the line
        (remember the '.' does not by default include line breaks). */
     $blank = '[\\r\\n\\x0b\\f\\x85\\s]+';
     /* blank space between items. This must be a real character class:
        placing the $lf group between square brackets would turn its
        punctuation into literal members, so the block would also swallow a
        leading ( ) ? : | ^ or $ from the text following the list. */
     $block = '/' . $li . '(?:' . $li . '|' . $blank . ')*/u';
     /* a list block: started by an initial list item, it then continues
        including any blank lines for the entire list section. */
     $content = $element->getContent();
     $matches = null;
     $num = preg_match_all($block, $content, $matches, PREG_OFFSET_CAPTURE);
     if ($num < 1) {
         return;
         /* no change, as no lists */
     }
     /* loop-invariant patterns: split a block into lines / dissect one item
        into (indent, delimiter, content) */
     $line_regex = '/' . $lf . '/u';
     $li_regex = '/^([\\s]*)(' . $types . ')\\s*(.+)$/u';
     $offset = 0;
     /* Note that the offset produced from preg_match_all is in bytes, not
        unicode characters. Therefore, in the following section we do NOT use
        the mb_* functions to assess length, as we are working in bytes not
        characters. */
     for ($i = 0; $i < $num; $i++) {
         /* pre content */
         if ($offset < $matches[0][$i][1]) {
             $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
             $element->addChild(new T_Text_Plain($pre));
         }
         /* list set */
         $list_block = $matches[0][$i][0];
         $items = preg_split($line_regex, $list_block);
         $list_objs = array();
         /* ^ open lists keyed by indent level; the array's internal pointer is
            kept on the deepest (most recently opened) list */
         $li_matches = null;
         foreach ($items as $line) {
             if (strlen(trim($line)) <= 1) {
                 // no content, or just list delimiter, skip to next
                 continue;
             } elseif (preg_match($li_regex, $line, $li_matches)) {
                 $li_level = strlen($li_matches[1]);
                 $li_type = $li_matches[2];
                 $li_content = new T_Text_ListItem($li_matches[3]);
                 if (count($list_objs) == 0) {
                     // if no lists objects at all, create one.
                     $list_objs[$li_level] = new T_Text_List($li_type);
                     $element->addChild($list_objs[$li_level]);
                 }
                 if ($li_level != ($key = key($list_objs))) {
                     if ($li_level > $key) {
                         // if the indent is going up above from the current key
                         // this indicates a nested list -- note we need to add the
                         // nested list as a child to the last list item on the list
                         // we're nesting into..
                         $list_objs[$li_level] = new T_Text_List($li_type);
                         $list_objs[$key]->getLastChild()->addChild($list_objs[$li_level]);
                         end($list_objs);
                         // keep current pos at end
                     } else {
                         // the indent is going down, so we need to fallback on a
                         // previous value.
                         if (!isset($list_objs[$li_level])) {
                             // indent does not line up with any open list. What we
                             // do in this situation is find the first level above
                             // the current level and pretend we 'dropped' to that.
                             foreach (array_keys($list_objs) as $value) {
                                 if ($value > $li_level) {
                                     break;
                                 }
                             }
                             $li_level = $value;
                         }
                         // remove all higher levels
                         foreach (array_keys($list_objs) as $value) {
                             if ($value > $li_level) {
                                 unset($list_objs[$value]);
                             }
                         }
                         end($list_objs);
                     }
                 }
                 $list_objs[$li_level]->addChild($li_content);
             }
             // line preg_match: should never *not* match!
         }
         /* update offset */
         $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
     }
     /* post content */
     if ($offset < strlen($content)) {
         $post = substr($content, $offset);
         $element->addChild(new T_Text_Plain($post));
     }
     /* reset original content */
     $element->setContent(null);
 }