/**
 * Parses a piece of text into a number of paragraphs.
 *
 * The element's content is split wherever two or more line breaks occur
 * (optionally separated by whitespace). Every chunk that is non-empty after
 * trimming is added to the element as a T_Text_Paragraph child, and the
 * element's own content is then reset to null.
 *
 * If the regex cannot be applied (preg_split() returns false, e.g. because
 * the content is not valid UTF-8) the element is left untouched.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    /* Matches 2+ line breaks: LF, CR, CRLF, VT, FF and NEL (U+0085).
       We can't use the more efficient '\R' here as:
         (a) it is only supported by PCRE 7.0+
         (b) we allow '\r' but need to make sure \r\n is not interpreted
             as a double line break.
         (c) we need to allow whitespace between line breaks
       The 'u' modifier matters: in byte mode '\x85' matches the raw byte
       0x85, which is also the final byte of multi-byte UTF-8 characters
       (e.g. "Å" is 0xC3 0x85), so a paragraph ending in such a character
       would be split in the middle of that character. */
    $regex = '/(?:(?:\\r\\n|\\n|\\x0b|\\r(?!\\n)|\\f|\\x85)\\s*){2,}/u';

    $paragraphs = preg_split($regex, $element->getContent());
    if ($paragraphs === false) {
        /* keep the original content rather than clearing it below */
        return;
    }

    foreach ($paragraphs as $p) {
        $p = trim($p);
        if (strlen($p) > 0) {
            $element->addChild(new T_Text_Paragraph($p));
        }
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses a piece of text into plain text and tables.
 *
 * Each table block found in the content is added to the element as a
 * T_Text_Table child; the captured rows are handed to populateTable()
 * together with that table. Text before, between and after the tables is
 * added as T_Text_Plain children. When at least one table was found the
 * element's own content is reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    /* building blocks of the table pattern */
    $newline = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
    $indent  = '[ \\t]*';
    $rule    = '(?:' . $newline . $indent . '[\\+-]+' . $indent . ')';
    $row     = $newline . $indent . '\\|.+';

    /* optional leading rule, a first row, then any mix of rows and rules;
       the final line break removes ambiguity about where the table ends */
    $pattern = '!' . $newline . '*' . $rule . '?'
             . $newline . $indent . '(\\|.+(?:' . $row . '|' . $rule . ')*)'
             . $indent . $newline . '!u';

    $content = $element->getContent();
    $matches = null;
    $found   = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($found < 1) {
        /* no change, as no tables */
        return;
    }

    /* Offsets reported by preg_match_all are in bytes, not unicode
       characters, so the byte-wise string functions are used on purpose
       below (NOT the mb_* ones). */
    $cursor = 0;
    foreach ($matches[0] as $index => $whole) {
        list($matched, $start) = $whole;

        /* plain text preceding this table */
        if ($cursor < $start) {
            $before = substr($content, $cursor, $start - $cursor);
            $element->addChild(new T_Text_Plain($before));
        }

        /* the table itself, populated from the sub-pattern content */
        $grid = new T_Text_Table();
        $element->addChild($grid);
        $this->populateTable($matches[1][$index][0], $grid);

        $cursor = $start + strlen($matched);
    }

    /* plain text following the last table */
    if ($cursor < strlen($content)) {
        $element->addChild(new T_Text_Plain(substr($content, $cursor)));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses bracketed links of the form "[url text]" out of the text.
 *
 * A url starting with "/" produces a T_Text_InternalLink, any other
 * recognised prefix (http://, https://, ftp://, mailto:) produces a
 * T_Text_ExternalLink. Text around the links is added as T_Text_Plain
 * children. When at least one link was found the element's own content is
 * reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    /* atomic grouping is used as a minor performance incentive */
    $scheme  = '(?>http:\\/\\/|https:\\/\\/|ftp:\\/\\/|mailto:|\\/)';
    /* opening bracket, url, one whitespace, link text, closing bracket */
    $pattern = '/\\[(' . $scheme . '[^\\s]+)\\s([^\\]]+)\\]/u';

    $content = $element->getContent();
    $matches = null;
    $found   = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($found < 1) {
        /* no change, as no links */
        return;
    }

    /* Offsets reported by preg_match_all are in bytes, not unicode
       characters, so the byte-wise string functions are used on purpose
       below (NOT the mb_* ones). */
    $cursor = 0;
    foreach ($matches[0] as $index => $whole) {
        list($matched, $start) = $whole;

        /* plain text preceding this link */
        if ($cursor < $start) {
            $before = substr($content, $cursor, $start - $cursor);
            $element->addChild(new T_Text_Plain($before));
        }

        /* the link itself */
        $target = $matches[1][$index][0];
        $label  = $matches[2][$index][0];
        $link   = (substr($target, 0, 1) === '/')
                ? new T_Text_InternalLink($label, $target)
                : new T_Text_ExternalLink($label, $target);
        $element->addChild($link);

        $cursor = $start + strlen($matched);
    }

    /* plain text following the last link */
    if ($cursor < strlen($content)) {
        $element->addChild(new T_Text_Plain(substr($content, $cursor)));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses superscript and subscript markers out of the text.
 *
 * A "^" marker produces a T_Text_Superscript and a "_" marker produces a
 * T_Text_Subscript. The marker applies either to the single character that
 * follows it or to a group wrapped in curly brackets (the brackets are
 * stripped). Text around the markers is added as T_Text_Plain children.
 * When at least one marker was found the element's own content is reset to
 * null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $content = $element->getContent();

    /* the question mark makes the repetition inside the brackets LAZY */
    $pattern = '/(\\_|\\^)([^ \\t\\{\\^-]|\\{[^\\^-]+?\\})/u';
    $matches = null;
    $found   = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($found < 1) {
        /* no change, as no super/subscript text */
        return;
    }

    /* Offsets reported by preg_match_all are in bytes, not unicode
       characters, so the byte-wise string functions are used on purpose
       for the offset arithmetic below (NOT the mb_* ones). */
    $cursor = 0;
    foreach ($matches[0] as $index => $whole) {
        list($matched, $start) = $whole;

        /* plain text preceding this marker */
        if ($cursor < $start) {
            $before = substr($content, $cursor, $start - $cursor);
            $element->addChild(new T_Text_Plain($before));
        }

        /* remove any surrounding curly brackets from the scripted text */
        $body = $matches[2][$index][0];
        if (strncmp($body, '{', 1) === 0) {
            $body = mb_substr($body, 1, mb_strlen($body) - 2);
        }

        /* caret means superscript, underscore means subscript */
        if ($matches[1][$index][0] === '^') {
            $element->addChild(new T_Text_Superscript($body));
        } else {
            $element->addChild(new T_Text_Subscript($body));
        }

        $cursor = $start + strlen($matched);
    }

    /* plain text following the last marker */
    if ($cursor < strlen($content)) {
        $element->addChild(new T_Text_Plain(substr($content, $cursor)));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses embedded resources of the form "!url optional text!" out of the
 * text.
 *
 * The marker must start a line (optionally preceded by whitespace) and the
 * url must begin with http://, https:// or "/". Each match is added to the
 * element as a T_Text_Resource child; text around the matches is added as
 * T_Text_Plain children. When at least one match was found the element's
 * own content is reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
    /* use atomic grouping as minor performance incentive */
    $url_prefix = '(?>http:\\/\\/|https:\\/\\/|\\/)';
    $regex = '/' . $lf . '\\s*\\!'
           . '(' . $url_prefix . '[^\\s]+)'
           . '(\\s[^\\!]+)?'
           . '\\!/u'; /* closing exclamation mark */

    $content = $element->getContent();
    $matches = null;
    $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($num < 1) {
        return; /* no change, as no embedded links */
    }

    $offset = 0;
    /* Note that the offset produced from preg_match_all is in bytes, not
       unicode characters. Therefore, in the following section we do NOT use
       the mb_* functions to assess length, as we are working in bytes not
       characters. */
    for ($i = 0; $i < $num; $i++) {
        /* pre content */
        if ($offset < $matches[0][$i][1]) {
            $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
            $element->addChild(new T_Text_Plain($pre));
        }

        /* link */
        $url = $matches[1][$i][0];

        /* The text group is optional. When it did not take part in the
           match, preg_match_all pads the slot; depending on the PHP version
           this padding appears to be either an empty string or an
           array('', -1) pair, so a plain isset() on the [0] index is not a
           reliable "was it matched" test. A real match is never empty (the
           group needs at least two characters), so an empty value is
           treated as "no text" and passed on as null. */
        $text = isset($matches[2][$i]) ? $matches[2][$i] : null;
        if (is_array($text)) {
            $text = $text[0];
        }
        if ($text === '') {
            $text = null;
        }

        $link = new T_Text_Resource($text, $url);
        $element->addChild($link);

        /* update offset */
        $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
    }

    /* post content */
    if ($offset < strlen($content)) {
        $post = substr($content, $offset);
        $element->addChild(new T_Text_Plain($post));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses emphasised text, written as "**text**", out of the content.
 *
 * Each emphasised span is added to the element as a T_Text_Emph child and
 * the surrounding text as T_Text_Plain children. When at least one span was
 * found the element's own content is reset to null; otherwise nothing is
 * changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $content = $element->getContent();

    /* cheap pre-check: skip the regex entirely when the text holds no
       double asterisk at all */
    if (strpos($content, '**') === false) {
        return;
    }

    /* the question mark makes the dot repetition LAZY */
    $pattern = '/\\*\\*(.+?)\\*\\*/u';
    $matches = null;
    $found   = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($found < 1) {
        /* no change, as no emphasised text */
        return;
    }

    /* Offsets reported by preg_match_all are in bytes, not unicode
       characters, so the byte-wise string functions are used on purpose
       below (NOT the mb_* ones). */
    $cursor = 0;
    foreach ($matches[0] as $index => $whole) {
        list($matched, $start) = $whole;

        /* plain text preceding this span */
        if ($cursor < $start) {
            $before = substr($content, $cursor, $start - $cursor);
            $element->addChild(new T_Text_Plain($before));
        }

        /* the emphasised span, without its asterisks */
        $element->addChild(new T_Text_Emph($matches[1][$index][0]));

        $cursor = $start + strlen($matched);
    }

    /* plain text following the last span */
    if ($cursor < strlen($content)) {
        $element->addChild(new T_Text_Plain(substr($content, $cursor)));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses a piece of text to capture any dividers.
 *
 * A divider is a line holding nothing but four or more dashes (spaces and
 * tabs around the dashes are allowed). Each divider is added to the element
 * as a T_Text_Divider child and the surrounding text as T_Text_Plain
 * children. When at least one divider was found the element's own content
 * is reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $content = $element->getContent();

    /* for performance reasons, we avoid executing the regex if there is no
       quad-dash in the text. */
    if (strpos($content, '----') === false) {
        return;
    }

    $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
    /* The 'u' modifier is needed so that '\x85' means the NEL character
       (U+0085). In byte mode it matches the raw byte 0x85, which is also the
       final byte of multi-byte UTF-8 characters (e.g. "Å" is 0xC3 0x85), so
       a match could begin in the middle of such a character. */
    $regex = '/' . $lf . '[ \\t]*[-]{4,}[ \\t]*' . $lf . '/u';

    $matches = null;
    $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($num < 1) {
        return; /* no dividers */
    }

    $offset = 0;
    /* Note that the offset produced from preg_match_all is in bytes, not
       unicode characters. Therefore, in the following section we do NOT use
       the mb_* functions to assess length, as we are working in bytes not
       characters. */
    for ($i = 0; $i < $num; $i++) {
        /* pre content */
        if ($offset < $matches[0][$i][1]) {
            $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
            $element->addChild(new T_Text_Plain($pre));
        }

        /* divider */
        $element->addChild(new T_Text_Divider(null));

        /* update offset */
        $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
    }

    /* post content */
    if ($offset < strlen($content)) {
        $post = substr($content, $offset);
        $element->addChild(new T_Text_Plain($post));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses a piece of text into a number of quotations.
 *
 * A quotation opens with a line holding only a pair of double quotes ("")
 * and closes with a line that starts with the same pair; whatever follows
 * the closing pair on that line is the citation. Each quotation is added to
 * the element as a T_Text_Quote child (citation first, then quote, both
 * passed through mb_trim()) and the surrounding text as T_Text_Plain
 * children. When at least one quotation was found the element's own content
 * is reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $delimit = preg_quote('""');
    $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';

    /* The citation is "anything up to the end of the line". It must be
       written as an explicit class of line-break characters: dropping $lf
       into a character class would turn its grouping syntax into literal
       characters and so forbid ( ) ? : | ^ and $ inside a citation. */
    $cite = '([^\\r\\n\\x0b\\f\\x85]*)';

    /* note the trailing 's': this is dot-all mode, which means that the
       'dot' in the middle also matches newlines */
    $regex = '/' . $lf . $delimit . '\\s*' . $lf
           . '(.+?)'
           . $lf . $delimit . $cite
           . $lf . '/su'; /* line feed at end */

    $content = $element->getContent();
    $matches = null;
    $num = preg_match_all($regex, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($num < 1) {
        return; /* no change, as no quotes */
    }

    $offset = 0;
    /* Note that the offset produced from preg_match_all is in bytes, not
       unicode characters. Therefore, in the following section we do NOT use
       the mb_* functions to assess length, as we are working in bytes not
       characters. */
    for ($i = 0; $i < $num; $i++) {
        /* pre content */
        if ($offset < $matches[0][$i][1]) {
            $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
            $element->addChild(new T_Text_Plain($pre));
        }

        /* quote */
        $quote = mb_trim($matches[1][$i][0]);
        $cited = mb_trim($matches[2][$i][0]);
        $element->addChild(new T_Text_Quote($cited, $quote));

        /* update offset */
        $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
    }

    /* post content */
    if ($offset < strlen($content)) {
        $post = substr($content, $offset);
        $element->addChild(new T_Text_Plain($post));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses headers, written as "== title ==", out of the text.
 *
 * A header sits on its own line and is wrapped in matching runs of two to
 * seven equals signs. The header level is the length of that run minus one.
 * Each header is added to the element as a T_Text_Header child (title
 * passed through mb_trim()) and the surrounding text as T_Text_Plain
 * children. When at least one header was found the element's own content is
 * reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $newline = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';
    /* opening run of '=', the title, the same run again, then a line feed */
    $pattern = '/' . $newline . '\\s*(\\={2,7})(.+)\\1\\s*' . $newline . '/u';

    $content = $element->getContent();
    $matches = null;
    $found   = preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($found < 1) {
        /* no change, as no headers */
        return;
    }

    /* Offsets reported by preg_match_all are in bytes, not unicode
       characters, so the byte-wise string functions are used on purpose
       below (NOT the mb_* ones). */
    $cursor = 0;
    foreach ($matches[0] as $index => $whole) {
        list($matched, $start) = $whole;

        /* plain text preceding this header */
        if ($cursor < $start) {
            $before = substr($content, $cursor, $start - $cursor);
            $element->addChild(new T_Text_Plain($before));
        }

        /* the header: level derives from the number of equals signs */
        $depth = strlen($matches[1][$index][0]) - 1;
        $title = mb_trim($matches[2][$index][0]);
        $element->addChild(new T_Text_Header($depth, $title));

        $cursor = $start + strlen($matched);
    }

    /* plain text following the last header */
    if ($cursor < strlen($content)) {
        $element->addChild(new T_Text_Plain(substr($content, $cursor)));
    }

    /* reset original content */
    $element->setContent(null);
}
/**
 * Parses a piece of text into a number of (possibly nested) lists.
 *
 * A list item is a line that starts, after optional whitespace, with one of
 * the delimiters held in $this->types. Consecutive items, including blank
 * lines between them, form one list block. Each block is turned into
 * T_Text_List / T_Text_ListItem objects; the amount of leading whitespace
 * of an item decides its nesting level. Text around the blocks is added as
 * T_Text_Plain children. When at least one block was found the element's
 * own content is reset to null; otherwise nothing is changed.
 *
 * @param T_Text_Parseable $element
 */
protected function parse(T_Text_Parseable $element)
{
    $lf = '(?:\\r\\n|\\n|\\x0b|\\r|\\f|\\x85|^|$)';

    /* types to look for e.g. [\*\-]; the regex delimiter is quoted as well
       so that a '/' list type cannot terminate the pattern early */
    $types = '[';
    foreach ($this->types as $t) {
        $types .= preg_quote($t, '/');
    }
    $types .= ']';

    /* a list item: line break followed by optional whitespace, then a
       delimiter, then content (remember the '.' does not by default include
       line breaks) */
    $li = $lf . '[\\s]*' . $types . '[^\\*].*';

    /* blank space between items. This must be an explicit class of
       line-break and whitespace characters: dropping $lf into a character
       class would turn its grouping syntax into literal characters, so that
       ( ) ? : | ^ and $ following an item would be swallowed into the list
       block and lost. */
    $blank = '[\\r\\n\\x0b\\f\\x85\\s]+';

    /* a list block: started by an initial list item, it then continues
       including any blank lines for the entire list section. */
    $block = '/' . $li . '(?:' . $li . '|' . $blank . ')*/u';

    $content = $element->getContent();
    $matches = null;
    $num = preg_match_all($block, $content, $matches, PREG_OFFSET_CAPTURE);
    if ($num < 1) {
        return; /* no change, as no lists */
    }

    $li_regex = '/^([\\s]*)(' . $types . ')\\s*(.+)$/u';

    $offset = 0;
    /* Note that the offset produced from preg_match_all is in bytes, not
       unicode characters. Therefore, in the following section we do NOT use
       the mb_* functions to assess length, as we are working in bytes not
       characters. */
    for ($i = 0; $i < $num; $i++) {
        /* pre content */
        if ($offset < $matches[0][$i][1]) {
            $pre = substr($content, $offset, $matches[0][$i][1] - $offset);
            $element->addChild(new T_Text_Plain($pre));
        }

        /* list set */
        $list_block = $matches[0][$i][0];
        $items = preg_split('/' . $lf . '/u', $list_block);
        /* open lists keyed by indent level; the array's internal pointer
           marks the list currently being filled */
        $list_objs = array();
        $li_matches = null;
        foreach ($items as $line) {
            if (strlen(trim($line)) <= 1) {
                // no content, or just list delimiter, skip to next
                continue;
            }
            if (!preg_match($li_regex, $line, $li_matches)) {
                // should never *not* match!
                continue;
            }

            $li_level = strlen($li_matches[1]);
            $li_type = $li_matches[2];
            $li_content = new T_Text_ListItem($li_matches[3]);

            if (count($list_objs) == 0) {
                // if no list objects at all, create one.
                $list_objs[$li_level] = new T_Text_List($li_type);
                $element->addChild($list_objs[$li_level]);
            }

            if ($li_level != ($key = key($list_objs))) {
                if ($li_level > $key) {
                    // if the indent is going up above the current key this
                    // indicates a nested list -- note we need to add the
                    // nested list as a child to the last list item on the
                    // list we're nesting into..
                    $list_objs[$li_level] = new T_Text_List($li_type);
                    $list_objs[$key]->getLastChild()->addChild($list_objs[$li_level]);
                    end($list_objs); // keep current pos at end
                } else {
                    // the indent is going down, so we need to fall back on
                    // a previous value.
                    if (!isset($list_objs[$li_level])) {
                        // indent has been screwed up. What we do in this
                        // situation is find the first level above the
                        // current level and pretend we 'dropped' to that.
                        foreach (array_keys($list_objs) as $value) {
                            if ($value > $li_level) {
                                break;
                            }
                        }
                        $li_level = $value;
                    }
                    // remove all higher levels
                    foreach (array_keys($list_objs) as $value) {
                        if ($value > $li_level) {
                            unset($list_objs[$value]);
                        }
                    }
                    end($list_objs);
                }
            }

            $list_objs[$li_level]->addChild($li_content);
        }

        /* update offset */
        $offset = $matches[0][$i][1] + strlen($matches[0][$i][0]);
    }

    /* post content */
    if ($offset < strlen($content)) {
        $post = substr($content, $offset);
        $element->addChild(new T_Text_Plain($post));
    }

    /* reset original content */
    $element->setContent(null);
}