Example #1
 public function translateTemplate($untranslated, $moduleName, $templateName, $language = null)
 {
     // $translations is expected to be an array
     $translations = Locator::getInstance()->getTemplateTranslation($moduleName, $templateName, $language);
     if (!is_array($translations)) {
         $translations = array();
     }
     return TextProcessor::doTextVariation($untranslated, '[[', ']]', $translations);
 }
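A minimal usage sketch of the method above. The Translator class name, the module and template names, and the translation contents are assumptions made for illustration; only translateTemplate() and the [[...]] placeholder delimiters come from the example itself:

 // Suppose getTemplateTranslation() returns array('greeting' => 'Bonjour') for this
 // module/template/language, and doTextVariation() swaps [[greeting]] for that value.
 $translator = new Translator();
 echo $translator->translateTemplate('[[greeting]], world!', 'shop', 'checkout.tpl', 'fr');
 // Expected output: "Bonjour, world!"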
Example #2
 /**
  * Output information about a variable
  * 
  * Context-sensitive replacement for the native var_dump() function.
  * If the xdebug extension is loaded, or if text/plain HTTP content
  * headers have already been sent, then var_dump() is used as-is.
  * Otherwise the var_dump() output is wrapped in <pre> container tags.
  * 
  * @param mixed $var
  */
 public static function dump($var)
 {
     // Check if it's actually okay to output anything
     if (true === DEBUG) {
         $text = new TextProcessor();
         // Don't want to shoot our bolt too soon
         ob_start();
         if (extension_loaded('xdebug')) {
             // Bliss, nothing extra for us to do
             var_dump($var);
         } elseif (headers_sent()) {
             $http = headers_list();
             // Try to discover what kind of context we're in
             foreach ($http as $index => $header) {
                 list($key, $val) = explode(': ', $header, 2);
                 // Header names are case-insensitive, so compare accordingly
                 if (0 == strcasecmp('Content-Type', $key)) {
                     if ('text/plain' == substr($val, 0, 10)) {
                         var_dump($var);
                     } elseif ('text/html' == substr($val, 0, 9)) {
                         echo $text->wrapWithTag($var, 'pre');
                     }
                     break;
                 }
             }
         }
         // Gather what output we may already have
         $info = ob_get_clean();
         if (false === empty($info)) {
             // We have output already
             echo $info;
         } else {
             if ('text/html' == ini_get('default_mimetype')) {
                 // If the default mimetype is HTML
                 echo $text->wrapWithTag($var, 'pre');
             } else {
                 // Tried everything else, resort to simple var_dump
                 var_dump($var);
             }
         }
     }
 }
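A short usage sketch, assuming DEBUG has been defined truthy and that dump() lives on a class called Debug here (the class name is a guess; only the static dump() signature is taken from the example above):

 define('DEBUG', true);
 $record = array('id' => 42, 'name' => 'Ada');
 // On the command line or under a text/plain Content-Type this emits a bare
 // var_dump(); in an HTML context the output is wrapped in <pre> tags instead.
 Debug::dump($record);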
Example #3
 /**
  * Used to extract the title, description and links from
  * a string consisting of Word Doc data (2004 or earlier).
  *
  * @param string $page  the web-page contents
  * @param string $url  the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $text = "";
     if (is_string($page)) {
         $text = self::extractASCIIText($page);
     }
     if ($text == "") {
         $text = $url;
     }
     $summary = parent::process($text, $url);
     return $summary;
 }
Example #4
 /**
  * Used to extract the title, description and links from
  * a string consisting of sitemap data.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     if (is_string($page)) {
         $dom = self::dom($page);
         if ($dom !== false) {
             $summary[self::TITLE] = $url;
             $summary[self::DESCRIPTION] = "Sitemap of " . $url;
             $summary[self::LANG] = "en-US";
             $summary[self::LINKS] = self::links($dom, $url);
             if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) {
                 //maybe not a sitemap? treat as text still try to get urls
                 $summary = parent::process($page, $url);
             }
             $summary[self::JUST_METAS] = true;
         } else {
             $summary = parent::process($page, $url);
             $summary[self::JUST_METAS] = true;
         }
     }
     return $summary;
 }
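A hypothetical call showing the shape of the returned summary. The SitemapProcessor class name appears in Example #8, but the no-argument construction and the sitemap snippet below are assumptions; TITLE, DESCRIPTION, LINKS and JUST_METAS are class constants of the processor:

 $sitemap = '<?xml version="1.0"?><urlset>' .
     '<url><loc>http://example.com/page1</loc></url></urlset>';
 $processor = new SitemapProcessor();
 $summary = $processor->process($sitemap, 'http://example.com/sitemap.xml');
 // $summary[SitemapProcessor::TITLE]       == 'http://example.com/sitemap.xml'
 // $summary[SitemapProcessor::DESCRIPTION] == 'Sitemap of http://example.com/sitemap.xml'
 // $summary[SitemapProcessor::LINKS]       == links extracted from the <loc> entries
 // $summary[SitemapProcessor::JUST_METAS]  == true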
Example #5
 /**
  * Returns a summary of the body of a web page based on crude regex
  *     matching; used as a fallback if dom parsing did not work.
  *
  * @param string $page page to extract the description from
  * @return string a crude description of the page
  */
 static function crudeDescription($page)
 {
     $body = parent::getBetweenTags($page, 0, "<body", "</body");
     $body = preg_replace("/\\</", " <", $body);
     $body = strip_tags("<body" . $body[1] . "</body>");
     if ($body == "") {
         return $body;
     }
     $body = preg_replace("/\\s+/", " ", $body);
     return mb_substr($body, 0, self::$max_description_len);
 }
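A small usage sketch, assuming crudeDescription() is a static method of the HtmlProcessor class mentioned in Example #8 and that $max_description_len is large enough not to truncate this input:

 $html = "<html><body><h1>Hello</h1><p>Some   scattered    body text.</p></body></html>";
 // Tags are stripped and runs of whitespace collapsed before truncation
 echo HtmlProcessor::crudeDescription($html);
 // roughly: "Hello Some scattered body text."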
Example #6
 /**
  * Parses a mediawiki document to produce an HTML equivalent
  *
  * @param string $document a document which might have mediawiki markup
  * @param bool $parse_head_vars header variables are an extension of
  *     mediawiki syntax used to add meta variables and titles to
  *     the head tag of an html document. This flag controls whether to
  *     support this extension or not
  * @param bool $handle_big_files for indexing purposes Yioop by default
  *     truncates long documents before indexing them. If true, this
  *     method does not do this default truncation. The true value
  *     is more useful when using Yioop's built-in wiki.
  * @return string HTML document obtained by parsing mediawiki
  *     markup in $document
  */
 function parse($document, $parse_head_vars = true, $handle_big_files = false)
 {
     $head = "";
     $page_type = "standard";
     $head_vars = array();
     $draw_toc = true;
     if ($parse_head_vars && !$this->minimal) {
         $document_parts = explode("END_HEAD_VARS", $document);
         if (count($document_parts) > 1) {
             $head = $document_parts[0];
             $document = $document_parts[1];
             $head_lines = preg_split("/\n\\s*\n/", $head);
             foreach ($head_lines as $line) {
                 $semi_pos = strpos($line, ";");
                 if ($semi_pos === false) {
                     $semi_pos = strlen($line);
                 }
                 $line = substr($line, 0, $semi_pos);
                 $line_parts = explode("=", $line);
                 if (count($line_parts) == 2) {
                     $head_vars[trim(addslashes($line_parts[0]))] = addslashes(trim($line_parts[1]));
                 }
             }
             if (isset($head_vars['page_type'])) {
                 $page_type = $head_vars['page_type'];
             }
             if (isset($head_vars['toc'])) {
                 $draw_toc = $head_vars['toc'];
             }
         }
     }
     $document = preg_replace_callback("/&lt;nowiki&gt;(.+?)&lt;\\/nowiki&gt;/s", "base64EncodeCallback", $document);
     $document = preg_replace_callback("/&lt;pre&gt;(.+?)&lt;\\/pre&gt;/s", "spaceEncodeCallback", $document);
     if (!$this->minimal) {
         if ($draw_toc && $page_type != "presentation") {
             $toc = $this->makeTableOfContents($document);
         }
         list($document, $references) = $this->makeReferences($document);
     }
     $document = preg_replace_callback('/(\\A|\\n){\\|(.*?)\\n\\|}/s', "makeTableCallback", $document);
     if ($page_type == "presentation") {
         $lines = explode("\n", $document);
         $out_document = "";
         $slide = "";
         $div = "<div class='slide'>";
         foreach ($lines as $line) {
             if (trim($line) == "....") {
                 $slide = $this->processRegexes($slide);
                 $out_document .= $div . $this->cleanLinksAndParagraphs($slide) . "</div>";
                 $slide = "";
             } else {
                 $slide .= $line . "\n";
             }
         }
         $document = $out_document . $div . $this->processRegexes($slide) . "</div>";
     } else {
         if (!$handle_big_files && strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) {
             $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN);
         }
         $document = $this->processRegexes($document);
         $document = $this->cleanLinksAndParagraphs($document);
     }
     if (!$this->minimal) {
         $document = $this->insertReferences($document, $references);
         if ($draw_toc && $page_type != "presentation") {
             $document = $this->insertTableOfContents($document, $toc);
         }
     }
     $document = preg_replace_callback("/&lt;nowiki&gt;(.+?)&lt;\\/nowiki&gt;/s", "base64DecodeCallback", $document);
     if ($head != "" && $parse_head_vars) {
         $document = $head . "END_HEAD_VARS" . $document;
     }
     if (!$handle_big_files && strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) {
         $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN);
         TextProcessor::closeDanglingTags($document);
         $document .= "...";
     }
     return $document;
 }
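A usage sketch for the head-variables extension handled at the top of parse(). The WikiParser class name and no-argument construction are assumptions; the head block format (KEY=VALUE lines separated by blank lines, followed by the END_HEAD_VARS marker) and the "...." slide separator follow the parsing code above:

 $wiki_text = "page_type=presentation\n\n" .
     "END_HEAD_VARS" .
     "==Slide One==\nSome wiki text\n....\n" .
     "==Slide Two==\nMore wiki text\n....\n";
 $parser = new WikiParser();
 // With page_type=presentation each "...." delimited chunk becomes a
 // <div class='slide'> block in the resulting HTML.
 $html = $parser->parse($wiki_text);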
Example #7
 /**
  * Used to extract the title, description and links from
  * a string consisting of SVG image data. If the image is small
  * enough, an attempt is made to generate a thumbnail.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $summary = null;
     if (is_string($page)) {
         self::closeDanglingTags($page);
         $dom = self::dom($page);
         if ($dom !== false && isset($dom->documentElement)) {
             $summary[self::TITLE] = "";
             $summary[self::DESCRIPTION] = self::description($dom);
             $summary[self::LINKS] = array();
             $summary[self::PAGE] = "<html><body><div><img src='data:image/svg+xml;base64," . base64_encode($page) . "' alt='" . $summary[self::DESCRIPTION] . "' /></div></body></html>";
             if (strlen($page) < self::MAX_THUMB_LEN) {
                 $thumb_string = self::createThumb($dom);
                 $summary[self::THUMB] = 'data:image/svg+xml;base64,' . base64_encode($thumb_string);
             }
         } else {
             $summary = parent::process($page, $url);
         }
     }
     return $summary;
 }
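A hypothetical call showing what the embedding above produces. The SvgProcessor class name comes from Example #8; the tiny SVG document and the no-argument construction are assumptions:

 $svg = '<svg xmlns="http://www.w3.org/2000/svg" width="10" height="10">' .
     '<rect width="10" height="10"/></svg>';
 $processor = new SvgProcessor();
 $summary = $processor->process($svg, 'http://example.com/box.svg');
 // Both the page and, for small files, the thumbnail are inlined as data URIs:
 // $summary[SvgProcessor::PAGE]  contains <img src='data:image/svg+xml;base64,...'>
 // $summary[SvgProcessor::THUMB] begins with 'data:image/svg+xml;base64,'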
Example #8
 /**
  * Used to extract the title, description and links from
  * a string consisting of XML data. The document's root element
  * determines which specific processor (rss, html, sitemap, or svg)
  * does the actual extraction.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     if (is_string($page)) {
         self::closeDanglingTags($page);
         $dom = self::dom($page);
         $root_name = isset($dom->documentElement->nodeName) ? $dom->documentElement->nodeName : "";
         unset($dom);
         $XML_PROCESSORS = array("rss" => "RssProcessor",
             "html" => "HtmlProcessor", "sitemapindex" => "SitemapProcessor",
             "urlset" => "SitemapProcessor", "svg" => "SvgProcessor");
         if (isset($XML_PROCESSORS[$root_name])) {
             $processor_name = $XML_PROCESSORS[$root_name];
             $processor = new $processor_name($this->plugin_instances);
             $summary = $processor->process($page, $url);
         } else {
             $summary = parent::process($page, $url);
         }
     }
     return $summary;
 }
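A sketch of the dispatch behavior, assuming the method above belongs to a class called XmlProcessor that can be constructed without arguments (both assumptions); the root-element-to-processor map itself is taken from the code:

 $xml = '<?xml version="1.0"?><rss version="2.0"><channel>' .
     '<title>Example</title></channel></rss>';
 $processor = new XmlProcessor();
 // The <rss> root element routes this document to RssProcessor::process();
 // an unrecognized root element falls back to parent::process().
 $summary = $processor->process($xml, 'http://example.com/feed.xml');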
Example #9
 /**
  * Computes a summary based on a string of a binary Powerpoint document
  * (as opposed to the modern xml powerpoint format).
  *
  * Text is extracted from the Powerpoint document using a crude finite
  * state machine that was developed by looking at a few Powerpoint
  * documents in a hex editor. Then the TextProcessor::process() method
  * is used to make a summary.
  *
  * @param string $page string of a Powerpoint document
  * @param string $url location the document came from, not used by
  *     TextProcessor at this point. Some of its subclasses override
  *     this method and use url to produce complete links for
  *     relative links within a document
  *
  * @return array a summary of (title, description, links, and content) of
  *     the information in $page
  */
 function process($page, $url)
 {
     $text = "";
     if (is_string($page)) {
         $text_objects = array();
         $cur_id = 0;
         $state = self::PPT_IGNORING;
         $cur_char_pos = 0;
         $len = strlen($page);
         while ($cur_char_pos < $len) {
             $ascii = ord($page[$cur_char_pos]);
             switch ($state) {
                 case self::PPT_IGNORING:
                     if ($ascii == 0) {
                         $state = self::ZEROONE_IGNORING;
                     }
                     break;
                 case self::ZEROONE_IGNORING:
                     if ($ascii == 0) {
                         $state = self::ZEROTWO_IGNORING;
                     } else {
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::ZEROTWO_IGNORING:
                     if ($ascii == 168) {
                         $state = self::FIRST_CHAR_TEXT_SEG;
                     } else {
                         if ($ascii != 0) {
                             $state = self::PPT_IGNORING;
                         }
                     }
                     break;
                 case self::FIRST_CHAR_TEXT_SEG:
                     if ($ascii == 15) {
                         $state = self::READ_LEN_TEXT_SEG;
                         $text_len = 0;
                         $text_len_pos = 0;
                     } else {
                         $state = self::PPT_IGNORING;
                     }
                     break;
                 case self::READ_LEN_TEXT_SEG:
                     if ($text_len_pos < 4) {
                         $text_len += $ascii << $text_len_pos * 8;
                         $text_len_pos++;
                     } else {
                         $state = self::SCAN_TEXT_SEG;
                         $scan_text_pos = 0;
                         $out_text = chr($ascii);
                     }
                     break;
                 case self::SCAN_TEXT_SEG:
                     if (strpos($out_text, "lick to edit Master title style") > 0) {
                         $state = self::ALWAYS_IGNORE;
                     } else {
                         if ($scan_text_pos < $text_len) {
                             if ($ascii >= 32 && $ascii <= 126 || $ascii == 10) {
                                 $out_text .= chr($ascii);
                                 $scan_text_pos++;
                             }
                         } else {
                             $text_objects[$cur_id] = $out_text;
                             $cur_id++;
                             $state = self::PPT_IGNORING;
                         }
                     }
                     break;
                 case self::ALWAYS_IGNORE:
                     break;
             }
             $cur_char_pos++;
         }
         $text = implode("\n", $text_objects);
     }
     if ($text == "") {
         $text = $url;
     }
     $summary = parent::process($text, $url);
     return $summary;
 }
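The READ_LEN_TEXT_SEG state accumulates a 4-byte little-endian length with $text_len += $ascii << $text_len_pos * 8. A standalone sketch of the same arithmetic, using made-up byte values that would follow the 0x00 0x00 0xA8 0x0F marker sequence recognized by the states above:

 // Bytes arrive least significant first, so each one is shifted left by
 // 8 bits per position before being added to the running total.
 $bytes = array(0x10, 0x02, 0x00, 0x00);
 $text_len = 0;
 foreach ($bytes as $pos => $byte) {
     $text_len += $byte << $pos * 8;
 }
 echo $text_len; // 0x0210 = 528 characters of slide text to read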
Example #10
 /**
  * Used to extract the title, description and links from
  * a string consisting of rss or atom news feed data.
  *
  * @param string $page   web-page contents
  * @param string $url   the url where the page contents came from,
  *    used to canonicalize relative links
  *
  * @return array  a summary of the contents of the page
  *
  */
 function process($page, $url)
 {
     $summary = NULL;
     if (is_string($page)) {
         $dom = self::dom($page);
         if ($dom !== false) {
             // A <feed> root element indicates an Atom feed rather than RSS
             $atom = false;
             $feed_nodes = $dom->getElementsByTagName('feed');
             if ($feed_nodes->length > 0) {
                 $atom = true;
             }
             $summary[self::TITLE] = self::title($dom, $atom);
             $summary[self::DESCRIPTION] = self::description($dom, $atom);
             $summary[self::LANG] = self::lang($dom, $summary[self::DESCRIPTION]);
             $summary[self::LINKS] = self::links($dom, $url, $atom);
             if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE]) == 0 && count($summary[self::LINKS]) == 0) {
                 //maybe not rss or atom? treat as text still try to get urls
                 $summary = parent::process($page, $url);
             }
         }
     }
     return $summary;
 }