public function translateTemplate($untranslated, $moduleName, $templateName,
    $language = null)
{
    // $translations is expected to be an array
    $translations = Locator::getInstance()->getTemplateTranslation(
        $moduleName, $templateName, $language);
    if (!is_array($translations)) {
        // Fall back to an empty translation set rather than passing a
        // non-array on to the text variation routine
        $translations = array();
    }
    return TextProcessor::doTextVariation($untranslated, '[[', ']]',
        $translations);
}
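// Usage sketch (hedged): the host object and template names below are
// hypothetical, and doTextVariation() is assumed to substitute [[key]]
// placeholders using the translations array:
$view = new TemplateView(); // hypothetical host class
$html = $view->translateTemplate("<h1>[[page_title]]</h1>",
    "search", "results", "fr-FR");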
/**
 * Output information about a variable
 *
 * Context-sensitive replacement for the native var_dump() function.
 * If the xdebug extension is installed or if text/plain HTTP content
 * headers have been sent then var_dump() is used as is. Otherwise the
 * var_dump() call is wrapped with <pre> container tags.
 *
 * @param mixed $var
 */
public static function dump($var)
{
    // Check if it's actually okay to output anything
    if (true === DEBUG) {
        $text = new TextProcessor();
        // Don't want to shoot our bolt too soon
        ob_start();
        if (extension_loaded('xdebug')) {
            // Bliss, nothing extra for us to do
            var_dump($var);
        } elseif (headers_sent()) {
            $http = headers_list();
            // Try to discover what kind of context we're in
            foreach ($http as $index => $header) {
                list($key, $val) = explode(': ', $header);
                if ('Content-type' == $key) {
                    if ('text/plain' == substr($val, 0, 10)) {
                        var_dump($var);
                    } elseif ('text/html' == substr($val, 0, 9)) {
                        echo $text->wrapWithTag($var, 'pre');
                    }
                    break;
                }
            }
        }
        // Gather what output we may already have
        $info = ob_get_clean();
        if (false === empty($info)) {
            // We have output already
            echo $info;
        } else {
            if ('text/html' == ini_get('default_mimetype')) {
                // If the default mimetype is HTML
                echo $text->wrapWithTag($var, 'pre');
            } else {
                // Tried everything else, resort to a simple var_dump
                var_dump($var);
            }
        }
    }
}
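// Usage sketch (hedged): assumes dump() is declared on a debug utility
// class, here called Debug, and that the site defines a DEBUG constant:
define('DEBUG', true);
Debug::dump(array('query' => 'yioop', 'results' => 10));
// In an HTML context the output arrives wrapped in <pre> tags; if the
// default mimetype is not HTML it falls back to a plain var_dump().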
/**
 * Used to extract the title, description and links from
 * a string consisting of Word Doc data (2004 or earlier).
 *
 * @param string $page the web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $text = "";
    if (is_string($page)) {
        $text = self::extractASCIIText($page);
    }
    if ($text == "") {
        // No printable text recovered; summarize the URL string instead
        $text = $url;
    }
    $summary = parent::process($text, $url);
    return $summary;
}
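// Usage sketch (hedged): the class name and constructor argument are
// assumptions based on how processors are instantiated elsewhere:
$processor = new DocProcessor(array());
$summary = $processor->process(file_get_contents("report.doc"),
    "http://example.com/report.doc");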
/**
 * Used to extract the title, description and links from
 * a string consisting of sitemap data.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        $dom = self::dom($page);
        if ($dom !== false) {
            $summary[self::TITLE] = $url;
            $summary[self::DESCRIPTION] = "Sitemap of " . $url;
            $summary[self::LANG] = "en-US";
            $summary[self::LINKS] = self::links($dom, $url);
            if (strlen($summary[self::DESCRIPTION] .
                $summary[self::TITLE]) == 0 &&
                count($summary[self::LINKS]) == 0) {
                // maybe not a sitemap? treat as text, still try to get urls
                $summary = parent::process($page, $url);
            }
            $summary[self::JUST_METAS] = true;
        } else {
            $summary = parent::process($page, $url);
            $summary[self::JUST_METAS] = true;
        }
    }
    return $summary;
}
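// Usage sketch (hedged): class name and constructor argument assumed;
// the array keys come from the constants used in process() above:
$processor = new SitemapProcessor(array());
$summary = $processor->process($sitemap_xml,
    "http://example.com/sitemap.xml");
// $summary[SitemapProcessor::LINKS] holds the extracted URLs and
// $summary[SitemapProcessor::JUST_METAS] flags that only metadata
// was kept for indexing.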
/**
 * Returns summary of body of a web page based on crude regex matching
 * used as a fall back if dom parsing did not work.
 *
 * @param string $page to extract description from
 * @return string a description of the page
 */
static function crudeDescription($page)
{
    // getBetweenTags() returns an array whose index 1 holds the text
    // found between the two tags
    $body = parent::getBetweenTags($page, 0, "<body", "</body");
    // Insert a space before each tag so words do not run together
    // once the tags are stripped
    $body = preg_replace("/\\</", " <", $body[1]);
    $body = strip_tags("<body" . $body . "</body>");
    if ($body == "") {
        return $body;
    }
    // Collapse runs of whitespace into single spaces
    $body = preg_replace("/\\s+/", " ", $body);
    return mb_substr($body, 0, self::$max_description_len);
}
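// Illustration (hedged, class name assumed): run the regex fallback
// on a small fragment:
$html = "<html><body><p>Breaking</p><p>news today</p></body></html>";
echo HtmlProcessor::crudeDescription($html);
// Prints roughly "Breaking news today", with tags stripped and
// whitespace collapsed.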
/**
 * Parses a mediawiki document to produce an HTML equivalent
 *
 * @param string $document a document which might have mediawiki markup
 * @param bool $parse_head_vars header variables are an extension of
 *     mediawiki syntax used to add meta variables and titles to
 *     the head tag of an html document. This flag controls whether to
 *     support this extension or not
 * @param bool $handle_big_files for indexing purposes Yioop by default
 *     truncates long documents before indexing them. If true, this
 *     method does not do this default truncation. The true value
 *     is more useful when using Yioop's built-in wiki.
 * @return string HTML document obtained by parsing mediawiki
 *     markup in $document
 */
function parse($document, $parse_head_vars = true, $handle_big_files = false)
{
    $head = "";
    $page_type = "standard";
    $head_vars = array();
    $draw_toc = true;
    if ($parse_head_vars && !$this->minimal) {
        $document_parts = explode("END_HEAD_VARS", $document);
        if (count($document_parts) > 1) {
            $head = $document_parts[0];
            $document = $document_parts[1];
            // Head variables are key=value lines separated by blank lines
            $head_lines = preg_split("/\n\\s*\n/", $head);
            foreach ($head_lines as $line) {
                // A trailing semicolon, if any, ends the value
                $semi_pos = (($pos = strpos($line, ";")) !== false) ?
                    $pos : strlen($line);
                $line = substr($line, 0, $semi_pos);
                $line_parts = explode("=", $line);
                if (count($line_parts) == 2) {
                    $head_vars[trim(addslashes($line_parts[0]))] =
                        addslashes(trim($line_parts[1]));
                }
            }
            if (isset($head_vars['page_type'])) {
                $page_type = $head_vars['page_type'];
            }
            if (isset($head_vars['toc'])) {
                $draw_toc = $head_vars['toc'];
            }
        }
    }
    // Shield <nowiki> and <pre> contents from the wiki regexes
    $document = preg_replace_callback("/<nowiki>(.+?)<\\/nowiki>/s",
        "base64EncodeCallback", $document);
    $document = preg_replace_callback("/<pre>(.+?)<\\/pre>/s",
        "spaceEncodeCallback", $document);
    if (!$this->minimal) {
        if ($draw_toc && $page_type != "presentation") {
            $toc = $this->makeTableOfContents($document);
        }
        list($document, $references) = $this->makeReferences($document);
    }
    $document = preg_replace_callback('/(\\A|\\n){\\|(.*?)\\n\\|}/s',
        "makeTableCallback", $document);
    if ($page_type == "presentation") {
        // Lines consisting of .... delimit the slides of a presentation
        $lines = explode("\n", $document);
        $out_document = "";
        $slide = "";
        $div = "<div class='slide'>";
        foreach ($lines as $line) {
            if (trim($line) == "....") {
                $slide = $this->processRegexes($slide);
                $out_document .= $div .
                    $this->cleanLinksAndParagraphs($slide) . "</div>";
                $slide = "";
            } else {
                $slide .= $line . "\n";
            }
        }
        $document = $out_document . $div .
            $this->processRegexes($slide) . "</div>";
    } elseif ($handle_big_files) {
        $document = $this->processRegexes($document);
        $document = $this->cleanLinksAndParagraphs($document);
    } else {
        if (strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) {
            $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN);
        }
        $document = $this->processRegexes($document);
        $document = $this->cleanLinksAndParagraphs($document);
    }
    if (!$this->minimal) {
        $document = $this->insertReferences($document, $references);
        if ($draw_toc && $page_type != "presentation") {
            $document = $this->insertTableOfContents($document, $toc);
        }
    }
    $document = preg_replace_callback("/<nowiki>(.+?)<\\/nowiki>/s",
        "base64DecodeCallback", $document);
    if ($head != "" && $parse_head_vars) {
        $document = $head . "END_HEAD_VARS" . $document;
    }
    if (!$handle_big_files &&
        strlen($document) > 0.9 * MAX_GROUP_PAGE_LEN) {
        $document = substr($document, 0, 0.9 * MAX_GROUP_PAGE_LEN);
        TextProcessor::closeDanglingTags($document);
        $document .= "...";
    }
    return $document;
}
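// Usage sketch (hedged): class name is an assumption, and the parser
// is assumed to be configured with minimal == false. A page with head
// variables separates key=value lines by blank lines and closes the
// block with END_HEAD_VARS:
$wiki_page = "page_type=standard\n\ntoc=true\n\nEND_HEAD_VARS" .
    "==Introduction==\n'''Bold''' mediawiki text.";
$parser = new WikiParser();
$html = $parser->parse($wiki_page);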
/**
 * Used to extract the title, description and links from
 * a string consisting of an svg image. If the image is small
 * enough, an attempt is made to generate a thumbnail
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        self::closeDanglingTags($page);
        $dom = self::dom($page);
        if ($dom !== false && isset($dom->documentElement)) {
            $summary[self::TITLE] = "";
            $summary[self::DESCRIPTION] = self::description($dom);
            $summary[self::LINKS] = array();
            $summary[self::PAGE] =
                "<html><body><div><img src='data:image/svg+xml;base64," .
                base64_encode($page) . "' alt='" .
                $summary[self::DESCRIPTION] . "' /></div></body></html>";
            if (strlen($page) < self::MAX_THUMB_LEN) {
                $thumb_string = self::createThumb($dom);
                $summary[self::THUMB] = 'data:image/svg+xml;base64,' .
                    base64_encode($thumb_string);
            }
        } else {
            $summary = parent::process($page, $url);
        }
    }
    return $summary;
}
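// Usage sketch (hedged): class name and constructor argument assumed.
// A small inline SVG is embedded into the summary page as a base64
// data URI, and a thumbnail is attempted since the source is short:
$svg = "<svg xmlns='http://www.w3.org/2000/svg' width='8' height='8'>" .
    "<rect width='8' height='8'/></svg>";
$processor = new SvgProcessor(array());
$summary = $processor->process($svg, "http://example.com/dot.svg");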
/**
 * Used to extract the title, description and links from
 * a string consisting of xml data, by dispatching on the root
 * element name to a more specialized processor.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        self::closeDanglingTags($page);
        $dom = self::dom($page);
        $root_name = isset($dom->documentElement->nodeName) ?
            $dom->documentElement->nodeName : "";
        unset($dom);
        // Map root element names to the processors that handle them
        $XML_PROCESSORS = array(
            "rss" => "RssProcessor",
            "html" => "HtmlProcessor",
            "sitemapindex" => "SitemapProcessor",
            "urlset" => "SitemapProcessor",
            "svg" => "SvgProcessor"
        );
        if (isset($XML_PROCESSORS[$root_name])) {
            $processor_name = $XML_PROCESSORS[$root_name];
            $processor = new $processor_name($this->plugin_instances);
            $summary = $processor->process($page, $url);
        } else {
            $summary = parent::process($page, $url);
        }
    }
    return $summary;
}
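// Dispatch sketch (hedged): class name and constructor argument
// assumed. Here the root element "rss" routes the page to
// RssProcessor::process():
$xml_processor = new XmlProcessor(array());
$summary = $xml_processor->process(
    "<?xml version='1.0'?><rss version='2.0'><channel></channel></rss>",
    "http://example.com/feed.xml");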
/**
 * Computes a summary based on a string of a binary Powerpoint document
 * (as opposed to the modern xml powerpoint format).
 *
 * Text is extracted from the Powerpoint document using a crude finite
 * state machine that was developed by looking at a few Powerpoint
 * documents in a Hex editor. Then the TextProcessor::process() method
 * is used to make a summary
 *
 * @param string $page string of a Powerpoint document
 * @param string $url location the document came from, not used by
 *     TextProcessor at this point. Some of its subclasses override
 *     this method and use url to produce complete links for
 *     relative links within a document
 *
 * @return array a summary of (title, description, links, and content)
 *     of the information in $page
 */
function process($page, $url)
{
    $text = "";
    if (is_string($page)) {
        $text_objects = array();
        $cur_id = 0;
        $state = self::PPT_IGNORING;
        $cur_char_pos = 0;
        $len = strlen($page);
        while ($cur_char_pos < $len) {
            $ascii = ord($page[$cur_char_pos]);
            switch ($state) {
                case self::PPT_IGNORING:
                    // Look for the first of two consecutive zero bytes
                    if ($ascii == 0) {
                        $state = self::ZEROONE_IGNORING;
                    }
                    break;
                case self::ZEROONE_IGNORING:
                    if ($ascii == 0) {
                        $state = self::ZEROTWO_IGNORING;
                    } else {
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::ZEROTWO_IGNORING:
                    // 0xA8 after two zero bytes signals a text segment
                    if ($ascii == 168) {
                        $state = self::FIRST_CHAR_TEXT_SEG;
                    } elseif ($ascii != 0) {
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::FIRST_CHAR_TEXT_SEG:
                    if ($ascii == 15) {
                        // The next four bytes hold the segment length
                        $state = self::READ_LEN_TEXT_SEG;
                        $text_len = 0;
                        $text_len_pos = 0;
                    } else {
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::READ_LEN_TEXT_SEG:
                    if ($text_len_pos < 4) {
                        // Accumulate a little-endian 32-bit length
                        $text_len += $ascii << $text_len_pos * 8;
                        $text_len_pos++;
                    } else {
                        $state = self::SCAN_TEXT_SEG;
                        $scan_text_pos = 0;
                        $out_text = chr($ascii);
                    }
                    break;
                case self::SCAN_TEXT_SEG:
                    // "lick to edit" matches both "Click" and "click";
                    // this boilerplate marks template text, so stop
                    if (strpos($out_text,
                        "lick to edit Master title style") > 0) {
                        $state = self::ALWAYS_IGNORE;
                    } elseif ($scan_text_pos < $text_len) {
                        // Keep printable ASCII characters and newlines
                        if ($ascii >= 32 && $ascii <= 126 ||
                            $ascii == 10) {
                            $out_text .= chr($ascii);
                            $scan_text_pos++;
                        }
                    } else {
                        $text_objects[$cur_id] = $out_text;
                        $cur_id++;
                        $state = self::PPT_IGNORING;
                    }
                    break;
                case self::ALWAYS_IGNORE:
                    break;
            }
            $cur_char_pos++;
        }
        $text = implode("\n", $text_objects);
    }
    if ($text == "") {
        $text = $url;
    }
    $summary = parent::process($text, $url);
    return $summary;
}
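// Illustration (hedged, reconstructed from the states above rather
// than from an official format spec): a text segment roughly looks
// like 0x00 0x00 0xA8 0x0F, then a 4-byte little-endian length, then
// the text bytes themselves:
$segment = "\x00\x00\xA8\x0F" . pack("V", 12) . "Hello slides";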
/**
 * Used to extract the title, description and links from
 * a string consisting of rss or atom news feed data.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 */
function process($page, $url)
{
    $summary = NULL;
    if (is_string($page)) {
        $dom = self::dom($page);
        if ($dom !== false) {
            // Atom feeds have a <feed> root element rather than <rss>
            $atom = false;
            $feed_nodes = $dom->getElementsByTagName('feed');
            if ($feed_nodes->length > 0) {
                $atom = true;
            }
            $summary[self::TITLE] = self::title($dom, $atom);
            $summary[self::DESCRIPTION] = self::description($dom, $atom);
            $summary[self::LANG] = self::lang($dom,
                $summary[self::DESCRIPTION]);
            $summary[self::LINKS] = self::links($dom, $url, $atom);
            if (strlen($summary[self::DESCRIPTION] .
                $summary[self::TITLE]) == 0 &&
                count($summary[self::LINKS]) == 0) {
                // maybe not rss or atom? treat as text, still try
                // to get urls
                $summary = parent::process($page, $url);
            }
        }
    }
    return $summary;
}
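// Usage sketch (hedged): class name and constructor argument assumed.
// The same method handles both <rss> and Atom <feed> documents:
$processor = new RssProcessor(array());
$summary = $processor->process(
    "<?xml version='1.0'?><feed xmlns='http://www.w3.org/2005/Atom'>" .
    "<title>Example</title></feed>",
    "http://example.com/atom.xml");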