/**
 * {@inheritdoc}
 */
public function sanitise($value, $isWysiwyg = false)
{
    if ($isWysiwyg) {
        $tagWhitelist = $this->getWyswigAllowedTags();
    } else {
        $tagWhitelist = $this->getAllowedTags();
    }

    // Determine up front whether decoding HTML entities would alter the raw
    // input, i.e. whether it already carries encoded entities. According to
    // the original author's note, the sanitiser converts characters to
    // entities in the cleaned HTML when they aren't present yet, so for
    // entity-free input the output is decoded again below.
    // Ideally this would be fixed upstream via \DomDocument::substituteEntities,
    // but that setting is disregarded in PHP's implementation at least, which
    // leaves this crude, albeit contained, workaround.
    $inputAlteredByDecoding = $value !== html_entity_decode($value, ENT_NOQUOTES);

    $cleaner = new Maid([
        'output-format'   => 'html',
        'allowed-tags'    => $tagWhitelist,
        'allowed-attribs' => $this->getAllowedAttributes(),
    ]);
    $cleaned = $cleaner->clean($value);

    if ($inputAlteredByDecoding) {
        // Decoding changed the input, so the cleaned HTML is returned as is.
        return $cleaned;
    }

    return html_entity_decode($cleaned, ENT_NOQUOTES);
}
/**
 * Builds a cleaned, CDATA-wrapped HTML string from one or more fields of a
 * record. Every requested field that exists in the record's values is run
 * through Maid with a fixed whitelist of tags and attributes, and the cleaned
 * pieces are concatenated. Can optionally also shorten the combined result
 * with Html::trimText().
 *
 * @param Content      $record        Bolt Content object
 * @param string|array $fields        Comma separated list (or array) of fields to clean up
 * @param integer      $excerptLength When greater than 0, the length handed to Html::trimText()
 *
 * @return \Twig_Markup The cleaned content wrapped in a CDATA section
 */
public function ampSafe($record, $fields = '', $excerptLength = 0)
{
    // Normalise the field list to an array of trimmed names, even for a single field.
    $fieldNames = is_array($fields) ? $fields : explode(',', $fields);
    $fieldNames = array_map('trim', $fieldNames);

    $cleaned = '';
    foreach ($fieldNames as $fieldName) {
        if (array_key_exists($fieldName, $record->values)) {
            // Neither 'style' nor 'script' is on the tag whitelist given to the cleaner.
            $cleaner = new Maid([
                'output-format'   => 'html',
                'allowed-tags'    => [
                    'a', 'b', 'br', 'hr', 'h1', 'h2', 'h3', 'h4', 'p', 'strong',
                    'em', 'i', 'u', 'strike', 'ul', 'ol', 'li', 'img',
                ],
                'allowed-attribs' => ['id', 'class', 'name', 'value', 'href', 'src'],
            ]);
            $cleaned .= $cleaner->clean($record->values[$fieldName]);
        }
    }

    if ($excerptLength > 0) {
        $cleaned = Html::trimText($cleaned, $excerptLength);
    }

    return new \Twig_Markup('<![CDATA[ ' . $cleaned . ' ]]>', 'utf-8');
}
/**
 * Truncate a given HTML fragment to the desired length (measured as character
 * count), additionally performing some cleanup.
 *
 * @param string $html          The HTML fragment to clean up
 * @param int    $desiredLength The desired number of characters, or NULL to do
 *                              just the cleanup (but no truncating).
 * @param string $ellipseStr    If non-empty, this string will be appended to the
 *                              last collected node when the document gets
 *                              truncated.
 * @param bool   $stripTags     If TRUE, remove *all* HTML tags. Otherwise, keep a
 *                              whitelisted 'safe' set.
 * @param bool   $nbsp          If TRUE, convert all whitespace runs to non-breaking
 *                              spaces ('&nbsp;' entities).
 *
 * @return string The cleaned (and possibly truncated) HTML fragment
 */
function trimToHTML($html, $desiredLength = null, $ellipseStr = "…", $stripTags = false, $nbsp = false)
{
    // We'll use htmlmaid to clean up the HTML, but because we also have to
    // step through the DOM ourselves to perform the trimming, we'll do
    // the DOM loading ourselves, rather than leave it to Maid.

    // Do not load external entities - this would be a security risk.
    $prevEntityLoaderDisabled = libxml_disable_entity_loader(true);
    // Don't crash on invalid HTML, but recover gracefully
    $prevInternalErrors = libxml_use_internal_errors(true);

    $doc = new \DOMDocument();
    // We need a bit of wrapping here to keep DOMDocument from adding rogue nodes
    // around our HTML. By doing it explicitly, we keep things under control.
    $doc->loadHTML('<!DOCTYPE html><html>' .
        '<head><meta http-equiv="Content-type" content="text/html;charset=utf-8"/></head>' .
        '<body><div>' . $html . '</div></body>' .
        '</html>');

    $options = array();
    if ($stripTags) {
        $options['allowed-tags'] = array();
    } else {
        $options['allowed-tags'] = array('a', 'div', 'p', 'b', 'i', 'hr', 'br', 'strong', 'em');
    }
    $options['allowed-attribs'] = array('href', 'src', 'id', 'class', 'style');
    $maid = new Maid($options);
    // html -> head -> (next sibling) body -> wrapper div
    $cleanedNodes = $maid->clean($doc->documentElement->firstChild->nextSibling->firstChild);

    // To collect the cleaned nodes from a node list into a containing node,
    // we have to create yet another document, because cloning nodes inside
    // the same ownerDocument for some reason modifies our node list.
    // I have no idea why, but it does.
    $cleanedDoc = new \DOMDocument();
    $cleanedNode = $cleanedDoc->createElement('div');
    $length = $cleanedNodes->length;
    for ($i = 0; $i < $length; ++$i) {
        $node = $cleanedNodes->item($i);
        $cnode = $cleanedDoc->importNode($node, true);
        $cleanedNode->appendChild($cnode);
    }

    // And now we'll create yet another document (who's keeping count?) to
    // collect our trimmed nodes.
    $newDoc = new \DOMDocument();
    // Again, some wrapping is necessary here...
    $newDoc->loadHTML('<html><body><div></div></body></html>');
    $newNode = $newDoc->documentElement->firstChild->firstChild;

    $length = $desiredLength;
    _collectNodesUpToLength($cleanedNode, $newNode, $length, $ellipseStr);

    // Convert spaces inside text nodes to &nbsp;
    // This will actually insert the unicode non-breaking space, so we'll have
    // to massage our output at the HTML byte-string level later.
    if ($nbsp) {
        domSpacesToNBSP($newNode->firstChild->firstChild);
    }

    // This is some terrible shotgun hacking; for some reason, the above code
    // will sometimes put our desired nodes two levels deep, but in other
    // cases, it'll descend one less level. The proper solution would be
    // to sort out why this is, but for now, just detecting which of the
    // two happened seems to work well enough.
    if (isset($newNode->firstChild->firstChild->childNodes)) {
        $nodes = $newNode->firstChild->firstChild->childNodes;
    } elseif (isset($newNode->firstChild->childNodes)) {
        $nodes = $newNode->firstChild->childNodes;
    } else {
        $nodes = array();
    }

    // And now we convert our target nodes to HTML.
    // Because we don't want any of the wrapper nodes to appear in the
    // output, we'll have to convert them one by one and concatenate the
    // HTML.
    $result = '';
    foreach ($nodes as $node) {
        $result .= Maid::renderFragment($node);
    }

    if ($nbsp) {
        // Turn literal U+00A0 characters back into the '&nbsp;' entity. The
        // needle is decoded explicitly as UTF-8 so it is the two-byte
        // sequence regardless of PHP's default charset; any '&nbsp;' that is
        // already rendered as an entity is left untouched.
        $result = str_replace(html_entity_decode('&nbsp;', ENT_COMPAT, 'UTF-8'), '&nbsp;', $result);
    }

    // Restore previous libxml settings
    libxml_disable_entity_loader($prevEntityLoaderDisabled);
    libxml_use_internal_errors($prevInternalErrors);

    return $result;
}
/**
 * Get the decoded version of a value of the current object.
 *
 * The decoding applied depends on the field type reported by fieldtype():
 * markdown is pre-parsed, rendered to HTML and cleaned with Maid; html, text
 * and textarea are pre-parsed and wrapped as markup; imagelist and filelist
 * strings are JSON-decoded; image arrays yield their 'file' entry; anything
 * else is returned untouched.
 *
 * @param string $name name of the value to get
 *
 * @return mixed The decoded value or null when no value available
 */
public function getDecodedValue($name)
{
    // Nothing stored under this name: nothing to decode.
    if (!isset($this->values[$name])) {
        return null;
    }

    $type = $this->fieldtype($name);
    $info = $this->fieldinfo($name);
    $twigAllowed = !empty($info['allowtwig']);

    switch ($type) {
        case 'markdown':
            $source = $this->preParse($this->values[$name], $twigAllowed);

            // Render the Markdown source to HTML.
            $html = $this->app['markdown']->text($source);

            // Whitelists come from config, with built-in fallbacks when empty.
            $cleanerConfig = $this->app['config']->get('general/htmlcleaner');
            if (!empty($cleanerConfig['allowed_tags'])) {
                $tagWhitelist = $cleanerConfig['allowed_tags'];
            } else {
                $tagWhitelist = [
                    'div', 'p', 'br', 'hr', 's', 'u', 'strong', 'em', 'i', 'b',
                    'li', 'ul', 'ol', 'blockquote', 'pre', 'code', 'tt',
                    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd', 'dl', 'dt',
                    'table', 'tbody', 'thead', 'tfoot', 'th', 'td', 'tr', 'a', 'img',
                ];
            }
            if (!empty($cleanerConfig['allowed_attributes'])) {
                $attributeWhitelist = $cleanerConfig['allowed_attributes'];
            } else {
                $attributeWhitelist = ['id', 'class', 'name', 'value', 'href', 'src'];
            }

            // Sanitize/clean the rendered HTML before wrapping it as markup.
            $cleaner = new Maid([
                'output-format'   => 'html',
                'allowed-tags'    => $tagWhitelist,
                'allowed-attribs' => $attributeWhitelist,
            ]);

            return new \Twig_Markup($cleaner->clean($html), 'UTF-8');

        case 'html':
        case 'text':
        case 'textarea':
            $parsed = $this->preParse($this->values[$name], $twigAllowed);

            return new \Twig_Markup($parsed, 'UTF-8');

        case 'imagelist':
        case 'filelist':
            $stored = $this->values[$name];

            // A string is treated as JSON and decoded; anything else is passed through.
            return is_string($stored) ? json_decode($stored) : $stored;

        case 'image':
            $stored = $this->values[$name];

            // Arrays carrying a 'file' entry yield just that entry.
            return (is_array($stored) && isset($stored['file'])) ? $stored['file'] : $stored;

        default:
            return $this->values[$name];
    }
}
/**
 * Formats the given string as Markdown in HTML.
 *
 * The rendered HTML is cleaned with Maid. The tag and attribute whitelists
 * are read from the 'general/htmlcleaner' config ('allowed_tags' and
 * 'allowed_attributes'), falling back to built-in defaults when empty.
 *
 * @param string $content
 *
 * @return string Markdown output
 */
public function markdown($content)
{
    // Parse the field as Markdown, return HTML
    $output = $this->app['markdown']->text($content);

    $config = $this->app['config']->get('general/htmlcleaner');

    // The default list uses 'dt' (definition term); the previous 'dh' entry
    // is not an HTML element, so <dt> was missing from the whitelist.
    $allowed_tags = !empty($config['allowed_tags']) ? $config['allowed_tags'] : [
        'div', 'p', 'br', 'hr', 's', 'u', 'strong', 'em', 'i', 'b',
        'li', 'ul', 'ol', 'blockquote', 'pre', 'code', 'tt',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd', 'dl', 'dt',
        'table', 'tbody', 'thead', 'tfoot', 'th', 'td', 'tr', 'a', 'img',
    ];
    $allowed_attributes = !empty($config['allowed_attributes'])
        ? $config['allowed_attributes']
        : ['id', 'class', 'name', 'value', 'href', 'src'];

    // Sanitize/clean the HTML.
    $maid = new Maid([
        'output-format'   => 'html',
        'allowed-tags'    => $allowed_tags,
        'allowed-attribs' => $allowed_attributes,
    ]);

    return $maid->clean($output);
}
/**
 * {@inheritdoc}
 */
public function sanitise($value, $isWysiwyg = false)
{
    // WYSIWYG input gets its own tag whitelist; everything else uses the regular one.
    if ($isWysiwyg) {
        $tagWhitelist = $this->getWyswigAllowedTags();
    } else {
        $tagWhitelist = $this->getAllowedTags();
    }

    $cleaner = new Maid([
        'output-format'   => 'html',
        'allowed-tags'    => $tagWhitelist,
        'allowed-attribs' => $this->getAllowedAttributes(),
    ]);
    $cleaned = $cleaner->clean($value);

    return $cleaned;
}