/**
 * Feeds several malformed HTML fragments to SS_HTMLValue and checks that
 * the href of the first anchor can still be read back from each of them.
 */
public function testInvalidHTMLTagNames()
{
    $malformedSamples = array(
        '<p><div><a href="test-link"></p></div>',
        '<html><div><a href="test-link"></a></a></html_>',
        '""\'\'\'"""\'""<<<>/</<htmlbody><a href="test-link"<<>'
    );

    $htmlValue = new SS_HTMLValue();

    foreach ($malformedSamples as $markup) {
        $htmlValue->setContent($markup);

        // Only the first anchor in the parsed document is inspected.
        $firstAnchor = $htmlValue->getElementsByTagName('a')->item(0);

        $this->assertEquals(
            'test-link',
            $firstAnchor->getAttribute('href'),
            'Link data can be extraced from malformed HTML'
        );
    }
}
/**
 * Content mixing the platform's native line ending with an explicit CR-LF
 * must come back from SS_HTMLValue with every line ending as a plain LF.
 */
public function testMixedNewlines()
{
    $lf = "\n";
    // PHP_EOL is the native EOL for the platform: CR-LF on Windows, LF on UNIX.
    $nativeEol = PHP_EOL;

    $input = "<p>paragraph</p>{$nativeEol}<ul><li>1</li>\r\n</ul>";
    $expected = "<p>paragraph</p>{$lf}<ul><li>1</li>{$lf}</ul>";

    $htmlValue = new SS_HTMLValue();
    $htmlValue->setContent($input);

    $this->assertEquals($expected, $htmlValue->getContent(), 'Newlines get converted');
}
/**
 * Rewrites the links in the owner email's body so that they point at
 * tracking URLs, then stores the rewritten markup back on the email.
 *
 * Links whose href contains a template keyword ("{$") or "mailto:" are
 * left untouched. One Newsletter_TrackedLink record is looked up (or
 * created and written) per distinct href. When the recipient address
 * matches a Member record, that member's NewsletterTrackingToken is
 * included in the tracking URL.
 */
public function onBeforeSend()
{
    $email = $this->owner;
    $newsletter = $email->Newsletter();
    $body = new SS_HTMLValue($email->Body()->forTemplate());

    if (!$body || !$newsletter) {
        return;
    }

    $recipient = null;
    if ($email->To()) {
        $recipientFilter = sprintf('"Email" = \'%s\'', Convert::raw2sql($email->To()));
        $recipient = DataObject::get_one('Member', $recipientFilter);
    }

    // Pass 1: group the anchor elements by the href they point at, so each
    // distinct link is only resolved to a tracking record once.
    $anchorsByHref = array();
    foreach ($body->getElementsByTagName('a') as $anchor) {
        $href = $anchor->getAttribute('href');

        $hasKeyword = strpos($href, '{$') !== false;
        $isMailto = strpos($href, 'mailto:') !== false;
        if ($hasKeyword || $isMailto) {
            // ignore links with keywords
            continue;
        }

        if (!isset($anchorsByHref[$href])) {
            $anchorsByHref[$href] = array();
        }
        $anchorsByHref[$href][] = $anchor;
    }

    // Pass 2: find or create the tracking record for each href and point
    // every anchor that used it at the tracking URL instead.
    foreach ($anchorsByHref as $href => $anchors) {
        $trackFilter = sprintf(
            '"NewsletterID" = %d AND "Original" = \'%s\'',
            $newsletter->ID,
            Convert::raw2sql($href)
        );
        $tracked = DataObject::get_one('Newsletter_TrackedLink', $trackFilter);

        if (!$tracked) {
            $tracked = new Newsletter_TrackedLink();
            $tracked->Original = $href;
            $tracked->NewsletterID = $newsletter->ID;
            $tracked->write();
        }

        // Member-specific URLs carry the member's tracking token as an
        // extra path segment ahead of the link hash.
        $trackingUrl = $recipient
            ? Controller::join_links(
                Director::baseURL(),
                'newsletter-link',
                $recipient->NewsletterTrackingToken,
                $tracked->Hash
            )
            : Controller::join_links(Director::baseURL(), 'newsletter-link', $tracked->Hash);

        foreach ($anchors as $anchor) {
            $anchor->setAttribute('href', $trackingUrl);
        }
    }

    $rewrittenHtml = $body->getDocument()->saveHTML();
    $email->setBody(DBField::create('HTMLText', $rewrittenHtml));
}
/**
 * Strips every element and attribute below <body> that the configured
 * whitelist rules do not allow, modifying the given document in place.
 *
 * Disallowed <script> and <style> elements are removed together with their
 * contents; any other disallowed element is replaced by its own children.
 * Allowed elements get their non-whitelisted attributes removed, then the
 * rule's default and forced attributes applied. Nothing happens when no
 * element rules and no element patterns are set on this instance.
 *
 * @param SS_HTMLValue $html - The HTMLValue to remove any non-whitelisted elements & attributes from
 */
public function sanitise(SS_HTMLValue $html)
{
    if (!$this->elements && !$this->elementPatterns) {
        return;
    }

    $document = $html->getDocument();

    foreach ($html->query('//body//*') as $node) {
        $rule = $this->getRuleForElement($node->tagName);

        if (!$this->elementMatchesRule($node, $rule)) {
            // Scripts and styles are dropped wholesale, contents included.
            if (in_array($node->tagName, array('script', 'style'), true)) {
                $node->parentNode->removeChild($node);
                continue;
            }

            // Anything else is unwrapped: move its children into a fragment
            // and put that fragment where the element used to be.
            $fragment = $document->createDocumentFragment();
            while ($node->firstChild) {
                $fragment->appendChild($node->firstChild);
            }
            $node->parentNode->replaceChild($fragment, $node);
            continue;
        }

        // The element is allowed. Pad it first if the rule asks for empty
        // elements to be padded.
        if ($rule->paddEmpty && !$node->firstChild) {
            $node->nodeValue = ' ';
        }

        // Walk the attributes from last to first so that removing one does
        // not shift the positions of those still to be visited.
        $attributes = $node->attributes;
        for ($index = $attributes->length - 1; $index >= 0; $index--) {
            $attribute = $attributes->item($index);
            $attributeRule = $this->getRuleForAttribute($rule, $attribute->name);

            if (!$this->attributeMatchesRule($attribute, $attributeRule)) {
                $node->removeAttributeNode($attribute);
            }
        }

        // Defaults only fill in attributes whose current value is falsy.
        foreach ($rule->attributesDefault as $name => $defaultValue) {
            if (!$node->getAttribute($name)) {
                $node->setAttribute($name, $defaultValue);
            }
        }

        // Forced attributes always overwrite whatever is there.
        foreach ($rule->attributesForced as $name => $forcedValue) {
            $node->setAttribute($name, $forcedValue);
        }
    }
}
/**
 * Finds the links that are of interest for the link tracking automation. Checks for brokenness and attaches
 * extracted metadata so consumers can decide what to do with the DOM element (provided as DOMReference).
 *
 * Anchors without an href attribute are skipped, and links that match none of the recognised forms
 * (empty or root-relative href, [sitetree_link], [file_link], local "#anchor") produce no result entry.
 *
 * Anchor names taken from the href are escaped with preg_quote() before being embedded in the
 * name="..."/id="..." lookup patterns, so anchors containing regex metacharacters or the pattern
 * delimiter are matched literally instead of corrupting the pattern.
 *
 * @param SS_HTMLValue $htmlValue Object to parse the links from.
 * @return array Associative array containing found links with the following field layout:
 *		Type: string, name of the link type
 *		Target: any, a reference to the target object, depends on the Type
 *		Anchor: string, anchor part of the link
 *		DOMReference: DOMElement, reference to the link to apply changes.
 *		Broken: boolean, a flag highlighting whether the link should be treated as broken.
 */
public function process(SS_HTMLValue $htmlValue)
{
    $results = array();

    $links = $htmlValue->getElementsByTagName('a');
    if (!$links) {
        return $results;
    }

    foreach ($links as $link) {
        if (!$link->hasAttribute('href')) {
            continue;
        }

        $href = Director::makeRelative($link->getAttribute('href'));

        // Definitely broken links.
        if ($href == '' || $href[0] == '/') {
            $results[] = array(
                'Type' => 'broken',
                'Target' => null,
                'Anchor' => null,
                'DOMReference' => $link,
                'Broken' => true
            );
            continue;
        }

        // Link to a page on this site.
        $matches = array();
        if (preg_match('/\\[sitetree_link(?:\\s*|%20|,)?id=([0-9]+)\\](#(.*))?/i', $href, $matches)) {
            $anchor = empty($matches[3]) ? null : $matches[3];
            $page = DataObject::get_by_id('SiteTree', $matches[1]);

            if (!$page) {
                // Page doesn't exist.
                $broken = true;
            } elseif ($anchor !== null) {
                // Broken if the target page's content has no matching name/id attribute.
                $anchorPattern = '/(name|id)="' . preg_quote($anchor, '/') . '"/';
                $broken = !preg_match($anchorPattern, $page->Content);
            } else {
                $broken = false;
            }

            $results[] = array(
                'Type' => 'sitetree',
                'Target' => $matches[1],
                'Anchor' => $anchor,
                'DOMReference' => $link,
                'Broken' => $broken
            );
            continue;
        }

        // Link to a file on this site.
        $matches = array();
        if (preg_match('/\\[file_link(?:\\s*|%20|,)?id=([0-9]+)\\]/i', $href, $matches)) {
            $results[] = array(
                'Type' => 'file',
                'Target' => $matches[1],
                'Anchor' => null,
                'DOMReference' => $link,
                'Broken' => !DataObject::get_by_id('File', $matches[1])
            );
            continue;
        }

        // Local anchor.
        $matches = array();
        if (preg_match('/^#(.*)/i', $href, $matches)) {
            // Broken if the content being processed has no matching name/id attribute.
            $anchorPattern = '#(name|id)="' . preg_quote($matches[1], '#') . '"#';
            $results[] = array(
                'Type' => 'localanchor',
                'Target' => null,
                'Anchor' => $matches[1],
                'DOMReference' => $link,
                'Broken' => !preg_match($anchorPattern, $htmlValue->getContent())
            );
            continue;
        }
    }

    return $results;
}
/**
 * Saves this field's HTML into the given record, collecting link and file
 * tracking data and flagging broken links/files on the record as it goes.
 *
 * Steps, in order:
 *  - anchors: [sitetree_link id=N] hrefs are recorded as linked pages and
 *    have any "ss-broken" class removed; hrefs under ASSETS_DIR are
 *    resolved to files; other root-relative hrefs mark the record as
 *    having a broken link.
 *  - images: a trailing "?r=N" is stripped from src, the file is looked
 *    up, the image is resampled when width/height differ from the file's,
 *    and empty alt/title attributes are added when missing.
 *  - tracking: when the record has an ID and the LinkTracking /
 *    ImageTracking relations, the existing rows for this field are deleted
 *    and the collected IDs are written.
 *  - finally the (possibly modified) markup is assigned to the record
 *    property named after this field.
 *
 * @param object $record Record to save into; its field named after this
 *                       form field must have the 'xml' escape type.
 * @throws Exception When the target field's escape type is not 'xml'.
 */
public function saveInto($record)
{
    if ($record->escapeTypeForField($this->name) != 'xml') {
        throw new Exception('HtmlEditorField->saveInto(): This field should save into a HTMLText or HTMLVarchar field.');
    }

    $linkedPages = array();
    $linkedFiles = array();
    $htmlValue = new SS_HTMLValue($this->value);

    // Populate link tracking for internal links & links to asset files.
    if ($links = $htmlValue->getElementsByTagName('a')) {
        foreach ($links as $link) {
            $href = Director::makeRelative($link->getAttribute('href'));
            if (!$href) {
                continue;
            }

            if (preg_match('/\\[sitetree_link id=([0-9]+)\\]/i', $href, $matches)) {
                $ID = $matches[1];

                // Clear out any broken link classes. The class attribute is
                // handled as a list of whitespace-separated tokens so that
                // removing "ss-broken" can never fuse its neighbours
                // ("a ss-broken b" must become "a b", not "ab").
                if ($class = $link->getAttribute('class')) {
                    $classNames = preg_split('/\s+/', $class, -1, PREG_SPLIT_NO_EMPTY);
                    $link->setAttribute(
                        'class',
                        implode(' ', array_diff($classNames, array('ss-broken')))
                    );
                }

                $linkedPages[] = $ID;
                if (!DataObject::get_by_id('SiteTree', $ID)) {
                    $record->HasBrokenLink = true;
                }
            } elseif (substr($href, 0, strlen(ASSETS_DIR) + 1) == ASSETS_DIR . '/') {
                $candidateFile = File::find(Convert::raw2sql(urldecode($href)));
                if ($candidateFile) {
                    $linkedFiles[] = $candidateFile->ID;
                } else {
                    $record->HasBrokenFile = true;
                }
            } elseif ($href[0] == '/') {
                // $href is known to be non-empty here, so only the
                // root-relative case needs checking.
                $record->HasBrokenLink = true;
            }
        }
    }

    // Resample images, add default attributes and add to assets tracking.
    if ($images = $htmlValue->getElementsByTagName('img')) {
        foreach ($images as $img) {
            // strip any ?r=n data from the src attribute
            $img->setAttribute('src', preg_replace('/([^\\?]*)\\?r=[0-9]+$/i', '$1', $img->getAttribute('src')));

            $path = urldecode(Director::makeRelative($img->getAttribute('src')));
            $image = File::find($path);
            if (!$image) {
                // Only a missing file under the assets directory counts as broken.
                if (substr($path, 0, strlen(ASSETS_DIR) + 1) == ASSETS_DIR . '/') {
                    $record->HasBrokenFile = true;
                }
                continue;
            }

            // Resample the images if the width & height have changed.
            $width = $img->getAttribute('width');
            $height = $img->getAttribute('height');
            if ($width && $height && ($width != $image->getWidth() || $height != $image->getHeight())) {
                // Make sure that the resized image actually returns an image:
                $resized = $image->ResizedImage($width, $height);
                if ($resized) {
                    $img->setAttribute('src', $resized->getRelativePath());
                }
            }

            // Add default empty title & alt attributes.
            if (!$img->getAttribute('alt')) {
                $img->setAttribute('alt', '');
            }
            if (!$img->getAttribute('title')) {
                $img->setAttribute('title', '');
            }

            // If the src attribute is not set, then we won't add this to the list:
            if ($img->getAttribute('src')) {
                // Add to the tracked files.
                $linkedFiles[] = $image->ID;
            }
        }
    }

    // Save file & link tracking data.
    if ($record->ID && $record->many_many('LinkTracking') && ($tracker = $record->LinkTracking())) {
        // The field name is escaped once and used for both the DELETE
        // filter and the INSERT values.
        $SQL_fieldName = Convert::raw2sql($this->name);
        $filter = sprintf('"FieldName" = \'%s\' AND "SiteTreeID" = %d', $SQL_fieldName, $record->ID);
        DB::query("DELETE FROM \"{$tracker->tableName}\" WHERE {$filter}");

        foreach ($linkedPages as $item) {
            DB::query("INSERT INTO \"SiteTree_LinkTracking\" (\"SiteTreeID\",\"ChildID\", \"FieldName\")\n\t\t\t\t\tVALUES ({$record->ID}, {$item}, '{$SQL_fieldName}')");
        }
    }

    if ($record->ID && $record->many_many('ImageTracking') && ($tracker = $record->ImageTracking())) {
        $filter = sprintf(
            '"FieldName" = \'%s\' AND "SiteTreeID" = %d',
            Convert::raw2sql($this->name),
            $record->ID
        );
        DB::query("DELETE FROM \"{$tracker->tableName}\" WHERE {$filter}");

        foreach ($linkedFiles as $item) {
            $tracker->add($item, array('FieldName' => $this->name));
        }
    }

    $record->{$this->name} = $htmlValue->getContent();
}
/**
 * Purifies the given markup with HTMLPurifier, then passes the result
 * through SS_HTMLValue and returns that object's content.
 *
 * @param string $content Markup to clean
 * @return mixed Whatever SS_HTMLValue::getContent() yields for the purified markup
 */
public function cleanHTML($content)
{
    $purifier = new HTMLPurifier();
    $purified = $purifier->purify($content);

    $htmlValue = new SS_HTMLValue($purified);

    return $htmlValue->getContent();
}
/**
 * Attempt to clean invalid HTML, which messes up diffs.
 *
 * The cleaner is chosen in this order: the instance passed in, a new
 * instance of self::$html_cleaner_class when that class exists, otherwise
 * whatever HTMLCleaner::inst() provides. If none of these yields a cleaner,
 * the content is only round-tripped through SS_HTMLValue.
 *
 * Self-closing <ins /> and <del /> tags are removed from the result
 * because browsers hate them.
 *
 * @param string $content HTML content
 * @param object $cleaner Optional instance of a HTMLCleaner class to
 *                        use, overriding self::$html_cleaner_class
 * @return string|null Result of the final preg_replace() over the cleaned content
 */
public static function cleanHTML($content, $cleaner = null)
{
    if (!$cleaner) {
        // Prefer the configured cleaner class; otherwise let HTMLCleaner
        // pick one if a dependent class is available.
        $cleaner = class_exists(self::$html_cleaner_class)
            ? new self::$html_cleaner_class()
            : HTMLCleaner::inst();
    }

    if ($cleaner) {
        $cleaned = $cleaner->cleanHTML($content);
    } else {
        // Most basic level of cleaning: round-trip through SS_HTMLValue.
        $fallback = new SS_HTMLValue($content);
        $cleaned = $fallback->getContent();
    }

    // Remove empty <ins /> and <del /> tags because browsers hate them
    return preg_replace('/<(ins|del)[^>]*\\/>/', '', $cleaned);
}
/**
 * Content containing both LF and CR-LF line endings must come back from
 * SS_HTMLValue with every line ending as a plain LF.
 */
public function testMixedNewlines()
{
    $input = "<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>";
    $expected = "<p>paragraph</p>\n<ul><li>1</li>\n</ul>";

    $htmlValue = new SS_HTMLValue();
    $htmlValue->setContent($input);

    $this->assertEquals($expected, $htmlValue->getContent(), 'Newlines get converted');
}
/**
 * Cleans the given markup into XHTML for later use in DOMDocument.
 *
 * Three strategies are tried in order:
 *  1. the PHP tidy extension, when loaded;
 *  2. the command-line `tidy` binary, when `tidy --version` exits with 0;
 *  3. a plain round-trip through SS_HTMLValue.
 * Output from the first two goes through rewriteShortcodes() before being
 * returned.
 *
 * @param string $content  Markup to clean
 * @param string $encoding Encoding name used for both tidy input and output;
 *                         hyphens are stripped before it is given to the CLI
 * @return string
 */
protected function tidy($content, $encoding = 'UTF-8')
{
    // Try to use the extension first
    if (extension_loaded('tidy')) {
        $options = array(
            'clean' => true,
            'output-xhtml' => true,
            'show-body-only' => false,
            'wrap' => 0,
            'input-encoding' => $encoding,
            'output-encoding' => $encoding,
            'doctype' => 'omit',
            'anchor-as-name' => false
        );

        $document = tidy_parse_string($content, $options);
        $document->cleanRepair();

        return $this->rewriteShortcodes((string) $document);
    }

    // No PHP extension available, attempt to use CLI tidy.
    $exitCode = null;
    $probeOutput = null;
    @exec('tidy --version', $probeOutput, $exitCode);

    if ($exitCode !== 0) {
        // Fall back to default
        $fallback = new SS_HTMLValue($content);

        return $fallback->getContent();
    }

    $input = escapeshellarg($content);
    $encoding = escapeshellarg(str_replace('-', '', $encoding));

    // Doesn't work on Windows, sorry, stick to the extension.
    $cleaned = @`echo {$input} | tidy -q --show-body-only no --tidy-mark no --doctype omit --input-encoding {$encoding} --output-encoding {$encoding} --wrap 0 --anchor-as-name no --clean yes --output-xhtml yes`;

    return $this->rewriteShortcodes($cleaned);
}