/**
 * Feeds a set of malformed HTML fragments through SS_HTMLValue::setContent()
 * and checks that getContent() returns the expected string for each one.
 */
public function testInvalidHTMLSaving() {
    // Malformed input => content expected back from getContent().
    $fixtures = array(
        '<p>Enclosed Value</p></p>' => '<p>Enclosed Value</p>',
        '<p><div class="example"></div></p>' => '<p/><div class="example"/>',
        '<html><html><body><falsetag "attribute=""attribute""">' => '<falsetag/>',
        '<body<body<body>/bodu>/body>' => '/bodu>/body>'
    );

    // A single instance is reused across all fixtures.
    $htmlValue = new SS_HTMLValue();

    foreach (array_keys($fixtures) as $markup) {
        $htmlValue->setContent($markup);
        $this->assertEquals(
            $fixtures[$markup],
            $htmlValue->getContent(),
            'Invalid HTML can be saved'
        );
    }
}
/**
 * Sets content containing both the platform's native line ending and an
 * explicit CR-LF, and checks that getContent() returns LF for both.
 */
public function testMixedNewlines() {
    // Native EOL for the platform: Windows is \r\n (CR-LF), UNIX is \n (LF).
    $nativeEol = PHP_EOL;
    $lf = "\n";

    $input = "<p>paragraph</p>{$nativeEol}<ul><li>1</li>\r\n</ul>";
    $expected = "<p>paragraph</p>{$lf}<ul><li>1</li>{$lf}</ul>";

    $htmlValue = new SS_HTMLValue();
    $htmlValue->setContent($input);

    $this->assertEquals($expected, $htmlValue->getContent(), 'Newlines get converted');
}
/**
 * Finds the links that are of interest for the link tracking automation. Checks for brokenness and attaches
 * extracted metadata so consumers can decide what to do with the DOM element (provided as DOMReference).
 *
 * Anchors without an href attribute, and hrefs that match none of the recognised forms
 * (broken, sitetree shortcode, file shortcode, local anchor), are skipped and produce no entry.
 *
 * @param SS_HTMLValue $htmlValue Object to parse the links from.
 * @return array Associative array containing found links with the following field layout:
 *    Type: string, name of the link type ('broken', 'sitetree', 'file' or 'localanchor')
 *    Target: any, a reference to the target object, depends on the Type
 *    Anchor: string, anchor part of the link
 *    DOMReference: DOMElement, reference to the link to apply changes.
 *    Broken: boolean, a flag highlighting whether the link should be treated as broken.
 */
public function process(SS_HTMLValue $htmlValue) {
    $results = array();

    $links = $htmlValue->getElementsByTagName('a');
    if (!$links) {
        return $results;
    }

    foreach ($links as $link) {
        if (!$link->hasAttribute('href')) {
            continue;
        }

        $href = Director::makeRelative($link->getAttribute('href'));

        // Definitely broken links.
        if ($href == '' || $href[0] == '/') {
            $results[] = array(
                'Type' => 'broken',
                'Target' => null,
                'Anchor' => null,
                'DOMReference' => $link,
                'Broken' => true
            );
            continue;
        }

        // Link to a page on this site.
        $matches = array();
        if (preg_match('/\\[sitetree_link(?:\\s*|%20|,)?id=([0-9]+)\\](#(.*))?/i', $href, $matches)) {
            $anchor = empty($matches[3]) ? null : $matches[3];
            $page = DataObject::get_by_id('SiteTree', $matches[1]);

            if (!$page) {
                // Page doesn't exist.
                $broken = true;
            } elseif ($anchor !== null) {
                // The anchor comes from the href, so quote it before embedding it in a pattern;
                // otherwise a delimiter or metacharacter in it breaks or skews the match.
                $anchorPattern = '/(name|id)="' . preg_quote($anchor, '/') . '"/';
                // Broken when the target page's content has no matching name/id attribute.
                $broken = !preg_match($anchorPattern, $page->Content);
            } else {
                $broken = false;
            }

            $results[] = array(
                'Type' => 'sitetree',
                'Target' => $matches[1],
                'Anchor' => $anchor,
                'DOMReference' => $link,
                'Broken' => $broken
            );
            continue;
        }

        // Link to a file on this site.
        $matches = array();
        if (preg_match('/\\[file_link(?:\\s*|%20|,)?id=([0-9]+)\\]/i', $href, $matches)) {
            $results[] = array(
                'Type' => 'file',
                'Target' => $matches[1],
                'Anchor' => null,
                'DOMReference' => $link,
                'Broken' => !DataObject::get_by_id('File', $matches[1])
            );
            continue;
        }

        // Local anchor.
        $matches = array();
        if (preg_match('/^#(.*)/i', $href, $matches)) {
            // Same quoting concern as above; this pattern uses '#' as its delimiter.
            $anchorPattern = '#(name|id)="' . preg_quote($matches[1], '#') . '"#';
            $results[] = array(
                'Type' => 'localanchor',
                'Target' => null,
                'Anchor' => $matches[1],
                'DOMReference' => $link,
                'Broken' => !preg_match($anchorPattern, $htmlValue->getContent())
            );
            continue;
        }
    }

    return $results;
}
/**
 * Writes this field's HTML into the given record and refreshes its link and file tracking.
 *
 * Steps, in order:
 *  - scans <a> elements, collecting linked page IDs and asset file IDs and setting
 *    HasBrokenLink / HasBrokenFile on the record where a target cannot be resolved;
 *  - scans <img> elements, stripping "?r=n" suffixes, swapping in a resized image when the
 *    width/height attributes differ from the file's dimensions, and ensuring alt/title exist;
 *  - rewrites the LinkTracking and ImageTracking rows for this field, when the record has
 *    an ID and declares those relations;
 *  - assigns the resulting content to the record property named after this field.
 *
 * @param object $record Record to save into; escapeTypeForField() must return 'xml' for this field.
 * @throws Exception When the target field is not an 'xml'-escaped (HTMLText/HTMLVarchar) field.
 */
public function saveInto($record) {
    if ($record->escapeTypeForField($this->name) != 'xml') {
        throw new Exception('HtmlEditorField->saveInto(): This field should save into a HTMLText or HTMLVarchar field.');
    }

    $linkedPages = array();
    $linkedFiles = array();
    $assetsPrefix = ASSETS_DIR . '/';

    $htmlValue = new SS_HTMLValue($this->value);

    // Populate link tracking for internal links & links to asset files.
    if ($links = $htmlValue->getElementsByTagName('a')) {
        foreach ($links as $link) {
            $href = Director::makeRelative($link->getAttribute('href'));
            if (!$href) {
                continue;
            }

            if (preg_match('/\\[sitetree_link id=([0-9]+)\\]/i', $href, $matches)) {
                $ID = $matches[1];

                // Clear out any broken link classes. The replacement is '' rather than null:
                // a null replacement is deprecated from PHP 8.1 and yields the same result.
                if ($class = $link->getAttribute('class')) {
                    $link->setAttribute(
                        'class',
                        preg_replace('/(^ss-broken|ss-broken$| ss-broken )/', '', $class)
                    );
                }

                $linkedPages[] = $ID;
                if (!DataObject::get_by_id('SiteTree', $ID)) {
                    $record->HasBrokenLink = true;
                }
            } elseif (substr($href, 0, strlen($assetsPrefix)) == $assetsPrefix) {
                $candidateFile = File::find(Convert::raw2sql(urldecode($href)));
                if ($candidateFile) {
                    $linkedFiles[] = $candidateFile->ID;
                } else {
                    $record->HasBrokenFile = true;
                }
            } elseif ($href[0] == '/') {
                // $href is known to be non-empty here, so only the leading slash needs checking.
                $record->HasBrokenLink = true;
            }
        }
    }

    // Resample images, add default attributes and add to assets tracking.
    if ($images = $htmlValue->getElementsByTagName('img')) {
        foreach ($images as $img) {
            // Strip any ?r=n data from the src attribute.
            $img->setAttribute(
                'src',
                preg_replace('/([^\\?]*)\\?r=[0-9]+$/i', '$1', $img->getAttribute('src'))
            );

            $path = urldecode(Director::makeRelative($img->getAttribute('src')));
            $image = File::find($path);
            if (!$image) {
                // Only an unresolved path under the assets directory counts as a broken file.
                if (substr($path, 0, strlen($assetsPrefix)) == $assetsPrefix) {
                    $record->HasBrokenFile = true;
                }
                continue;
            }

            // Resample the image if the width & height have changed.
            $width = $img->getAttribute('width');
            $height = $img->getAttribute('height');
            if ($width && $height && ($width != $image->getWidth() || $height != $image->getHeight())) {
                // Make sure that the resized image actually returns an image.
                $resized = $image->ResizedImage($width, $height);
                if ($resized) {
                    $img->setAttribute('src', $resized->getRelativePath());
                }
            }

            // Add default empty title & alt attributes.
            if (!$img->getAttribute('alt')) {
                $img->setAttribute('alt', '');
            }
            if (!$img->getAttribute('title')) {
                $img->setAttribute('title', '');
            }

            // If the src attribute is not set, then we won't add this to the list.
            if ($img->getAttribute('src')) {
                // Add to the tracked files.
                $linkedFiles[] = $image->ID;
            }
        }
    }

    // Save file & link tracking data.
    if ($record->ID && $record->many_many('LinkTracking') && ($tracker = $record->LinkTracking())) {
        // Escape the field name once and use it in both the DELETE and the INSERTs.
        $SQL_fieldName = Convert::raw2sql($this->name);
        $filter = sprintf('"FieldName" = \'%s\' AND "SiteTreeID" = %d', $SQL_fieldName, $record->ID);
        DB::query("DELETE FROM \"{$tracker->tableName}\" WHERE {$filter}");

        // NOTE(review): the DELETE targets $tracker->tableName while the INSERT hard-codes
        // "SiteTree_LinkTracking" - confirm both always refer to the same table.
        foreach ($linkedPages as $item) {
            DB::query("INSERT INTO \"SiteTree_LinkTracking\" (\"SiteTreeID\",\"ChildID\", \"FieldName\")\n\t\t\t\t\tVALUES ({$record->ID}, {$item}, '{$SQL_fieldName}')");
        }
    }

    if ($record->ID && $record->many_many('ImageTracking') && ($tracker = $record->ImageTracking())) {
        $filter = sprintf(
            '"FieldName" = \'%s\' AND "SiteTreeID" = %d',
            Convert::raw2sql($this->name),
            $record->ID
        );
        DB::query("DELETE FROM \"{$tracker->tableName}\" WHERE {$filter}");

        foreach ($linkedFiles as $item) {
            $tracker->add($item, array('FieldName' => $this->name));
        }
    }

    $record->{$this->name} = $htmlValue->getContent();
}
/**
 * Passes the content through HTMLPurifier::purify(), loads the result into an
 * SS_HTMLValue and returns that object's getContent() output.
 *
 * @param string $content HTML content
 * @return string
 */
public function cleanHTML($content) {
    $purifier = new HTMLPurifier();
    $purified = $purifier->purify($content);

    $htmlValue = new SS_HTMLValue($purified);

    return $htmlValue->getContent();
}
/**
 * Attempt to clean invalid HTML, which messes up diffs.
 *
 * Cleaner resolution when none is passed in: an instance of
 * self::$html_cleaner_class if that class exists, otherwise whatever
 * HTMLCleaner::inst() returns. If no cleaner is available at all, the
 * content is only round-tripped through SS_HTMLValue.
 *
 * Self-closing <ins /> and <del /> tags are stripped from the result
 * because browsers hate them.
 *
 * @param string $content HTML content
 * @param object $cleaner Optional instance of a HTMLCleaner class to
 *    use, overriding self::$html_cleaner_class
 * @return string The cleaned content
 */
public static function cleanHTML($content, $cleaner = null) {
    if (!$cleaner) {
        $cleanerClass = self::$html_cleaner_class;
        $cleaner = class_exists($cleanerClass)
            ? new $cleanerClass()
            : HTMLCleaner::inst(); // load cleaner if the dependent class is available
    }

    if ($cleaner) {
        $cleaned = $cleaner->cleanHTML($content);
    } else {
        // At most basic level of cleaning, use DOMDocument to save valid XML.
        $htmlValue = new SS_HTMLValue($content);
        $cleaned = $htmlValue->getContent();
    }

    // Remove empty <ins /> and <del /> tags.
    return preg_replace('/<(ins|del)[^>]*\\/>/', '', $cleaned);
}
/**
 * Sets content that mixes LF and CR-LF line endings and checks that
 * getContent() returns it with LF only.
 */
public function testMixedNewlines() {
    $mixed = "<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>";
    $normalised = "<p>paragraph</p>\n<ul><li>1</li>\n</ul>";

    $htmlValue = new SS_HTMLValue();
    $htmlValue->setContent($mixed);

    $this->assertEquals($normalised, $htmlValue->getContent(), 'Newlines get converted');
}
/**
 * Cleans and returns XHTML which is needed for use in DOMDocument.
 *
 * Tries, in order: the PHP tidy extension, a `tidy` binary on the command line,
 * and finally a plain round-trip through SS_HTMLValue. Output from either tidy
 * path is passed through rewriteShortcodes() before being returned; the
 * SS_HTMLValue fallback is returned as-is.
 *
 * @param string $content  Markup to clean.
 * @param string $encoding Character encoding of the content, e.g. 'UTF-8'. Hyphens are
 *                         removed before the name is handed to tidy.
 * @return string
 */
protected function tidy($content, $encoding = 'UTF-8') {
    // The tidy encoding names listed in the PHP documentation are written without a
    // hyphen ("utf8", "utf16", ...), so normalise once for both the extension and CLI.
    $tidyEncoding = str_replace('-', '', $encoding);

    // Try to use the extension first
    if (extension_loaded('tidy')) {
        $tidy = tidy_parse_string($content, array(
            'clean' => true,
            'output-xhtml' => true,
            'show-body-only' => false,
            'wrap' => 0,
            'input-encoding' => $tidyEncoding,
            'output-encoding' => $tidyEncoding,
            'doctype' => 'omit',
            'anchor-as-name' => false
        ));

        // tidy_parse_string() can return false; in that case fall through to the
        // other strategies instead of calling a method on a non-object.
        if ($tidy) {
            $tidy->cleanRepair();
            return $this->rewriteShortcodes('' . $tidy);
        }
    }

    // No usable PHP extension result, attempt to use CLI tidy.
    $retval = null;
    $output = null;
    @exec('tidy --version', $output, $retval);
    if ($retval === 0) {
        $input = escapeshellarg($content);
        $cliEncoding = escapeshellarg($tidyEncoding);
        // Doesn't work on Windows, sorry, stick to the extension.
        $tidy = @`echo {$input} | tidy -q --show-body-only no --tidy-mark no --doctype omit --input-encoding {$cliEncoding} --output-encoding {$cliEncoding} --wrap 0 --anchor-as-name no --clean yes --output-xhtml yes`;
        return $this->rewriteShortcodes($tidy);
    }

    // Fall back to default
    $doc = new SS_HTMLValue($content);
    return $doc->getContent();
}