/**
 * Runs the given HTML fragment through the XML parser and returns the
 * output collected in $this->_ret by the registered handlers.
 *
 * Entities are masked before parsing ("&name;" becomes "+name;") and
 * unmasked afterwards. Note that the unmasking pattern also matches any
 * literal "+xx;" sequence (2-5 chars of [a-z0-9#]) that the handlers
 * emitted, so such text comes back as "&xx;".
 *
 * A parse failure is reported through Kwf_Exception::logOrThrow(); when
 * that call returns, whatever output was collected so far is still returned.
 *
 * @param string $html HTML fragment (without a surrounding root element)
 * @return string the content of $this->_ret with entities restored
 */
public function parse($html)
{
    // replace entities
    $html = preg_replace('/&([a-z0-9#]{2,5});/i', '+$1;', $html);

    // before sending to xml parser make sure we have valid xml by tidying it up
    $html = Kwf_Util_Tidy::repairHtml($html);

    $this->_stack = array();
    $this->_ret = '';

    $this->_parser = xml_parser_create();
    xml_set_object($this->_parser, $this);
    xml_set_element_handler($this->_parser, 'startElement', 'endElement');
    xml_set_character_data_handler($this->_parser, 'characterData');
    xml_set_default_handler($this->_parser, 'characterData');

    $parsedOk = xml_parse($this->_parser, '<body>' . $html . '</body>', true);

    // Collect the error details while the parser is still available.
    $errorMessage = null;
    if (!$parsedOk) {
        // When an unclosed <br> is passed in, the parser complains here but
        // carries on normally. If this happens too often, consider dropping
        // the exception and ignoring it, or come up with something else :-)
        $errorCode = xml_get_error_code($this->_parser);
        $errorMessage = "HtmlExport UrlParser XML Error {$errorCode}: "
            . xml_error_string($errorCode)
            . " in line " . xml_get_current_line_number($this->_parser)
            . " parsed html: " . $html;
    }

    // The parser references $this (xml_set_object) and $this references the
    // parser; drop our reference so the pair can be released instead of
    // keeping each other alive.
    $this->_parser = null;

    if ($errorMessage !== null) {
        $ex = new Kwf_Exception($errorMessage);
        $ex->logOrThrow();
    }

    // re-replace entities
    $this->_ret = preg_replace('/\\+([a-z0-9#]{2,5});/i', '&$1;', $this->_ret);

    return $this->_ret;
}
/**
 * Normalises editor HTML and rewrites images, links and downloads that
 * getContentParts() reports as invalid so that they point to child
 * components of this row.
 *
 * Steps, in order:
 *  - byte-level clean-up (decomposed umlauts, zero width spaces, BOMs),
 *  - if the 'enableTidy' setting is on: tidy repair, a run through the
 *    text parser, and a second tidy repair,
 *  - for every part returned by getContentParts(): plain strings are
 *    copied, 'invalidImage' / 'invalidLink' / 'invalidDownload' parts are
 *    copied or imported into child component rows and replaced by markup
 *    referencing them, any other part contributes its 'html' entry.
 *    Parts that cannot be converted are dropped from the output.
 *
 * @param string                     $html   raw HTML to clean up
 * @param Kwc_Basic_Text_Parser|null $parser parser to use; when null and
 *                                           tidy is enabled a default one
 *                                           is created
 * @return string the rewritten HTML
 */
public function tidy($html, Kwc_Basic_Text_Parser $parser = null)
{
    //convert umlauts from NFD to NFC
    $diaeresis = chr(0xcc) . chr(0x88);
    $html = str_replace(
        array(
            'u' . $diaeresis, 'a' . $diaeresis, 'o' . $diaeresis,
            'U' . $diaeresis, 'A' . $diaeresis, 'O' . $diaeresis,
        ),
        array('ü', 'ä', 'ö', 'Ü', 'Ä', 'Ö'),
        $html
    );

    //delete zero width space, causes problems in Lotus Notes
    $html = str_replace(chr(0xe2) . chr(0x80) . chr(0x8b), '', $html);

    //delete BOM that might have sneaked into the text (at any position)
    $html = str_replace(chr(0xef) . chr(0xbb) . chr(0xbf), '', $html);

    $enableTidy = Kwc_Abstract::getSetting($this->_componentClass, 'enableTidy');
    $enableFontSize = Kwc_Abstract::getSetting($this->_componentClass, 'enableFontSize');

    $config = array();
    if (!$enableFontSize) {
        $config['drop-font-tags'] = true;
    }

    if ($enableTidy) {
        // Workaround for a tidy bug where it writes two class attributes
        // into one tag if one of them is empty,
        // see Kwc_Basic_Text_ModelContentTest::testTidyRemovesSomeText.
        // Simply delete the empty classes.
        $html = preg_replace('#<(.[a-z]+) ([^>]*)class=""([^>]*)>#', '<\\1 \\2 \\3>', $html);

        // delete HTML comments, this also removes the Word junk
        $html = preg_replace('#<!--.*?-->#s', '', $html);

        $html = str_replace('data-mce-type="bookmark"', 'class="_mce_type-bookmark"', $html);

        // Mask non-breaking spaces (U+00A0, written as an escape so the
        // invisible character is explicit) while tidy and the parser run;
        // the tidy settings above do not work properly for them.
        $html = str_replace("\xc2\xa0", '#nbsp#', $html);
        $html = Kwf_Util_Tidy::repairHtml($html, $config);

        if (!$parser) {
            $parser = new Kwc_Basic_Text_Parser($this->componentId, $this->getModel());
            $parser->setMasterStyles(Kwc_Basic_Text_StylesModel::getMasterStyles());
        }
        $parser->setEnableColor(Kwc_Abstract::getSetting($this->_componentClass, 'enableColors'));
        $parser->setEnableTagsWhitelist(Kwc_Abstract::getSetting($this->_componentClass, 'enableTagsWhitelist'));
        $parser->setEnableStyles(Kwc_Abstract::getSetting($this->_componentClass, 'enableStyles'));
        $html = $parser->parse($html);

        $html = Kwf_Util_Tidy::repairHtml($html, $config);
        $html = str_replace('class="_mce_type-bookmark"', 'data-mce-type="bookmark"', $html);
        $html = str_replace('#nbsp#', "\xc2\xa0", $html);
    }

    $classes = $this->_classes;
    $newContent = '';

    foreach ($this->getContentParts($html) as $part) {
        if (is_string($part)) {
            $newContent .= $part;
            continue;
        }

        if ($part['type'] == 'invalidImage') {
            // First choice: the image belongs to another image component,
            // so its row can be copied directly.
            if (isset($part['componentId'])
                && class_exists($part['componentClass'])
                && (strtolower($part['componentClass']) == 'kwc_basic_image_component'
                    || is_subclass_of($part['componentClass'], 'Kwc_Basic_Image_Component'))
            ) {
                $srcRow = Kwc_Abstract::createModel($part['componentClass'])->getRow($part['componentId']);
                if ($srcRow->imageExists()) {
                    $destRow = Kwc_Abstract::createModel($classes['image'])->createRow($srcRow->toArray());
                    $childComponentRow = $this->addChildComponentRow('image', $destRow);
                    $destRow->save();
                    $imageComponent = Kwf_Component_Data_Root::getInstance()
                        ->getComponentByDbId($this->component_id . '-i' . $childComponentRow->nr)
                        ->getComponent();
                    $dimension = $imageComponent->getImageDimensions();
                    $newContent .= "<img src=\"" . $imageComponent->getImageUrl() . "\" "
                        . "width=\"{$dimension['width']}\" "
                        . "height=\"{$dimension['height']}\" />";
                    continue;
                }
            }

            // Otherwise download the image and store it as a new upload.
            $client = new Zend_Http_Client();
            try {
                $client->setUri($part['src']);
            } catch (Zend_Uri_Exception $e) {
                // if it is a relative url, try again with http_host in front
                if (isset($_SERVER['HTTP_HOST'])) {
                    $client->setUri('http://' . $_SERVER['HTTP_HOST'] . '/' . $part['src']);
                }
            }
            try {
                $response = $client->request();
            } catch (Exception $e) {
                continue;
            }
            if (!$response->isSuccessful()) {
                continue;
            }

            $contentType = $response->getHeader('Content-type');
            if ($contentType == 'image/jpg' || $contentType == 'image/jpeg') {
                $extension = 'jpg';
            } elseif ($contentType == 'image/gif') {
                $extension = 'gif';
            } elseif ($contentType == 'image/png') {
                $extension = 'png';
            } else {
                continue;
            }

            $destFileRow = Kwc_Abstract::createModel($classes['image'])->getReferencedModel('Image')->createRow();

            // Reset per part: without this a name derived from a previous
            // image would be reused when this URL has no "name.ext" tail.
            $srcFileName = null;
            $path = explode('?', $part['src']);
            if (preg_match('#([^/]*)\\.[a-z]+$#U', $path[0], $m)) {
                $srcFileName = Zend_Filter::filterStatic($m[1], 'Alnum', array(ENT_QUOTES));
            }
            if (!$srcFileName) {
                $srcFileName = 'download';
            }
            $destFileRow->writeFile($response->getBody(), $srcFileName, $extension, $contentType);

            $destRow = Kwc_Abstract::createModel($classes['image'])->createRow();
            $destRow->kwf_upload_id = $destFileRow->id;
            $size = getimagesize($destFileRow->getFileSource());
            $destRow->width = $size[0];
            $destRow->height = $size[1];
            $destRow->filename = $srcFileName;
            $destRow->cover = true;
            $childComponentRow = $this->addChildComponentRow('image', $destRow);
            $destRow->save();

            $imageComponent = Kwf_Component_Data_Root::getInstance()
                ->getComponentByDbId($this->component_id . '-i' . $childComponentRow->nr)
                ->getComponent();
            $dimension = $imageComponent->getImageDimensions();
            $newContent .= "<img src=\"" . $imageComponent->getImageUrl() . "\" "
                . "width=\"{$dimension['width']}\" "
                . "height=\"{$dimension['height']}\" />";
        } elseif ($part['type'] == 'invalidLink') {
            $model = Kwc_Abstract::createModel($classes['link']);
            $destRow = $this->_getChildComponentRow('link', $model);

            // If the link came from another link component, try to copy
            // that component's data.
            if (isset($part['componentId'])) {
                try {
                    $srcRow = $model->getRow($part['componentId']);
                } catch (Kwf_Exception $e) {
                    $srcRow = false;
                }

                if (is_instance_of($classes['link'], 'Kwc_Basic_LinkTag_Component')) {
                    $linkClasses = Kwc_Abstract::getChildComponentClasses($classes['link'], 'child');
                    if ($srcRow && class_exists($linkClasses[$srcRow->component])) {
                        $linkModel = Kwc_Abstract::createModel($linkClasses[$srcRow->component]);
                        $srcLinkRow = $linkModel->getRow($part['componentId'] . '-child');
                        if ($srcLinkRow) {
                            $destRow->component = $srcRow->component;
                            $destRow->save();

                            $destLinkRow = $linkModel->getRow($destRow->component_id . '-child');
                            if (!$destLinkRow) {
                                $destLinkRow = $linkModel->createRow();
                                $destLinkRow->component_id = $destRow->component_id . '-child';
                            }
                            foreach ($srcLinkRow->toArray() as $k => $i) {
                                if ($k != 'component_id') {
                                    $destLinkRow->{$k} = $i;
                                }
                            }
                            $destLinkRow->save();

                            $newContent .= "<a href=\"{$destRow->component_id}\">";
                            continue;
                        }
                    }
                } elseif (is_instance_of($classes['link'], 'Kwc_Basic_LinkTag_Abstract_Component')) {
                    if ($srcRow) {
                        foreach ($srcRow->toArray() as $k => $i) {
                            if ($k != 'component_id') {
                                $destRow->{$k} = $i;
                            }
                        }
                        $destRow->save();

                        $newContent .= "<a href=\"{$destRow->component_id}\">";
                        continue;
                    }
                } else {
                    // no link possible
                    continue;
                }
            }

            // No copy was possible: build the link from the href.
            if (!$destRow) {
                $destRow = $model->createRow();
                $this->addChildComponentRow('link', $destRow);
            }

            if (is_instance_of($classes['link'], 'Kwc_Basic_LinkTag_Component')) {
                $linkClasses = Kwc_Abstract::getChildComponentClasses($classes['link'], 'child');
                $destRow->component = null;

                if (preg_match('#^mailto:#', $part['href'], $m)) {
                    if (isset($linkClasses['mail']) && $linkClasses['mail']) {
                        $destRow->component = 'mail';
                    }
                } else {
                    if (isset($linkClasses['intern'])
                        && $linkClasses['intern']
                        && Kwf_Config::getValue('server.redirectToDomain')
                    ) {
                        $url = $part['href'];
                        $parsedUrl = parse_url($url);
                        if (!isset($parsedUrl['host'])) {
                            if (isset($_SERVER['HTTP_HOST'])) {
                                $url = 'http://' . $_SERVER['HTTP_HOST'] . $url;
                            } else {
                                $url = 'http://' . Kwf_Registry::get('config')->server->domain . $url;
                            }
                        }
                        $internLinkPage = Kwf_Component_Data_Root::getInstance()->getPageByUrl($url, null);
                        if ($internLinkPage) {
                            $destRow->component = 'intern';
                        }
                    }
                    if (!$destRow->component && isset($linkClasses['extern']) && $linkClasses['extern']) {
                        $destRow->component = 'extern';
                    }
                }

                if (!$destRow->component) {
                    // no link of that kind possible
                    continue;
                }
                $destRow->save();

                $destClasses = Kwc_Abstract::getChildComponentClasses($classes['link'], 'child');
                $row = Kwc_Abstract::createModel($destClasses[$destRow->component])
                    ->getRow($destRow->component_id . '-child');
                if (!$row) {
                    $row = Kwc_Abstract::createModel($destClasses[$destRow->component])->createRow();
                }
                $row->component_id = $destRow->component_id . '-child';

                if ($destRow->component == 'extern') {
                    $row->target = $part['href'];
                } elseif ($destRow->component == 'intern') {
                    $row->target = $internLinkPage->dbId;
                } else {
                    // mailto:address?subject=...&body=...
                    // The address group stops at the first "?" so the query
                    // string ends up in the second group instead of in the
                    // address.
                    preg_match('#^mailto:([^?]*)\\??(.*)#', $part['href'], $m);
                    $row->mail = $m[1];
                    // parse_str() returns nothing; the values have to be
                    // collected through its second argument.
                    $mailParams = array();
                    parse_str($m[2], $mailParams);
                    $row->subject = isset($mailParams['subject']) ? $mailParams['subject'] : '';
                    $row->text = isset($mailParams['body']) ? $mailParams['body'] : '';
                }
                $row->save();
            } elseif (is_instance_of($classes['link'], 'Kwc_Basic_LinkTag_Extern_Component')) {
                $destRow->target = $part['href'];
                $destRow->save();
            } else {
                // no link possible
                continue;
            }

            $newContent .= "<a href=\"{$destRow->component_id}\">";
        } elseif ($part['type'] == 'invalidDownload') {
            $srcRow = Kwc_Abstract::createModel($classes['download'])->getRow($part['componentId']);
            if ($srcRow->fileExists()) {
                $destRow = Kwc_Abstract::createModel($classes['download'])->createRow($srcRow->toArray());
                $this->addChildComponentRow('download', $destRow);
                $destRow->save();
                $newContent .= "<a href=\"{$destRow->component_id}\">";
                continue;
            }
        } else {
            // Any other structured part is passed through unchanged.
            $newContent .= $part['html'];
        }
    }

    return $newContent;
}