/**
 * Returns whatever follows the "connection part" of a URL.
 *
 * The connection part is whatever getUrlConnectionPart() reports for the
 * same URL; its length is used as the offset at which the local part begins.
 *
 * @param string $url The URL to split.
 * @return string|false The remainder of $url after the connection part
 *                      (the raw result of substr()).
 */
function getUrlLocalPart($url)
{
    // Only the length of the connection part matters here.
    $localPartOffset = strlen(getUrlConnectionPart($url));

    return substr($url, $localPartOffset);
}
/**
 * Renders the HTML fragment for the current step of the web-page document
 * generation wizard, selected by $_REQUEST['step'].
 *
 * - generation disabled in config: returns the 'generation-disabled.htmlf' fragment.
 * - no step / unknown step: returns the 'generation-step1.htmlf' fragment.
 * - step 2: downloads $_REQUEST['url'], collects the links found in it and
 *   fills {{scrape-results}} / {{scrape-url}} in 'generation-step2.htmlf'.
 * - step 3: fills {{chosen-scrape-urls}} in 'generation-step3.htmlf' from
 *   $_REQUEST['pages'].
 * - step 4: fills {{page-order}}, {{hidden-form-chosen-pages}} and
 *   {{generator-pipelines}} in 'generation-step4.htmlf'.
 *
 * Every request-supplied or scraped value is HTML-escaped before it is placed
 * in markup, because it originates from the client or from a remote page.
 *
 * NOTE(review): webServiceError() is assumed to terminate the request; the
 * code after each call relies on that -- confirm.
 *
 * @return string The populated theme fragment.
 */
function showGenerationStep()
{
    $disallowDocumentGeneration = getGlobalConfigItem('doNotAllowDocumentGeneration');
    if ($disallowDocumentGeneration == 'true') {
        return $this->getThemeFragment('generation-disabled.htmlf');
    }
    if (!isset($_REQUEST['step'])) {
        return $this->getThemeFragment('generation-step1.htmlf');
    }

    switch ($_REQUEST['step']) {
        case '4':
            if (!isset($_REQUEST['pages'])) {
                webServiceError('&error-webpage-generation-no-pages;');
            }
            $template = $this->getThemeFragment('generation-step4.htmlf');
            $hiddenFormChosenPages = array();
            $listItems = array();
            foreach ((array) $_REQUEST['pages'] as $page) {
                if (!is_string($page)) {
                    // Nested arrays cannot be rendered as a page URL.
                    continue;
                }
                // Escape: the value comes straight from the request.
                $safePage = htmlspecialchars($page, ENT_QUOTES, 'UTF-8');
                $listItems[] = "\n\t\t\t\t" . '<li>' . $safePage . '</li>';
                $hiddenFormChosenPages[] = "\n\t\t\t\t" . '<input type="hidden" name="pages[]" value="' . $safePage . '"/>';
            }
            $template = str_replace('{{page-order}}', implode('', $listItems), $template);
            $template = str_replace('{{hidden-form-chosen-pages}}', implode('', $hiddenFormChosenPages), $template);

            $generatorPipelines = glob($this->docvertRootDirectory . 'generator-pipeline' . DIRECTORY_SEPARATOR . '*');
            if ($generatorPipelines === false) {
                // glob() returns false on error; treat that as "no pipelines".
                $generatorPipelines = array();
            }
            $generatorPipelinesArray = array();
            foreach ($generatorPipelines as $generatorPipeline) {
                $generatorName = htmlspecialchars(basename($generatorPipeline), ENT_QUOTES, 'UTF-8');
                $generatorPipelinesArray[] = '<option value="' . $generatorName . '">' . $generatorName . '</option>';
            }
            return str_replace('{{generator-pipelines}}', implode('', $generatorPipelinesArray), $template);

        case '3':
            // Same guard as step 4: without pages there is nothing to order.
            if (!isset($_REQUEST['pages'])) {
                webServiceError('&error-webpage-generation-no-pages;');
            }
            $template = $this->getThemeFragment('generation-step3.htmlf');
            $listItems = array();
            foreach ((array) $_REQUEST['pages'] as $page) {
                if (!is_string($page)) {
                    continue;
                }
                $safePage = htmlspecialchars($page, ENT_QUOTES, 'UTF-8');
                $listItems[] = "\n\t\t\t\t" . '<option value="' . $safePage . '">' . $safePage . '</option>';
            }
            return str_replace('{{chosen-scrape-urls}}', implode('', $listItems), $template);

        case '2':
            if (!isset($_REQUEST['url']) || !is_string($_REQUEST['url'])) {
                webServiceError('&error-webpage-generation-url;');
            }
            // Strip embedded whitespace BEFORE looking at the scheme, so that
            // " http://example.org" is not turned into "http://http://example.org".
            $originalUrl = str_replace(array("\n", "\r", "\t", " "), '', $_REQUEST['url']);
            if (trim($originalUrl) == '') {
                webServiceError('&error-webpage-generation-no-url-given;');
            }
            // Require a real http(s) scheme. A bare 'http' prefix test would
            // leave e.g. "httpfoo.com" or "http/../file" without a scheme,
            // and file_get_contents() would then treat it as a local path.
            if (!stringStartsWith($originalUrl, 'http://') && !stringStartsWith($originalUrl, 'https://')) {
                $originalUrl = 'http://' . $originalUrl;
            }

            include_once dirname(__FILE__) . '/http.php';

            // Keep the URL as requested so a failure can report it
            // (followUrlRedirects() returns false on failure).
            $requestedUrl = $originalUrl;
            if (trim(getUrlLocalPart($requestedUrl)) == '') {
                $originalUrl = followUrlRedirects($requestedUrl . '/');
            } else {
                $originalUrl = followUrlRedirects($requestedUrl);
            }
            if ($originalUrl === false) {
                webServiceError('&error-webpage-cannot-get-url;', 500, array('url' => $requestedUrl));
            }

            $page = file_get_contents($originalUrl);
            if ($page === false) {
                webServiceError('&error-webpage-cannot-get-url;', 500, array('url' => $originalUrl));
            }

            // Honour <base href>, whether the value is double-quoted,
            // single-quoted or unquoted, and regardless of other attributes
            // or a self-closing slash.
            $baseTagPattern = '/<base\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s>"\']+))/is';
            $baseMatches = array();
            if (preg_match($baseTagPattern, $page, $baseMatches)) {
                $baseHref = '';
                foreach (array_slice($baseMatches, 1) as $candidate) {
                    if ($candidate !== '') {
                        $baseHref = $candidate;
                    }
                }
                $baseHref = trim(html_entity_decode($baseHref, ENT_QUOTES, 'UTF-8'));
                if ($baseHref !== '') {
                    $originalUrl = $baseHref;
                }
            }

            $connectionPart = getUrlConnectionPart($originalUrl);
            $localPartDirectory = getUrlLocalPartDirectory($originalUrl);

            // File extensions that are never offered as pages to convert.
            $skippedExtensions = array(
                'avi', 'mov', 'mpg', 'css', 'jpeg', 'jpg', 'gif', 'png', 'bmp', 'apng',
                'tiff', 'ico', 'js', 'gz', 'tar', 'zip', 'bin', 'sit', 'mp3', 'mp4',
                'wav', 'swf', 'fla', 'rss', 'atom', 'pdf', 'xls', 'doc', 'txt', 'pps',
            );

            $hrefMatches = array();
            preg_match_all('/href="(.*?)"/', $page, $hrefMatches);

            // Used as a set: keys are the absolute URLs, values are irrelevant.
            $urls = array();
            $urls[$originalUrl] = true;
            foreach ($hrefMatches[1] as $match) {
                // Attribute values are HTML-encoded in the page source; decode
                // them here because they are re-encoded when rendered below.
                $link = html_entity_decode($match, ENT_QUOTES, 'UTF-8');
                if (stringStartsWith($link, '/')) {
                    $link = $connectionPart . $link;
                } elseif (stringStartsWith($link, "http://") || stringStartsWith($link, "https://") || stringStartsWith($link, "mailto:")) {
                    // Already absolute (mailto: links are dropped by the 'http' check below).
                } else {
                    $link = $connectionPart . resolveRelativeUrl($localPartDirectory . $link);
                }
                if (containsString($link, '#')) {
                    $link = substringBefore($link, '#');
                }
                if (stringEndsWith($link, '?')) {
                    $link = substringBefore($link, '?');
                }
                if (!stringStartsWith($link, 'http')) {
                    continue;
                }
                $fileExtension = strtolower((string) substr($link, strrpos($link, '.') + 1));
                if (!in_array($fileExtension, $skippedExtensions, true)) {
                    $urls[$link] = true;
                }
            }

            // Rank the candidates: same directory and depth as the scraped
            // page first, then same host, then everything else.
            $mostLikelyUrls = array();
            $possibleUrls = array();
            $unlikelyUrls = array();
            $numberOfSlashesInOriginalUrl = substr_count($originalUrl, '/');
            foreach (array_keys($urls) as $candidateUrl) {
                $resolvedUrl = followUrlRedirects($candidateUrl);
                if (trim((string) $resolvedUrl) == '') {
                    continue;
                }
                if (stringStartsWith($resolvedUrl, $connectionPart . $localPartDirectory) && substr_count($resolvedUrl, '/') == $numberOfSlashesInOriginalUrl) {
                    $mostLikelyUrls[] = $resolvedUrl;
                } elseif (stringStartsWith($resolvedUrl, $connectionPart)) {
                    $possibleUrls[] = $resolvedUrl;
                } else {
                    $unlikelyUrls[] = $resolvedUrl;
                }
            }
            sort($unlikelyUrls);

            // Only the most likely URLs are pre-checked.
            $urlGroups = array(
                array($mostLikelyUrls, ' checked="checked"'),
                array($possibleUrls, ''),
                array($unlikelyUrls, ''),
            );
            $links = array();
            $itemId = 0;
            foreach ($urlGroups as $urlGroup) {
                list($groupUrls, $checkedAttribute) = $urlGroup;
                foreach ($groupUrls as $groupUrl) {
                    $safeUrl = htmlspecialchars($groupUrl, ENT_QUOTES, 'UTF-8');
                    $links[] = '<li class="orderingItem"><label for="urlId' . $itemId . '">'
                        . '<input type="checkbox" name="pages[]" value="' . $safeUrl . '" id="urlId' . $itemId . '"' . $checkedAttribute . '/>'
                        . '<span class="title">' . $safeUrl . '</span></label></li>' . "\n";
                    $itemId++;
                }
            }

            $step2Template = $this->getThemeFragment('generation-step2.htmlf');
            $step2Template = str_replace('{{scrape-results}}', implode('', $links), $step2Template);
            // Report the page that was scraped, not whichever URL a loop touched last.
            $step2Template = str_replace('{{scrape-url}}', htmlspecialchars($originalUrl, ENT_QUOTES, 'UTF-8'), $step2Template);
            return $step2Template;

        default:
            return $this->getThemeFragment('generation-step1.htmlf');
    }
}
/**
 * Downloads the images referenced by the document and rewrites image and
 * link URLs inside the XML.
 *
 * The XSL transform 'extract-pages-html-images-and-links.xsl' is expected to
 * emit one line per reference with three tab-separated fields: type
 * ('image' or 'link'), base URL, and the possibly relative URL.
 *
 * - 'image': the image is fetched (falling back to files/404image.gif when it
 *   cannot be downloaded or decoded), stored under <contentDirectory>/Pictures
 *   as md5(url).<ext>, and the quoted URL in the XML is replaced by the stored
 *   path plus c:width / c:height attributes.
 * - 'link': the quoted URL in the XML is replaced by its absolute form.
 *
 * NOTE(review): webServiceError() is assumed to terminate the request -- confirm.
 *
 * @param string $currentXml The document XML to process.
 * @return string The XML with image and link URLs rewritten.
 */
function process($currentXml)
{
    $extractImagesPath = $this->docvertTransformDirectory . 'extract-pages-html-images-and-links.xsl';
    $htmlUrls = trim(xsltTransform($currentXml, $extractImagesPath));
    $htmlUrlLines = explode("\n", $htmlUrls);

    // Loop-invariant: location of the image used when a download fails.
    $missingImagePlaceholderImagePath = dirname(dirname(__FILE__)) . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR . '404image.gif';

    foreach ($htmlUrlLines as $htmlUrlLine) {
        if (trim($htmlUrlLine) == '') {
            continue;
        }
        $urlLineParts = explode("\t", $htmlUrlLine);
        if (count($urlLineParts) < 3) {
            // Malformed line: without all three fields there is nothing to rewrite.
            continue;
        }
        $urlType = $urlLineParts[0];
        $baseUrl = $urlLineParts[1];
        $possiblyRelativeUrl = $urlLineParts[2];
        if ($possiblyRelativeUrl === '') {
            // An empty URL would make the replacement below match '""' and
            // rewrite every empty attribute in the document.
            continue;
        }

        if (stringStartsWith($possiblyRelativeUrl, "http://") || stringStartsWith($possiblyRelativeUrl, "https://") || stringStartsWith($possiblyRelativeUrl, "mailto:")) {
            $fullUrl = $possiblyRelativeUrl;
        } else {
            $connectionPart = getUrlConnectionPart($baseUrl);
            if (stringStartsWith($possiblyRelativeUrl, '/')) {
                $fullUrl = $connectionPart . $possiblyRelativeUrl;
            } else {
                $localPartDirectory = getUrlLocalPartDirectory($baseUrl);
                $relativePath = resolveRelativeUrl($localPartDirectory . $possiblyRelativeUrl);
                if (!stringStartsWith($relativePath, '/')) {
                    $relativePath = '/' . $relativePath;
                }
                $fullUrl = $connectionPart . $relativePath;
            }
        }

        if (!file_exists($missingImagePlaceholderImagePath)) {
            webServiceError('&dynamic-error-process-downloadimagesandsetlinks-missing-placeholder;', 500, array('fourOhFourImagePath' => $missingImagePlaceholderImagePath));
        }
        $fullUrl = html_entity_decode($fullUrl);

        switch ($urlType) {
            case 'image':
                // Fail before downloading anything if GD is unavailable.
                if (!function_exists('imagecreatefromstring')) {
                    webServiceError('&error-process-downloadimagesandsetlinks-missing-gd;');
                }

                $imageData = file_get_contents($fullUrl);
                if ($imageData === false || $imageData === '') {
                    $imageData = file_get_contents($missingImagePlaceholderImagePath);
                }

                $picturesDirectory = $this->contentDirectory . DIRECTORY_SEPARATOR . 'Pictures';
                if (!file_exists($picturesDirectory)) {
                    mkdir($picturesDirectory);
                }

                $imageResource = imagecreatefromstring($imageData);
                if (!$imageResource) {
                    // The download is not a decodable image: store the
                    // placeholder bytes too, so the saved file matches the
                    // width/height written into the XML.
                    $imageData = file_get_contents($missingImagePlaceholderImagePath);
                    $imageResource = imagecreatefromstring($imageData);
                }
                $imageWidth = imagesx($imageResource);
                $imageHeight = imagesy($imageResource);

                // Keep a recognised extension from the URL, otherwise assume jpg.
                $fileExtension = substr($fullUrl, strrpos($fullUrl, '.') + 1);
                if (!in_array($fileExtension, array('jpg', 'jpeg', 'gif', 'png'), true)) {
                    $fileExtension = 'jpg';
                }

                $openDocumentPath = 'Pictures/' . md5($fullUrl) . '.' . $fileExtension;
                file_put_contents($this->contentDirectory . DIRECTORY_SEPARATOR . $openDocumentPath, $imageData);

                // The embedded quotes close the src attribute and open the
                // c:width / c:height attributes in its place.
                $imageUrlReplacement = $openDocumentPath . '" c:width="' . $imageWidth . '" c:height="' . $imageHeight;
                //FIXME: assumes image @src has double-quote and not single
                $currentXml = str_replace('"' . $possiblyRelativeUrl . '"', '"' . $imageUrlReplacement . '"', $currentXml);
                break;

            case 'link':
                $currentXml = str_replace('"' . $possiblyRelativeUrl . '"', '"' . htmlentities($fullUrl) . '"', $currentXml);
                break;
        }
    }

    return $currentXml;
}