Beispiel #1
0
/**
 * Returns the portion of a URL that follows its connection part.
 *
 * The length of whatever getUrlConnectionPart() reports for the URL is used
 * as the byte offset from which the remainder of $url is taken.
 * NOTE(review): the connection part is presumably scheme + host — confirm
 * against getUrlConnectionPart().
 *
 * @param string $url URL to split.
 * @return mixed The substr() result: the remainder of $url after the connection part.
 */
function getUrlLocalPart($url)
{
    $prefixLength = strlen(getUrlConnectionPart($url));

    return substr($url, $prefixLength);
}
Beispiel #2
0
 /**
  * Renders the theme fragment for the current step of the web page
  * generation wizard, selected by $_REQUEST['step'].
  *
  * - Generation disabled in config: returns 'generation-disabled.htmlf'.
  * - Step 2: fetches $_REQUEST['url'], collects the links found in the page
  *   and lists them as checkboxes, grouped by how closely they relate to the
  *   scraped URL.
  * - Step 3: lists the chosen $_REQUEST['pages'] as <option> elements.
  * - Step 4: lists the chosen pages again (visible list + hidden inputs) and
  *   offers the available generator pipelines.
  * - Anything else (or no step): returns the step 1 fragment.
  *
  * All request-supplied and scraped values are HTML-escaped before being
  * placed into the templates.
  *
  * NOTE(review): webServiceError() is assumed to terminate the request —
  * confirm; the code after each call relies on that.
  *
  * @return string The filled-in theme fragment.
  */
 function showGenerationStep()
 {
     $disallowDocumentGeneration = getGlobalConfigItem('doNotAllowDocumentGeneration');
     if ($disallowDocumentGeneration == 'true') {
         return $this->getThemeFragment('generation-disabled.htmlf');
     }
     // A missing step falls through to the default branch (step 1).
     $step = isset($_REQUEST['step']) ? $_REQUEST['step'] : null;
     switch ($step) {
         case '4':
             if (!isset($_REQUEST['pages']) || !is_array($_REQUEST['pages'])) {
                 webServiceError('&error-webpage-generation-no-pages;');
             }
             $template = $this->getThemeFragment('generation-step4.htmlf');
             $hiddenFormChosenPages = array();
             $listItems = array();
             foreach ($_REQUEST['pages'] as $page) {
                 if (!is_scalar($page)) {
                     // Nested arrays cannot be rendered as a page URL.
                     continue;
                 }
                 // Request data is untrusted: escape for both text and attribute context.
                 $escapedPage = htmlspecialchars($page, ENT_QUOTES, 'UTF-8');
                 $listItems[] = "\n\t\t\t\t" . '<li>' . $escapedPage . '</li>';
                 $hiddenFormChosenPages[] = "\n\t\t\t\t" . '<input type="hidden" name="pages[]" value="' . $escapedPage . '"/>';
             }
             $template = str_replace('{{page-order}}', implode('', $listItems), $template);
             $template = str_replace('{{hidden-form-chosen-pages}}', implode('', $hiddenFormChosenPages), $template);
             $generatorPipelines = glob($this->docvertRootDirectory . 'generator-pipeline' . DIRECTORY_SEPARATOR . '*');
             if ($generatorPipelines === false) {
                 // glob() returns false on error; treat that as "no pipelines".
                 $generatorPipelines = array();
             }
             $generatorPipelinesArray = array();
             foreach ($generatorPipelines as $generatorPipeline) {
                 $generatorName = htmlspecialchars(basename($generatorPipeline), ENT_QUOTES, 'UTF-8');
                 $generatorPipelinesArray[] = '<option value="' . $generatorName . '">' . $generatorName . '</option>';
             }
             return str_replace('{{generator-pipelines}}', implode('', $generatorPipelinesArray), $template);
         case '3':
             if (!isset($_REQUEST['pages']) || !is_array($_REQUEST['pages'])) {
                 webServiceError('&error-webpage-generation-no-pages;');
             }
             $template = $this->getThemeFragment('generation-step3.htmlf');
             $listItems = array();
             foreach ($_REQUEST['pages'] as $page) {
                 if (!is_scalar($page)) {
                     continue;
                 }
                 $escapedPage = htmlspecialchars($page, ENT_QUOTES, 'UTF-8');
                 $listItems[] = "\n\t\t\t\t" . '<option value="' . $escapedPage . '">' . $escapedPage . '</option>';
             }
             return str_replace('{{chosen-scrape-urls}}', implode('', $listItems), $template);
         case '2':
             if (!isset($_REQUEST['url'])) {
                 webServiceError('&error-webpage-generation-url;');
             }
             $originalUrl = $_REQUEST['url'];
             if (trim($originalUrl) == '') {
                 webServiceError('&error-webpage-generation-no-url-given;');
             }
             if (!stringStartsWith($originalUrl, 'http')) {
                 $originalUrl = 'http://' . $originalUrl;
             }
             $originalUrl = str_replace(array("\n", "\r", "\t", " "), '', $originalUrl);
             include_once dirname(__FILE__) . '/http.php';
             // Remember what was asked for so a failed redirect lookup can report it
             // (followUrlRedirects() overwrites $originalUrl with false on failure).
             $requestedUrl = $originalUrl;
             if (trim(getUrlLocalPart($originalUrl)) == '') {
                 $originalUrl = followUrlRedirects($originalUrl . '/');
             } else {
                 $originalUrl = followUrlRedirects($originalUrl);
             }
             if ($originalUrl === false) {
                 webServiceError('&error-webpage-cannot-get-url;', 500, array('url' => $requestedUrl));
             }
             // NOTE(review): this fetches a user-supplied URL server-side; consider
             // restricting the allowed hosts/schemes if this endpoint is public.
             $page = file_get_contents($originalUrl);
             if ($page === false) {
                 webServiceError('&error-webpage-cannot-get-url;', 500, array('url' => $originalUrl));
             }
             // A <base href> overrides the URL that relative links resolve against.
             // Capture only the href value (quoted or unquoted) so that extra
             // attributes or a self-closing slash do not leak into the URL.
             $baseTagPattern = '/<base\b[^>]*?\bhref\s*=\s*(["\']?)([^"\'\s>]+)\1/is';
             $baseMatches = array();
             if (preg_match($baseTagPattern, $page, $baseMatches)) {
                 $originalUrl = $baseMatches[2];
             }
             $connectionPart = getUrlConnectionPart($originalUrl);
             $localPartDirectory = getUrlLocalPartDirectory($originalUrl);
             $hrefMatches = array();
             preg_match_all('/href="(.*?)"/', $page, $hrefMatches);
             // File types that are never offered as convertible pages.
             $skippedExtensions = array(
                 'avi', 'mov', 'mpg', 'css', 'jpeg', 'jpg', 'gif', 'png', 'bmp', 'apng',
                 'tiff', 'ico', 'js', 'gz', 'tar', 'zip', 'bin', 'sit', 'mp3', 'mp4',
                 'wav', 'swf', 'fla', 'rss', 'atom', 'pdf', 'xls', 'doc', 'txt', 'pps',
             );
             // URLs are collected as array keys so duplicates collapse automatically.
             $urls = array();
             $urls[$originalUrl] = true;
             foreach ($hrefMatches[1] as $link) {
                 if (stringStartsWith($link, '/')) {
                     $link = $connectionPart . $link;
                 } elseif (!stringStartsWith($link, "http://") && !stringStartsWith($link, "https://") && !stringStartsWith($link, "mailto:")) {
                     // Anything that is not absolute or a mailto: is resolved
                     // relative to the directory of the scraped URL.
                     $link = $connectionPart . resolveRelativeUrl($localPartDirectory . $link);
                 }
                 if (containsString($link, '#')) {
                     $link = substringBefore($link, '#');
                 }
                 if (stringEndsWith($link, '?')) {
                     $link = substringBefore($link, '?');
                 }
                 if (!stringStartsWith($link, 'http')) {
                     // Drops mailto: and any other non-HTTP link.
                     continue;
                 }
                 $fileExtension = strtolower(substr($link, strrpos($link, '.') + 1));
                 if (!in_array($fileExtension, $skippedExtensions, true)) {
                     $urls[$link] = true;
                 }
             }
             $urls = array_keys($urls);
             $mostLikelyUrls = array();
             $possibleUrls = array();
             $unlikelyUrls = array();
             $numberOfSlashesInOriginalUrl = strlen($originalUrl) - strlen(str_replace('/', '', $originalUrl));
             foreach ($urls as $url) {
                 $url = followUrlRedirects($url);
                 if (trim($url) != '') {
                     $numberOfSlashesInUrl = strlen($url) - strlen(str_replace('/', '', $url));
                     if (stringStartsWith($url, $connectionPart . $localPartDirectory) && $numberOfSlashesInUrl == $numberOfSlashesInOriginalUrl) {
                         // Same directory and same depth as the scraped URL.
                         $mostLikelyUrls[] = $url;
                     } elseif (stringStartsWith($url, $connectionPart)) {
                         $possibleUrls[] = $url;
                     } else {
                         $unlikelyUrls[] = $url;
                     }
                 }
             }
             asort($unlikelyUrls);
             // Only the most likely group is pre-checked.
             $urlGroups = array(
                 array($mostLikelyUrls, ' checked="checked"'),
                 array($possibleUrls, ''),
                 array($unlikelyUrls, ''),
             );
             $links = array();
             $itemId = 0;
             foreach ($urlGroups as $urlGroup) {
                 $checkedAttribute = $urlGroup[1];
                 foreach ($urlGroup[0] as $url) {
                     // Scraped hrefs may already contain entities (e.g. &amp;), so
                     // escape without double-encoding to keep the submitted value stable.
                     $escapedUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8', false);
                     $links[] = '<li class="orderingItem"><label for="urlId' . $itemId . '"><input type="checkbox" name="pages[]" value="' . $escapedUrl . '" id="urlId' . $itemId . '"' . $checkedAttribute . '/><span class="title">' . $escapedUrl . '</span></label></li>' . "\n";
                     $itemId++;
                 }
             }
             $step2Template = $this->getThemeFragment('generation-step2.htmlf');
             $step2Template = str_replace('{{scrape-results}}', implode('', $links), $step2Template);
             // Report the URL that was actually scraped, not a leftover loop variable.
             $step2Template = str_replace('{{scrape-url}}', htmlspecialchars($originalUrl, ENT_QUOTES, 'UTF-8', false), $step2Template);
             return $step2Template;
         default:
             return $this->getThemeFragment('generation-step1.htmlf');
     }
 }
 /**
  * Downloads the images referenced by the document and rewrites image and
  * link URLs inside the XML.
  *
  * The 'extract-pages-html-images-and-links.xsl' transform is expected to
  * produce one line per URL in the form "type<TAB>baseUrl<TAB>url". For every
  * line the URL is made absolute against its base URL, then:
  * - 'image': the image is fetched (falling back to the 404 placeholder when
  *   nothing could be fetched), stored under <contentDirectory>/Pictures using
  *   an md5-based file name, and the quoted URL in the XML is replaced by
  *   that path plus c:width / c:height attributes.
  * - 'link': the quoted URL in the XML is replaced by the absolute URL.
  *
  * @param string $currentXml Document XML to process.
  * @return string The XML with image and link URLs rewritten.
  */
 function process($currentXml)
 {
     $extractImagesPath = $this->docvertTransformDirectory . 'extract-pages-html-images-and-links.xsl';
     $htmlUrls = trim(xsltTransform($currentXml, $extractImagesPath));
     $missingImagePlaceholderImagePath = dirname(dirname(__FILE__)) . DIRECTORY_SEPARATOR . 'files' . DIRECTORY_SEPARATOR . '404image.gif';
     foreach (explode("\n", $htmlUrls) as $htmlUrlLine) {
         if (trim($htmlUrlLine) == '') {
             continue;
         }
         $urlLineParts = explode("\t", $htmlUrlLine);
         if (count($urlLineParts) < 3) {
             // Malformed line (type, base URL and URL are all required): skip it
             // rather than operate on undefined fields.
             continue;
         }
         $urlType = $urlLineParts[0];
         $baseUrl = $urlLineParts[1];
         $possiblyRelativeUrl = $urlLineParts[2];
         if (stringStartsWith($possiblyRelativeUrl, "http://") || stringStartsWith($possiblyRelativeUrl, "https://") || stringStartsWith($possiblyRelativeUrl, "mailto:")) {
             // Already absolute.
             $fullUrl = $possiblyRelativeUrl;
         } else {
             $connectionPart = getUrlConnectionPart($baseUrl);
             if (stringStartsWith($possiblyRelativeUrl, '/')) {
                 // Host-relative: only the connection part needs prepending.
                 $fullUrl = $connectionPart . $possiblyRelativeUrl;
             } else {
                 $localPartDirectory = getUrlLocalPartDirectory($baseUrl);
                 $relativePath = resolveRelativeUrl($localPartDirectory . $possiblyRelativeUrl);
                 if (!stringStartsWith($relativePath, '/')) {
                     $relativePath = '/' . $relativePath;
                 }
                 $fullUrl = $connectionPart . $relativePath;
             }
         }
         if (!file_exists($missingImagePlaceholderImagePath)) {
             webServiceError('&dynamic-error-process-downloadimagesandsetlinks-missing-placeholder;', 500, array('fourOhFourImagePath' => $missingImagePlaceholderImagePath));
         }
         // URLs extracted from markup may carry HTML entities (e.g. &amp;).
         $fullUrl = html_entity_decode($fullUrl);
         switch ($urlType) {
             case 'image':
                 $imageData = file_get_contents($fullUrl);
                 if ($imageData == null) {
                     // Loose comparison on purpose: covers both false (fetch failed)
                     // and an empty response body.
                     $imageData = file_get_contents($missingImagePlaceholderImagePath);
                 }
                 $picturesDirectory = $this->contentDirectory . DIRECTORY_SEPARATOR . 'Pictures';
                 if (!file_exists($picturesDirectory)) {
                     mkdir($picturesDirectory);
                 }
                 if (!function_exists('imagecreatefromstring')) {
                     webServiceError('&error-process-downloadimagesandsetlinks-missing-gd;');
                 }
                 $imageResource = imagecreatefromstring($imageData);
                 if (!$imageResource) {
                     // Undecodable data: take the dimensions from the placeholder
                     // image (the downloaded bytes are still what gets written below).
                     $imageResource = imagecreatefromstring(file_get_contents($missingImagePlaceholderImagePath));
                 }
                 $imageWidth = imagesx($imageResource);
                 $imageHeight = imagesy($imageResource);
                 $fileExtension = substr($fullUrl, strrpos($fullUrl, '.') + 1);
                 switch ($fileExtension) {
                     case 'jpg':
                     case 'jpeg':
                     case 'gif':
                     case 'png':
                         break;
                     default:
                         // Unknown or missing extension: store as .jpg.
                         $fileExtension = 'jpg';
                 }
                 $openDocumentPath = 'Pictures/' . md5($fullUrl) . '.' . $fileExtension;
                 file_put_contents($this->contentDirectory . DIRECTORY_SEPARATOR . $openDocumentPath, $imageData);
                 // The replacement closes the path's quote early so the width/height
                 // attributes can be injected right after it.
                 $imageUrlReplacement = $openDocumentPath . '" c:width="' . $imageWidth . '" c:height="' . $imageHeight;
                 //FIXME: assumes image @src has double-quote and not single
                 $currentXml = str_replace('"' . $possiblyRelativeUrl . '"', '"' . $imageUrlReplacement . '"', $currentXml);
                 break;
             case 'link':
                 $currentXml = str_replace('"' . $possiblyRelativeUrl . '"', '"' . htmlentities($fullUrl) . '"', $currentXml);
                 break;
         }
     }
     return $currentXml;
 }