Example #1
0
 /**
  * Imports the given WebPage as a folder by saving the post described by $meta.
  *
  * The result of $meta->updateWPPost() is treated as the new post ID unless it is a
  * WP_Error.  On success the ID is stored on $webPage via setWPID(); in both cases a
  * <li> status line is echoed.
  *
  * @param WebPage       $webPage          page object that receives the resulting post ID
  * @param WPMetaConfigs $meta             post configuration to save
  * @param array|null    $html_post_lookup not used by this importer
  * @param array|null    $media_lookup     not used by this importer
  *
  * @return void
  */
 protected function doImport(WebPage $webPage, WPMetaConfigs $meta, &$html_post_lookup = null, &$media_lookup = null)
 {
     $updateResult = $meta->updateWPPost();
     if (is_wp_error($updateResult)) {
         echo '<li>***Unable to create folder ' . $meta->getPostTitle() . ' from ' . $meta->getSourcePath() . '</li>';

         return;
     }

     $webPage->setWPID($updateResult);
     // Report the source path after "from", mirroring the failure message; the title is already reported at the end of the line.
     echo '<li>Folder created from ' . $meta->getSourcePath() . ' into post #' . $updateResult . ' with title ' . $meta->getPostTitle() . '</li>';
 }
 /**
  * Rewrites local links in the post content so they point at already-imported posts.
  *
  * Every <a href> found in the content held by $meta is inspected.  An href is
  * considered only when it carries no URI scheme and ends in .htm/.html.  It is
  * resolved through $webPage->getFullPath(); when the resolved path is a key of
  * $html_post_lookup, every double-quoted href="..." occurrence of that link in the
  * content is replaced with the permalink of the mapped post.  The resulting content
  * is written back to $meta.  Nothing is changed when $html_post_lookup is null.
  *
  * Progress and failures are echoed as HTML.
  *
  * @param WebPage          $webPage          page whose links are being rewritten
  * @param HTMLImportStages $stagesSettings   not used by this stage
  * @param WPMetaConfigs    $meta             source and destination of the post content
  * @param array|null       $html_post_lookup map of resolved source path => post ID
  *
  * @return void
  */
 protected function performStage(WebPage $webPage, HTMLImportStages $stagesSettings, WPMetaConfigs &$meta, &$html_post_lookup = null)
 {
     $body = $meta->getPostContent();
     if (is_null($html_post_lookup)) {
         return;
     }

     $bodyXML = XMLHelper::getXMLObjectFromString($body);
     // NOTE(review): the parser's failure value is not visible from here; anything that is not an object is treated as "no links".
     $all_links = is_object($bodyXML) ? $bodyXML->xpath('//a[@href]') : array();

     // TODO: encapsulate this in a function use XMLHelper::getAllHRefsFromHTML as a start
     $link_table = array();
     if ($all_links) {
         foreach ($all_links as $link) {
             // iterate the link's attributes to find the HREF value (attribute name compared case-insensitively)
             foreach ($link->attributes() as $attribute => $value) {
                 if (0 !== strcasecmp('href', $attribute)) {
                     continue;
                 }
                 $path = (string) $value;
                 // TODO: handle foo.html#rar
                 // anything that starts with a scheme (http:, mailto:, ...) is not a local link
                 if (preg_match('/^[a-zA-Z].*:.*/', $path)) {
                     continue;
                 }
                 // only handle files that end in .html or .htm
                 if (!preg_match('/\\.([hH][tT][mM][lL]?)$/', $path)) {
                     continue;
                 }
                 $fullpath = $webPage->getFullPath($path);
                 if (!$fullpath) {
                     echo '<span>***could not update link ' . $path . '</span><br>';
                     continue;
                 }
                 // only links to pages that were already imported can be rewritten; others are left alone
                 if (array_key_exists($fullpath, $html_post_lookup)) {
                     $link_table[$path] = $fullpath;
                 }
             }
         }
     }

     // with the full list of links known, rewrite the html as a whole to catch all references
     foreach ($link_table as $link => $full_link) {
         $post_link = get_permalink($html_post_lookup[$full_link]);
         if (!$post_link) {
             // get_permalink() yields false for an unknown post; keep the original href rather than blanking it
             echo '<span>***could not update link ' . $link . '</span><br>';
             continue;
         }
         echo 'Updating ' . $link . ' with ' . $post_link . '<br>';
         $search_str = '/(\\b[hH][rR][eE][fF]\\s*=\\s*")([\\b\\.\\/]*' . preg_quote((string) $link, '/') . '\\b)(")/';
         // A callback inserts the permalink verbatim.  preg_quote() is only meant for patterns; using it on the
         // replacement text would leave stray backslashes in the rewritten URL.
         $updated = preg_replace_callback(
             $search_str,
             function (array $match) use ($post_link) {
                 return $match[1] . $post_link . $match[3];
             },
             $body
         );
         if (!is_null($updated)) {
             // null signals a PCRE failure; in that case the previous content is kept
             $body = $updated;
         }
     }
     $meta->setPostContent($body);
 }
Example #3
0
 /**
  * Recursively turns the ordering/listing arrays into WebPage nodes.
  *
  * Each entry of $fileOrder supplies an index 'i' into $fileList, whose 'title' and
  * 'path' values are used to create a WebPage.  The node is attached to $parentNode
  * when one is given, otherwise it is appended to $this->trees.  An entry with an 'n'
  * key has that value processed recursively with the new node as the parent.
  *
  * @param Array       $fileOrder  array based on Toc.js
  * @param Array       $fileList   array based on Toc_Chunk0.js
  * @param  LinkedTree $parentNode the base node to attach the hierarchy to
  *
  * @return LinkedTree|null the node created for the first entry of $fileOrder, or null when $fileOrder is empty
  */
 private function buildTree(array $fileOrder, array $fileList, LinkedTree $parentNode = null)
 {
     $head = null;
     $hasParent = !is_null($parentNode);

     foreach ($fileOrder as $entry) {
         $index = $entry['i'];
         $path = $fileList[$index]['path'];
         $title = $fileList[$index]['title'];

         $page = new WebPage($this->retriever, $title, $path);
         $page->setOrderPosition($index);

         if ($hasParent) {
             $parentNode->addChild($page);
         } else {
             // no parent: the page is the root of its own tree
             $this->trees[] = $page;
         }

         // remember only the node built for the very first entry
         if (is_null($head)) {
             $head = $page;
         }

         // descend into the children, if the entry declares any
         if (array_key_exists('n', $entry)) {
             $this->buildTree($entry['n'], $fileList, $page);
         }
     }

     return $head;
 }
Example #4
0
 /**
  * Builds the WPMetaConfigs used to import a WebPage as a child of $parent_page_id.
  *
  * The target post ID is chosen as follows when $html_post_lookup is an array:
  *  - if the page's full path is a key of the lookup, the mapped ID is used;
  *  - otherwise, if the page has a title and a post with that (htmlspecialchars-encoded)
  *    title exists, that post's ID is used and a notice is echoed.
  * In every other case the ID stays null.
  *
  * @param \html_import\indices\WebPage          $webPage          page being imported
  * @param \html_import\admin\HtmlImportSettings $globalSettings   settings handed to buildConfig()
  * @param mixed                                 $parent_page_id   ID of the parent post, passed through to buildConfig()
  * @param array|null                            $html_post_lookup map of full source path => post ID for this session
  *
  * @return \html_import\WPMetaConfigs
  */
 private function importAnHTML(\html_import\indices\WebPage $webPage, \html_import\admin\HtmlImportSettings $globalSettings, $parent_page_id, $html_post_lookup)
 {
     $title = $webPage->getTitle();
     $pageMeta = new \html_import\WPMetaConfigs();
     $post_id = null;

     if (is_array($html_post_lookup)) {
         $fullPath = $webPage->getFullPath();
         if (array_key_exists($fullPath, $html_post_lookup)) {
             // the page was already imported during this session, reuse its post
             $post_id = $html_post_lookup[$fullPath];
         } elseif (!is_null($title)) {
             // The title query runs only when its result can be used, and never with a null title
             // (which would search for an empty title).
             // TODO: bad form, its saved with htmlspecialchars so need to search using that.  Need to find a way to not require this knowledge
             // NOTE(review): get_page_by_title() is deprecated in recent WordPress releases - confirm the supported WP version.
             $post = get_page_by_title(htmlspecialchars($title));
             if (!is_null($post)) {
                 // the post wasn't imported during this import, but a post already exists with its title.  Use it.
                 $post_id = $post->ID;
                 echo '<li>Page with title ' . $title . ' and ID ' . $post_id . ' already exists, now tagged to be overwritten.</li>';
             }
         }
     }

     $pageMeta->buildConfig($globalSettings, $webPage, $post_id, $parent_page_id);
     if (!is_null($title)) {
         $pageMeta->setPostTitle($title);
     }

     return $pageMeta;
 }
Example #5
0
 /**
  * Uploads the local media referenced by the post content and rewrites the references.
  *
  * Local <img src> values and local <a href> values with a media extension are collected
  * from the content held by $meta.  Each one is resolved through $webPage->getFullPath();
  * a path already present in $media_lookup reuses the mapped attachment, any other path
  * is uploaded, inserted as an attachment of the post and recorded in $media_lookup.
  * Finally every double-quoted src="..."/href="..." occurrence of a handled path is
  * replaced with the attachment URL and the content is written back to $meta.
  *
  * When $media_lookup is null nothing is uploaded or rewritten.  Progress and failures
  * are echoed as HTML.
  *
  * @param WebPage          $webPage        page the media paths are relative to
  * @param HTMLImportStages $stagesSettings not used by this stage
  * @param WPMetaConfigs    $meta           source and destination of the post content
  * @param array|null       $media_lookup   map of resolved media path => attachment ID, extended in place
  *
  * @return void
  */
 protected function performStage(WebPage $webPage, HTMLImportStages $stagesSettings, WPMetaConfigs &$meta, &$media_lookup = null)
 {
     $post_id = $meta->getPostId();
     $body = $meta->getPostContent();
     if (is_null($body) || 0 == strcmp('', $body)) {
         echo '** the body for post ' . $post_id . ' was empty, no media to import.';

         return;
     }

     // path as written in the html => resolved path (key into $media_lookup)
     $media_table = array();
     if (!is_null($media_lookup)) {
         $file_as_xml_obj = XMLHelper::getXMLObjectFromString($body);
         // NOTE(review): the parser's failure value is not visible from here; anything that is not an object is treated as "no media".
         $paths = is_object($file_as_xml_obj) ? $this->findLocalMediaPaths($file_as_xml_obj) : array();
         foreach ($paths as $path) {
             $fullpath = $this->attachLocalMedia($webPage, $path, $post_id, $media_lookup);
             if (!is_null($fullpath)) {
                 $media_table[$path] = $fullpath;
             }
         }
     }

     foreach ($media_table as $media_item => $full_media_path) {
         $media_url = wp_get_attachment_url($media_lookup[$full_media_path]);
         if (!$media_url) {
             // no URL available for the attachment: keep the original reference instead of blanking it
             continue;
         }
         $quoted_item = preg_quote((string) $media_item, '/');
         $patterns = array(
             // img src
             '/(\\b[iI][mM][gG]\\s*[^>]*\\s+[sS][rR][cC]\\s*=\\s*")([\\b\\/\\.]*' . $quoted_item . '\\b)(")/',
             // a href - no word boundary before the path, so values starting with "." or "/" match as well
             '/(\\b[hH][rR][eE][fF]\\s*=\\s*")(' . $quoted_item . '\\b)(")/',
         );
         // A callback inserts the URL verbatim.  preg_quote() is only meant for patterns; using it on the
         // replacement text would leave stray backslashes in the rewritten URL.
         $updated = preg_replace_callback(
             $patterns,
             function (array $match) use ($media_url) {
                 return $match[1] . $media_url . $match[3];
             },
             $body
         );
         if (!is_null($updated)) {
             // null signals a PCRE failure; in that case the previous content is kept
             $body = $updated;
         }
     }

     $meta->setPostContent($body);
     echo '<li>Post ' . $post_id . ' updated with correct image links.</li>';
 }

 /**
  * Lists the distinct local media paths referenced by the parsed content.
  *
  * A path is local when it does not start with a URI scheme.  Every local img src is
  * returned; a local a href is returned only when it ends in one of the handled media
  * extensions (png, bmp, jpg, jpeg, gif, pdf, doc, docx, mp3, ogg, wav).
  *
  * @param object $xml parsed content; must offer xpath() and elements offering attributes()
  *
  * @return string[] paths exactly as written in the attributes, img sources first
  */
 private function findLocalMediaPaths($xml)
 {
     $queries = array(
         'src'  => '//img[@src]',
         'href' => '//a[@href]',
     );

     $found = array();
     foreach ($queries as $attribute_name => $query) {
         $elements = $xml->xpath($query);
         if (!$elements) {
             continue;
         }
         foreach ($elements as $element) {
             foreach ($element->attributes() as $attribute => $value) {
                 if (0 !== strcasecmp($attribute_name, $attribute)) {
                     continue;
                 }
                 $path = (string) $value;
                 // anything that starts with a scheme (http:, data:, ...) is not a local file
                 if (preg_match('/^[a-zA-Z].*:.*/', $path)) {
                     continue;
                 }
                 // links are only of interest when they point at a media file
                 if ('href' === $attribute_name && !preg_match('/\\.(png|bmp|jpg|jpeg|gif|pdf|doc|docx|mp3|ogg|wav)$/', strtolower($path))) {
                     continue;
                 }
                 // used as a key so that each path is handled once
                 $found[$path] = true;
             }
         }
     }

     // array keys that look like integers come back as ints, so normalise to strings
     return array_map('strval', array_keys($found));
 }

 /**
  * Makes sure an attachment exists for one local media path.
  *
  * A resolved path already present in $media_lookup only has its attachment metadata
  * re-saved.  Otherwise the file contents are fetched through $webPage->getLinkContents(),
  * uploaded, inserted as an attachment of $post_id and recorded in $media_lookup.
  *
  * @param WebPage $webPage      page the path is relative to
  * @param string  $path         media path as written in the html
  * @param mixed   $post_id      ID of the post the attachment belongs to
  * @param array   $media_lookup map of resolved media path => attachment ID, extended in place
  *
  * @return string|null the resolved path on success, null when the media could not be attached
  */
 private function attachLocalMedia(WebPage $webPage, $path, $post_id, &$media_lookup)
 {
     $fullpath = $webPage->getFullPath($path);
     if (!$fullpath) {
         echo '<li>***Unable to resolve media file ' . $path . '</li>';

         return null;
     }

     require_once ABSPATH . 'wp-admin/includes/image.php';

     if (array_key_exists($fullpath, $media_lookup)) {
         $attach_id = $media_lookup[$fullpath];
         $attach_data = wp_get_attachment_metadata($attach_id);
         wp_update_attachment_metadata($attach_id, $attach_data);

         return $fullpath;
     }

     $filename = basename($fullpath);
     $upload = wp_upload_bits($filename, null, $webPage->getLinkContents($path));
     if ($upload['error']) {
         echo '<li>***Unable to upload media file ' . $filename . '</li>';

         return null;
     }
     echo '<li>' . $filename . ' media file uploaded.</li>';

     $uploaded_name = basename($upload['file']);
     $wp_filetype = wp_check_filetype($uploaded_name, null);
     $attachment = array(
         // NOTE(review): the guid is set to the file path; WordPress examples use the file URL here - confirm.
         'guid'           => $upload['file'],
         'post_mime_type' => $wp_filetype['type'],
         'post_title'     => preg_replace('/\\.[^.]+$/', '', $uploaded_name),
         'post_content'   => '',
         'post_status'    => 'inherit',
     );
     $attach_id = wp_insert_attachment($attachment, $upload['file'], $post_id);
     if (is_wp_error($attach_id) || !$attach_id) {
         // without a valid attachment ID there is nothing to record or link to
         echo '<li>***Unable to attach media file ' . $filename . ' to post ' . $post_id . '</li>';

         return null;
     }

     $attach_data = wp_generate_attachment_metadata($attach_id, $upload['file']);
     wp_update_attachment_metadata($attach_id, $attach_data);
     $media_lookup[$fullpath] = $attach_id;
     echo '<li>' . $filename . ' attached to post ' . $post_id . '</li>';

     return $fullpath;
 }