/**
  * Function that performs the action of updating the local links on the webpage.
  *
  * @param WebPage          $webPage
  * @param HTMLImportStages $stagesSettings
  * @param WPMetaConfigs    $meta
  * @param null             $html_post_lookup
  *
  * @return null
  */
 protected function performStage(WebPage $webPage, HTMLImportStages $stagesSettings, WPMetaConfigs &$meta, &$html_post_lookup = null)
 {
     // Without a lookup of imported pages (full path => post id) there is nothing to rewrite.
     if (is_null($html_post_lookup)) {
         return;
     }
     $body = $meta->getPostContent();
     $bodyXML = XMLHelper::getXMLObjectFromString($body);
     // NOTE(review): assumes the helper yields a non-object (null/false) when the content cannot be parsed - confirm against XMLHelper.
     if (!is_object($bodyXML)) {
         echo '<span>***could not parse the page content, links were not updated</span><br>';
         return;
     }

     // Maps each local .htm/.html href found in the page to the full path of an imported file.
     $link_table = array();
     $all_links = $bodyXML->xpath('//a[@href]');
     // TODO: encapsulate this in a function use XMLHelper::getAllHRefsFromHTML as a start
     if ($all_links) {
         foreach ($all_links as $link) {
             // iterate the link's attributes to find the HREF value
             foreach ($link->attributes() as $attribute => $value) {
                 if (0 != strcasecmp('href', $attribute)) {
                     continue;
                 }
                 $path = '' . $value;
                 // TODO: handle foo.html#rar
                 // Values that start with a letter and contain a colon (http:, mailto:, ...) are not treated as local.
                 if (preg_match('/^[a-zA-Z].*:.*/', $path)) {
                     continue;
                 }
                 // TODO: need to handle foo.html without handling http://...
                 // only handle files that end in .html or .htm
                 if (!preg_match('/\\.([hH][tT][mM][lL]?)$/', $path)) {
                     continue;
                 }
                 $fullpath = $webPage->getFullPath($path);
                 if (!$fullpath) {
                     echo '<span>***could not update link ' . $path . '</span><br>';
                     continue;
                 }
                 // Only links whose target exists in the lookup are rewritten; any other link is left alone.
                 if (array_key_exists($fullpath, $html_post_lookup)) {
                     $link_table[$path] = $fullpath;
                 }
             }
         }
     }

     // With the full list of links known, rewrite the HTML as a whole so every reference is caught.
     foreach ($link_table as $link => $full_link) {
         $post_link = get_permalink($html_post_lookup[$full_link]);
         if (!$post_link) {
             // No permalink could be resolved for the post; keep the original href instead of blanking it.
             echo '<span>***could not update link ' . $link . '</span><br>';
             continue;
         }
         echo 'Updating ' . $link . ' with ' . $post_link . '<br>';
         $search_str = '/(\\b[hH][rR][eE][fF]\\s*=\\s*")([\\b\\.\\/]*' . preg_quote((string) $link, '/') . '\\b)(")/';
         // A callback inserts the permalink verbatim. Escaping it with preg_quote() would leave literal
         // backslashes in the URL, because preg_quote() is meant for patterns, not replacement strings.
         $updated = preg_replace_callback($search_str, function ($matches) use ($post_link) {
             return $matches[1] . $post_link . $matches[3];
         }, $body);
         // preg_replace_callback() returns null on a PCRE error; keep the previous content in that case.
         if (!is_null($updated)) {
             $body = $updated;
         }
     }
     $meta->setPostContent($body);
 }
示例#2
0
 /**
  * Calling this function causes the website hierarchy to be built.  An index file to be used may be passed in by the caller or the function can be overridden and inherently know its own index file.
  *
  * The expectation is that the implementation will construct a LinkedTree hierarchy of the website from the index file(s).  The top level will always be an array of LinkedTrees, to handle the case where there is no central top node.
  *
  * TODO: decrease the coupling of the child class on the LinkedTree object.
  *
  * @param null|string $indexFile index file to build website hierarchy from
  *
  * @return null|void
  */
 public function buildHierarchyFromWebsiteIndex($indexFile = null)
 {
     // Fall back to the class default when the caller does not name an index file.
     $indexFileToUse = is_null($indexFile) ? self::DEFAULT_INDEX_FILE_NAME : $indexFile;

     if (!\html_import\XMLHelper::valid_xml_file($this->retriever->getFullFilePath($indexFileToUse))) {
         echo 'Cannot find file ' . $indexFileToUse . "<br>Current path is " . getcwd() . '<br>';
         return;
     }

     $indexContents = $this->retriever->retrieveFileContents($indexFileToUse);
     $doc = new \DOMDocument();
     $xmlValid = false;
     // DOMDocument::loadXML() rejects an empty source (ValueError on PHP 8), so null or empty
     // contents are reported through the "not valid" message below instead of being parsed.
     if (is_string($indexContents) && $indexContents !== '') {
         // Collect libxml parse errors internally rather than emitting warnings; failure is handled below.
         $previousSetting = libxml_use_internal_errors(true);
         $xmlValid = $doc->loadXML($indexContents, LIBXML_NOBLANKS);
         if (!$previousSetting) {
             // Only discard the buffered errors when this method was the one that enabled buffering.
             libxml_clear_errors();
         }
         libxml_use_internal_errors($previousSetting);
     }

     if (!$xmlValid) {
         echo $indexFileToUse . " was not a valid XML file, error trying to load it.<br>";
         return;
     }

     // Hand every top-level node of the document to readInChildNode().
     $nodelist = $doc->childNodes;
     for ($i = 0; $i < $nodelist->length; $i++) {
         $this->readInChildNode($nodelist->item($i));
     }
 }
示例#3
0
 /**
  * Returns an array of all link destinations contained in the content of the webpage.
  * @return array
  */
 public function getAllLinks()
 {
     // Parse the page content, then pass the parsed object to XMLHelper::getAllHRefsFromHTML().
     $parsedContent = XMLHelper::getXMLObjectFromString($this->getContent());

     return XMLHelper::getAllHRefsFromHTML($parsedContent);
 }
示例#4
0
 /**
  * Replaces all body tags with div tags.
  *
  * @param $body
  *
  * @return mixed
  */
 private function replaceBodyWithDivs($body)
 {
     // Delegates to XMLHelper::renameTags(), asking it to rename 'body' to 'div'.
     return XMLHelper::renameTags($body, 'body', 'div');
 }
示例#5
0
 /**
  * Builds meta data based on a loaded WebPage, and HtmlImportSettings from the plugin.
  *
  * @param admin\HtmlImportSettings $globalSettings
  * @param WebPage                  $webPage
  * @param null                     $post_id
  * @param null                     $parent_page_id
  */
 public function buildConfig(admin\HtmlImportSettings $globalSettings, WebPage $webPage, $post_id = null, $parent_page_id = null)
 {
     // Start from the stored values when an existing post id is supplied.
     if (!is_null($post_id)) {
         $this->loadFromPostID($post_id);
     }

     // $webPage is a non-nullable type-hinted parameter, so it never needs a null check here.
     $file_as_xml_obj = XMLHelper::getXMLObjectFromString($webPage->getContent());
     if (!is_null($file_as_xml_obj)) {
         $this->setPostContent($file_as_xml_obj->body->asXML());
         $this->setPostTitle($this->getTitleFromTag($file_as_xml_obj));
     }

     $this->setPostName($this->getPostTitle());
     $this->setPostStatus('publish');
     $this->setPostType('page');
     $this->setCommentStatus('closed');
     $this->setPingStatus('closed');

     // Category ids from the page's own settings take precedence over the global settings.
     $categoryIDs = null;
     $overrideSettings = $webPage->getSettings();
     if (!is_null($overrideSettings)) {
         $categoryIDs = $overrideSettings->getCategoryIds();
     }
     // TODO: need to determine if index can override by providing no categories, and what that means
     if (is_null($categoryIDs) || count($categoryIDs) <= 0) {
         // Fall back to the global category names, translating each one to its category id.
         $categoryIDs = null;
         $category = $globalSettings->getCategories()->getValuesArray();
         if (is_array($category)) {
             $resolvedIDs = array();
             foreach ($category as $index => $cat) {
                 $resolvedIDs[$index] = intval(get_cat_ID(trim($cat)));
             }
             // Stay null (rather than an empty array) when no global categories are configured.
             if (count($resolvedIDs) > 0) {
                 $categoryIDs = $resolvedIDs;
             }
         }
     }
     $this->setPostCategory($categoryIDs);

     // TODO need a way to track the date and time of the original file
     $this->setPostDate(null);

     if (!is_null($parent_page_id)) {
         $this->setPostParent($parent_page_id);
     }

     // Only set a menu order when the page supplies one.
     $order = $webPage->getOrderPosition();
     if (!is_null($order)) {
         $this->setMenuOrder($order);
     }

     $this->setPostAuthor(wp_get_current_user()->ID);
     // TODO: should be in the settings object
     $this->setPageTemplate($globalSettings->getTemplate()->getValue());
 }
示例#6
0
 /**
  * Performs the stage action of uploading media files and updating the WebPage accordingly.
  *
  * @param WebPage          $webPage
  * @param HTMLImportStages $stagesSettings
  * @param WPMetaConfigs    $meta
  * @param null             $media_lookup
  *
  * @return null
  */
 protected function performStage(WebPage $webPage, HTMLImportStages $stagesSettings, WPMetaConfigs &$meta, &$media_lookup = null)
 {
     $post_id = $meta->getPostId();
     $body = $meta->getPostContent();
     if (is_null($body) || strcmp('', $body) == 0) {
         echo '** the body for post ' . $post_id . ' was empty, no media to import.';
         return;
     }
     $file_as_xml_obj = XMLHelper::getXMLObjectFromString($body);
     // NOTE(review): assumes the helper yields a non-object (null/false) when the content cannot be parsed - confirm against XMLHelper.
     if (!is_object($file_as_xml_obj)) {
         echo '** the body for post ' . $post_id . ' could not be parsed, no media to import.';
         return;
     }

     // Maps each local media reference found in the page to the full path of the imported file.
     $media_table = array();
     // Nothing is imported unless the caller supplied a lookup (full path => attachment id).
     if (!is_null($media_lookup)) {
         // import img srcs
         foreach ($this->findLocalAttributeValues($file_as_xml_obj, '//img[@src]', 'src') as $path) {
             $this->importMediaFile($webPage, $path, $post_id, $media_lookup, $media_table);
         }
         // linked media: png,bmp,jpg,jpeg,gif,pdf,doc,docx,mp3,ogg,wav
         foreach ($this->findLocalAttributeValues($file_as_xml_obj, '//a[@href]', 'href') as $path) {
             if (preg_match('/\\.(png|bmp|jpg|jpeg|gif|pdf|doc|docx|mp3|ogg|wav)$/', strtolower($path))) {
                 $this->importMediaFile($webPage, $path, $post_id, $media_lookup, $media_table);
             }
         }
     }

     foreach ($media_table as $media_item => $full_media_path) {
         $media_url = wp_get_attachment_url($media_lookup[$full_media_path]);
         if (!$media_url) {
             // No URL could be resolved for the attachment; keep the original reference instead of blanking it.
             continue;
         }
         // A callback inserts the URL verbatim. Escaping it with preg_quote() would leave literal
         // backslashes in the URL, because preg_quote() is meant for patterns, not replacement strings.
         $insertUrl = function ($matches) use ($media_url) {
             return $matches[1] . $media_url . $matches[3];
         };
         $quoted_item = preg_quote((string) $media_item, '/');
         $patterns = array(
             // img src
             '/(\\b[iI][mM][gG]\\s*[^>]*\\s+[sS][rR][cC]\\s*=\\s*")([\\b\\/\\.]*' . $quoted_item . '\\b)(")/',
             // a href: anchored on the opening quote. A leading \b here would never match paths
             // that start with "." or "/", because a quote followed by either is not a word boundary.
             '/(\\b[hH][rR][eE][fF]\\s*=\\s*")(' . $quoted_item . '\\b)(")/',
         );
         foreach ($patterns as $pattern) {
             $updated = preg_replace_callback($pattern, $insertUrl, $body);
             // preg_replace_callback() returns null on a PCRE error; keep the previous content in that case.
             if (!is_null($updated)) {
                 $body = $updated;
             }
         }
     }
     $meta->setPostContent($body);
     echo '<li>Post ' . $post_id . ' updated with correct image links.</li>';
 }

 /**
  * Collects the values of one attribute from every element matched by an XPath query,
  * skipping values that start with a letter and contain a colon (http:, mailto:, ...).
  *
  * @param object $xml           parsed document exposing xpath(), as returned by XMLHelper::getXMLObjectFromString
  * @param string $xpathQuery    XPath expression selecting the elements to inspect
  * @param string $attributeName attribute to read, compared case-insensitively
  *
  * @return array list of attribute values as strings, in document order
  */
 private function findLocalAttributeValues($xml, $xpathQuery, $attributeName)
 {
     $values = array();
     $elements = $xml->xpath($xpathQuery);
     if (!$elements) {
         return $values;
     }
     foreach ($elements as $element) {
         foreach ($element->attributes() as $attribute => $value) {
             $path = '' . $value;
             if (0 == strcasecmp($attributeName, $attribute) && !preg_match('/^[a-zA-Z].*:.*/', $path)) {
                 $values[] = $path;
             }
         }
     }
     return $values;
 }

 /**
  * Makes sure one referenced media file has an attachment and records it in both tables.
  *
  * A file already present in $media_lookup only has its attachment metadata re-saved.
  * Any other file is uploaded, inserted as an attachment of $post_id and added to $media_lookup.
  * On success $media_table[$path] is set to the file's full path; on failure a message is
  * echoed and neither table is changed.
  *
  * @param WebPage $webPage      page the reference was found in, used to resolve and read the file
  * @param string  $path         reference exactly as written in the page
  * @param mixed   $post_id      post the new attachment is attached to
  * @param array   $media_lookup full path => attachment id, updated in place
  * @param array   $media_table  reference => full path for this page, updated in place
  *
  * @return void
  */
 private function importMediaFile(WebPage $webPage, $path, $post_id, &$media_lookup, array &$media_table)
 {
     // A reference that appears several times in the page only needs handling once.
     if (array_key_exists($path, $media_table)) {
         return;
     }
     $fullpath = $webPage->getFullPath($path);
     if (!$fullpath) {
         // The path could not be resolved, so there is nothing to look up or upload.
         echo '<li>***Unable to upload media file ' . $path . '</li>';
         return;
     }
     require_once ABSPATH . 'wp-admin/includes/image.php';

     if (array_key_exists($fullpath, $media_lookup)) {
         $attach_id = $media_lookup[$fullpath];
         $attach_data = wp_get_attachment_metadata($attach_id);
         wp_update_attachment_metadata($attach_id, $attach_data);
         $media_table[$path] = $fullpath;
         return;
     }

     $filename = basename($fullpath);
     $upload = wp_upload_bits($filename, null, $webPage->getLinkContents($path));
     if ($upload['error']) {
         echo '<li>***Unable to upload media file ' . $filename . '</li>';
         return;
     }
     echo '<li>' . $filename . ' media file uploaded.</li>';
     $wp_filetype = wp_check_filetype(basename($upload['file']), null);
     $attachment = array(
         'guid'           => $upload['file'],
         'post_mime_type' => $wp_filetype['type'],
         'post_title'     => preg_replace('/\\.[^.]+$/', '', basename($upload['file'])),
         'post_content'   => '',
         'post_status'    => 'inherit',
     );
     $attach_id = wp_insert_attachment($attachment, $upload['file'], $post_id);
     $attach_data = wp_generate_attachment_metadata($attach_id, $upload['file']);
     wp_update_attachment_metadata($attach_id, $attach_data);
     $media_lookup[$fullpath] = $attach_id;
     $media_table[$path] = $fullpath;
     echo '<li>' . $filename . ' attached to post ' . $post_id . '</li>';
 }
 /**
  * Retrieves the contents of the given file, located using the relative path (if provided) and the base path used for the class.
  * Returns the contents as a string.
  * Assumes the file is text based.
  *
  * @param string $file
  * @param string $relativePath
  *
  * @return string
  */
 public function retrieveFileContents($file, $relativePath = '')
 {
     $fullPath = $this->buildFullPath($file, $relativePath);

     if (filter_var($fullPath, FILTER_VALIDATE_URL)) {
         // Remote file: only http/https are supported. The comparison must be strict, because
         // strpos()/stripos() return false when the prefix is absent and false == 0 is true.
         if (stripos($fullPath, 'http://') !== 0 && stripos($fullPath, 'https://') !== 0) {
             echo '*** ' . $fullPath . ' is not an HTTP or HTTPS URL.';
             return null;
         }
         if (!\html_import\XMLHelper::url_exists($fullPath)) {
             return null;
         }
         $readPath = $fullPath;
     } else {
         // Local file: realpath() returns false when the path cannot be resolved.
         $readPath = realpath($fullPath);
         if ($readPath === false) {
             return null;
         }
     }

     $contents = file_get_contents($readPath);
     if ($contents === false) {
         // Report the path that was actually read; $relativePath alone is often an empty string.
         echo '*** ' . $fullPath . ' could not be read, may be non-existent or 0 length.';
         return null;
     }
     return $contents;
 }