Example #1
0
/**
 * Extract the WordProcessingML XML files from the .docx file, and use a sequence of XSLT
 * steps to convert it into XHTML
 *
 * The .docx package is read entry by entry. Supported images (GIF, PNG, JPG, JPEG) whose
 * path contains "media" are stored in the user's draft file area; the XML parts needed by
 * the stylesheets are stripped of their XML declaration and merged into one wrapper
 * document. That document is transformed by the pass 1 stylesheet, and the result is
 * transformed again by the pass 2 stylesheet. Superfluous namespace declarations and the
 * "mml:" MathML prefix are then stripped from the output.
 *
 * @param string $filename name of file uploaded to file repository as a draft
 * @param int $usercontextid ID of draft file area where images should be stored
 * @param int $draftitemid ID of particular group in draft file area where images should be stored
 * @return string XHTML content extracted from Word file
 * @throws moodle_exception if the file cannot be unzipped, XSLT support or a stylesheet is
 *     missing, a temporary file cannot be written, or an XSLT pass fails
 */
function atto_wordimport_convert_to_xhtml($filename, $usercontextid, $draftitemid)
{
    global $CFG, $USER;

    // Pass 1 stylesheet: convert WordML into basic XHTML.
    $word2xmlstylesheet1 = __DIR__ . "/wordml2xhtmlpass1.xsl";
    // Pass 2 stylesheet: refine basic XHTML into Word-compatible XHTML.
    $word2xmlstylesheet2 = __DIR__ . "/wordml2xhtmlpass2.xsl";

    // @codingStandardsIgnoreLine debugging(__FUNCTION__ . ":" . __LINE__ . ": filename = \"{$filename}\"", DEBUG_WORDIMPORT);
    // Check that we can unzip the Word .docx file into its component files.
    $zipres = zip_open($filename);
    if (!is_resource($zipres)) {
        // Cannot unzip file.
        atto_wordimport_debug_unlink($filename);
        throw new moodle_exception('cannotunzipfile', 'error');
    }

    // Check that XSLT is installed.
    if (!class_exists('XSLTProcessor') || !function_exists('xslt_create')) {
        // PHP extension 'xsl' is required for this action.
        // Release the zip handle before bailing out so it is not leaked.
        zip_close($zipres);
        throw new moodle_exception('extensionrequired', 'tool_xmldb', '', 'xsl');
    }

    // Give XSLT as much memory as possible, to enable larger Word files to be imported.
    raise_memory_limit(MEMORY_HUGE);

    // Both stylesheets are needed; fail early with a precise message if either is missing.
    foreach (array($word2xmlstylesheet1, $word2xmlstylesheet2) as $stylesheet) {
        if (!file_exists($stylesheet)) {
            zip_close($zipres);
            throw new moodle_exception('filemissing', 'moodle', '', $stylesheet);
        }
    }

    // Set common parameters for all XSLT transformations.
    $parameters = array(
        'moodle_language' => current_language(),
        'moodle_textdirection' => right_to_left() ? 'rtl' : 'ltr',
        'heading1stylelevel' => get_config('atto_wordimport', 'heading1stylelevel'),
        'pluginname' => 'atto_wordimport',
        'debug_flag' => DEBUG_WORDIMPORT,
    );

    // Pre-XSLT preparation: merge the WordML and image content from the .docx Word file into one large XML file.
    // Initialise an XML string to use as a wrapper around all the XML files.
    $xmldeclaration = '<?xml version="1.0" encoding="UTF-8"?>';
    $wordmldata = $xmldeclaration . "\n<pass1Container>\n";
    $imagestring = "";

    // Map each required package part to the wrapper element it is placed in.
    $xmlwrappers = array(
        "word/document.xml" => "wordmlContainer",
        "docProps/core.xml" => "dublinCore",
        "docProps/custom.xml" => "customProps",
        "word/styles.xml" => "styleMap",
        "word/_rels/document.xml.rels" => "documentLinks",
        "word/footnotes.xml" => "footnotesContainer",
        "word/_rels/footnotes.xml.rels" => "footnoteLinks",
        // @codingStandardsIgnoreLine "word/_rels/settings.xml.rels" => "settingsLinks",
    );
    // GIF, PNG, JPG and JPEG handled OK, but bmp and other non-Internet formats are not.
    $supportedimages = array('gif', 'png', 'jpg', 'jpeg');
    // Entry names come from the uploaded file, so escape them before embedding in XML attributes.
    $xmlescapeflags = ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE;

    $fs = get_file_storage();
    // Prepare filerecord array for creating each new image file.
    $fileinfo = array(
        'contextid' => $usercontextid,
        'component' => 'user',
        'filearea' => 'draft',
        'userid' => $USER->id,
        'itemid' => $draftitemid,
        'filepath' => '/',
        'filename' => '',
    );

    $zipentry = zip_read($zipres);
    while ($zipentry) {
        if (!zip_entry_open($zipres, $zipentry, "r")) {
            // Can't read the XML file from the Word .docx file.
            zip_close($zipres);
            throw new moodle_exception('errorunzippingfiles', 'error');
        }
        $zefilename = zip_entry_name($zipentry);
        $zefilesize = zip_entry_filesize($zipentry);

        // Compare against false explicitly: a match at offset 0 is still a match.
        if (strpos($zefilename, "media") !== false) {
            // Insert internal images into the files table.
            $imagename = basename($zefilename);
            $imagesuffix = strtolower(substr(strrchr($zefilename, "."), 1));
            if (in_array($imagesuffix, $supportedimages, true)) {
                $imagedata = zip_entry_read($zipentry, $zefilesize);
                // Prepare the file details for storage, ensuring the image name is unique.
                $imagenameunique = $imagename;
                $file = $fs->get_file($usercontextid, 'user', 'draft', $draftitemid, '/', $imagenameunique);
                while ($file) {
                    $imagenameunique = basename($imagename, '.' . $imagesuffix) . '_' . substr(uniqid(), 8, 4) . '.' . $imagesuffix;
                    $file = $fs->get_file($usercontextid, 'user', 'draft', $draftitemid, '/', $imagenameunique);
                }
                $fileinfo['filename'] = $imagenameunique;
                $fs->create_file_from_string($fileinfo, $imagedata);
                $imageurl = "{$CFG->wwwroot}/draftfile.php/{$usercontextid}/user/draft/{$draftitemid}/{$imagenameunique}";

                $xmlimagename = htmlspecialchars($imagename, $xmlescapeflags, 'UTF-8');
                $xmluniquename = htmlspecialchars($imagenameunique, $xmlescapeflags, 'UTF-8');
                $xmlimageurl = htmlspecialchars($imageurl, $xmlescapeflags, 'UTF-8');
                // Return all the details of where the file is stored, even though we don't need them at the moment.
                $imagestring .= "<file filename=\"media/{$xmlimagename}\"";
                $imagestring .= " contextid=\"{$usercontextid}\" itemid=\"{$draftitemid}\"";
                $imagestring .= " name=\"{$xmluniquename}\" url=\"{$xmlimageurl}\">{$xmlimageurl}</file>\n";
                // @codingStandardsIgnoreLine } else {
                // @codingStandardsIgnoreLine debugging(__FUNCTION__ . ":" . __LINE__ . ": ignore unsupported media file $zefilename" .
                // @codingStandardsIgnoreLine     " = $imagename, imagesuffix = $imagesuffix", DEBUG_WORDIMPORT);
            }
        } else if (isset($xmlwrappers[$zefilename])) {
            // Read and wrap required XML files, remove the XML declaration, and add them to the XML string.
            $xmlfiledata = preg_replace('/<\\?xml version="1.0" ([^>]*)>/', "", zip_entry_read($zipentry, $zefilesize));
            $wrapper = $xmlwrappers[$zefilename];
            $wordmldata .= "<{$wrapper}>" . $xmlfiledata . "</{$wrapper}>\n";
            // @codingStandardsIgnoreLine } else {
            // @codingStandardsIgnoreLine debugging(__FUNCTION__ . ":" . __LINE__ . ": Ignore $zefilename", DEBUG_WORDIMPORT);
        }

        // Close this entry before moving on, then get the next file in the Zip package.
        zip_entry_close($zipentry);
        $zipentry = zip_read($zipres);
    }
    // End while loop.
    zip_close($zipres);

    // Add images section.
    $wordmldata .= "<imagesContainer>\n" . $imagestring . "</imagesContainer>\n";
    // Close the merged XML file.
    $wordmldata .= "</pass1Container>";

    // Pass 1 - convert WordML into linear XHTML.
    // Create a temporary file to store the merged WordML XML content to transform.
    $tempbasename = $CFG->dataroot . '/temp/' . basename($filename, ".tmp");
    $tempwordmlfilename = $tempbasename . ".wml";
    // file_put_contents() returns false (not 0) when the write fails.
    if (file_put_contents($tempwordmlfilename, $wordmldata) === false) {
        // Cannot save the file.
        throw new moodle_exception('cannotsavefile', 'error', '', $tempwordmlfilename);
    }

    $xsltproc = xslt_create();
    if (!($xsltoutput = xslt_process($xsltproc, $tempwordmlfilename, $word2xmlstylesheet1, null, null, $parameters))) {
        // Transformation failed.
        atto_wordimport_debug_unlink($tempwordmlfilename);
        throw new moodle_exception('transformationfailed', 'atto_wordimport', '', $tempwordmlfilename);
    }
    atto_wordimport_debug_unlink($tempwordmlfilename);
    // @codingStandardsIgnoreLine debugging(__FUNCTION__ . ":" . __LINE__ . ": Import XSLT Pass 1 succeeded, XHTML output fragment = " .
    // @codingStandardsIgnoreLine     str_replace("\n", "", substr($xsltoutput, 0, 200)), DEBUG_WORDIMPORT);

    // Superfluous namespace declarations to strip after each pass; replacements are applied in order.
    $namespacesearch = array(
        '<p xmlns="http://www.w3.org/1999/xhtml"',
        '<span xmlns="http://www.w3.org/1999/xhtml"',
        ' xmlns=""',
    );
    $namespacereplace = array('<p', '<span', '');

    // Write output of Pass 1 to a temporary file, for use in Pass 2.
    $tempxhtmlfilename = $tempbasename . ".if1";
    $xsltoutput = str_replace($namespacesearch, $namespacereplace, $xsltoutput);
    if (file_put_contents($tempxhtmlfilename, $xsltoutput) === false) {
        // Cannot save the file.
        throw new moodle_exception('cannotsavefile', 'error', '', $tempxhtmlfilename);
    }

    // Pass 2 - tidy up linear XHTML a bit.
    if (!($xsltoutput = xslt_process($xsltproc, $tempxhtmlfilename, $word2xmlstylesheet2, null, null, $parameters))) {
        // Transformation failed.
        atto_wordimport_debug_unlink($tempxhtmlfilename);
        throw new moodle_exception('transformationfailed', 'atto_wordimport', '', $tempxhtmlfilename);
    }
    atto_wordimport_debug_unlink($tempxhtmlfilename);

    // Strip out superfluous namespace declarations on paragraph elements, which Moodle 2.7+ on Windows seems to throw in.
    $xsltoutput = str_replace($namespacesearch, $namespacereplace, $xsltoutput);

    // Remove 'mml:' prefix from child MathML element and attributes for compatibility with MathJax.
    // The order matters: the bare <math> tag only exists once the prefix and its declaration are gone.
    $xsltoutput = str_replace(
        array(
            '<mml:',
            '</mml:',
            ' mathvariant="normal"',
            ' xmlns:mml="http://www.w3.org/1998/Math/MathML"',
            '<math>',
        ),
        array(
            '<',
            '</',
            '',
            '',
            '<math xmlns="http://www.w3.org/1998/Math/MathML">',
        ),
        $xsltoutput
    );
    // @codingStandardsIgnoreLine debugging(__FUNCTION__ . ":" . __LINE__ . ": Import XSLT Pass 2 succeeded, output = " .
    // @codingStandardsIgnoreLine     str_replace("\n", "", substr($xsltoutput, 500, 2000)), DEBUG_WORDIMPORT);

    // Keep the converted XHTML file for debugging if developer debugging enabled.
    if (DEBUG_WORDIMPORT == DEBUG_DEVELOPER && debugging(null, DEBUG_DEVELOPER)) {
        $tempxhtmlfilename = $tempbasename . ".xhtml";
        file_put_contents($tempxhtmlfilename, $xsltoutput);
    }

    return $xsltoutput;
}
// Locate the uploaded Word file in the current user's draft file area.
// NOTE(review): $itemid and $filename are expected to be set earlier in this script - confirm.
$fs = get_file_storage();
$usercontext = context_user::instance($USER->id);
$file = $fs->get_file($usercontext->id, 'user', 'draft', $itemid, '/', basename($filename));
if (!$file) {
    // File is not readable.
    // Pass the string identifier and its placeholder value, not an already-translated message.
    throw new moodle_exception('errorreadingfile', 'error', '', basename($filename));
}

// Save the uploaded file to a folder so we can process it using the PHP Zip library.
$tmpfilename = $file->copy_content_to_temp();
if (!$tmpfilename) {
    // Cannot save file.
    throw new moodle_exception('errorcreatingfile', 'error', '', basename($filename));
}
// Delete it from the draft file area to avoid possible name-clash messages if it is re-uploaded in the same edit.
$file->delete();

// Convert the Word file into XHTML, store any images, and delete it once we're finished.
$htmltext = atto_wordimport_convert_to_xhtml($tmpfilename, $usercontext->id, $itemid);
atto_wordimport_debug_unlink($tmpfilename);
if (!$htmltext) {
    // Error processing upload file.
    throw new moodle_exception('cannotuploadfile', 'error');
}

// Get the body content only, ignoring any metadata in the head.
$bodytext = atto_wordimport_get_html_body($htmltext);

// Convert the string to JSON-encoded format.
$htmltextjson = json_encode($bodytext);
// json_encode() signals failure with false; any other result is valid JSON text.
if ($htmltextjson === false) {
    // Invalid JSON string.
    throw new moodle_exception('invalidjson', 'repository');
}
echo '{"html": ' . $htmltextjson . '}';