/**
 * Used to extract the title, description and links from
 * a docx file consisting of xml data.
 *
 * Three members of the archive are consulted: docProps/core.xml for the
 * title, word/document.xml for the body text and language, and
 * word/_rels/document.xml.rels for the links. Any member that is absent
 * or cannot be parsed into a DOM is replaced by a fallback value rather
 * than being handed to the extraction helpers.
 *
 * @param string $page docx(zip) contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    $summary = NULL;
    $zip = new PartialZipArchive($page);

    // Try to get the title from the document meta data
    $buf = $zip->getFromName("docProps/core.xml");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::TITLE] = self::title($dom);
        }
    }

    // Body text; $dom stays false if the part is missing or unparseable
    $dom = false;
    $buf = $zip->getFromName("word/document.xml");
    if ($buf) {
        $dom = self::dom($buf);
    }
    if ($dom !== false) {
        $summary[self::DESCRIPTION] = self::docText($dom);
        $summary[self::LANG] = guessLocaleFromString(
            $summary[self::DESCRIPTION], 'en-US');
    } else {
        // Same placeholder is used whether the part was never downloaded
        // or arrived in a state that could not be parsed
        $summary[self::DESCRIPTION] = "Did not download " .
            "word/document.xml portion of docx file";
        $summary[self::LANG] = 'en-US';
    }

    // Links live in the relationships part; default to no links
    $summary[self::LINKS] = array();
    $buf = $zip->getFromName("word/_rels/document.xml.rels");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::LINKS] = self::links($dom, $url);
        }
    }

    return $summary;
}
/**
 * Used to extract the title, description and links from
 * a pptx file consisting of xml data.
 *
 * The title comes from docProps/core.xml and the slide count from
 * docProps/app.xml. Each ppt/slides/slideN.xml is then read in turn to
 * accumulate text, links and the first language found. Parts that are
 * missing or that cannot be parsed into a DOM are skipped.
 *
 * @param string $page pptx(zip) contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    $summary = NULL;
    // Open zip archive
    $zip = new PartialZipArchive($page);

    // Get the title
    $buf = $zip->getFromName("docProps/core.xml");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::TITLE] = self::title($dom);
        }
    }

    // Get number of slides present
    $num_slides = 0;
    $have_slide_count = false;
    $buf = $zip->getFromName("docProps/app.xml");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $num_slides = self::numSlides($dom);
            $have_slide_count = true;
        }
    }
    if (!$have_slide_count) {
        /* go for an upper bound on number of slides
           (might happen on partial download of pptx file, or when
           app.xml could not be parsed)
         */
        $num_slides = $zip->numFiles();
    }

    $summary[self::DESCRIPTION] = "";
    $summary[self::LINKS] = array();
    $lang = NULL;
    for ($i = 1; $i <= $num_slides; $i++) {
        $buf = $zip->getFromName("ppt/slides/slide" . $i . ".xml");
        if (!$buf) {
            continue;
        }
        $dom = self::dom($buf);
        if ($dom === false) {
            // an unparseable slide contributes nothing
            continue;
        }
        /* Get description, language and url links associated
           with each slide
         */
        $description = self::slideText($dom);
        // A slide's text is appended only if all of it fits in the budget
        if (strlen($summary[self::DESCRIPTION]) + strlen($description)
            < self::$max_description_len) {
            $summary[self::DESCRIPTION] .= $description;
        }
        // Keep the first language any slide reports
        if (!$lang) {
            $lang = self::lang($dom);
            if ($lang) {
                $summary[self::LANG] = $lang;
            }
        }
        $summary[self::LINKS] = array_merge($summary[self::LINKS],
            self::links($dom, $url));
    }

    return $summary;
}
/**
 * Used to extract the title, description and links from
 * a xlsx file.
 *
 * docProps/app.xml supplies the title and the sheet count,
 * xl/sharedStrings.xml supplies the description (from which the
 * language is calculated), and each
 * xl/worksheets/_rels/sheetN.xml.rels supplies links.
 *
 * @param string $page contents of xlsx file in zip format
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    $summary = NULL;
    // Open a zip archive
    $zip = new PartialZipArchive($page);
    // Count of the sheets in xlsx
    $file_count = 0;

    // Getting the title and sheet count from xlsx file
    $buf = $zip->getFromName("docProps/app.xml");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::TITLE] = self::title($dom);
            $file_count = self::sheetCount($dom);
        }
    }

    // Getting the description from xlsx file
    $buf = $zip->getFromName("xl/sharedStrings.xml");
    if ($buf) {
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::DESCRIPTION] = self::description($dom);
        }
        /* Getting the language from xlsx file. The description is absent
           when the shared strings could not be parsed; NULL is passed in
           that case instead of reading a key that was never written.
         */
        $lang_sample = isset($summary[self::DESCRIPTION]) ?
            $summary[self::DESCRIPTION] : NULL;
        $summary[self::LANG] = self::calculateLang($lang_sample, $url);
    }

    $summary[self::LINKS] = array();
    // Getting links from each worksheet
    for ($i = 1; $i <= $file_count; $i++) {
        $buf = $zip->getFromName("xl/worksheets/_rels/sheet" . $i .
            ".xml.rels");
        if (!$buf) {
            continue;
        }
        $dom = self::dom($buf);
        if ($dom !== false) {
            $summary[self::LINKS] = array_merge($summary[self::LINKS],
                self::links($dom, $url));
        }
    }

    return $summary;
}
/**
 * Used to extract the title, description and links from
 * a string consisting of ebook publication data.
 *
 * Every member of the archive is visited. Members whose name ends in
 * .opf are parsed for Dublin Core metadata (language, title, creator,
 * identifier, publisher, date, subject). Members whose name ends in
 * .html or .xhtml are run through HtmlProcessor and the 't' entry of
 * the result is appended to the description. The description is the
 * metadata followed by that text, cut to the maximum description length.
 *
 * @param string $page epub contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page
 *
 */
function process($page, $url)
{
    $summary = NULL;
    // The dot is escaped so that only a real file extension matches
    $opf_pattern = '/\.opf$/i';
    $html_pattern = '/\.x?html$/i';
    // NOTE(review): links are always a single empty entry; no links are
    // extracted from the epub here -- confirm callers expect this shape
    $epub_url = array('');
    // Dublin Core element name => value found in the package document
    $meta = array(
        "dc:language" => '',
        "dc:title" => '',
        "dc:creator" => '',
        "dc:identifier" => '',
        "dc:publisher" => '',
        "dc:date" => '',
        "dc:subject" => '',
    );
    $htmlcontent = '';

    // Open a zip archive
    $zip = new PartialZipArchive($page);
    $num_files = $zip->numFiles();
    for ($i = 0; $i < $num_files; $i++) {
        // get the content file name of the .epub document
        $filename = $zip->getNameIndex($i);
        if (preg_match($opf_pattern, $filename)) {
            // Get the file data from zipped folder
            $opf_data = $zip->getFromName($filename);
            if (!$opf_data) {
                continue;
            }
            $opf_summary = $this->xmlToObject($opf_data);
            // Metadata elements are looked for two levels below the root
            for ($m = 0; $m <= MAX_DOM_LEVEL; $m++) {
                for ($n = 0; $n <= MAX_DOM_LEVEL; $n++) {
                    if (!isset($opf_summary->children[$m]->children[$n])) {
                        continue;
                    }
                    $child = $opf_summary->children[$m]->children[$n];
                    if (isset($child->name, $child->content) &&
                        is_string($child->name) &&
                        isset($meta[$child->name])) {
                        // a later occurrence overrides an earlier one
                        $meta[$child->name] = $child->content;
                    }
                }
            }
        } else if (preg_match($html_pattern, $filename)) {
            $html_data = $zip->getFromName($filename);
            if (!$html_data) {
                continue;
            }
            $html = new HtmlProcessor();
            $html_summary = $html->process($html_data, $url);
            // 't' is presumably the title key of the html summary --
            // TODO confirm against HtmlProcessor
            if (isset($html_summary['t'])) {
                $htmlcontent .= $html_summary['t'];
            }
        }
    }

    $epub_title = $meta["dc:title"];
    $epub_author = $meta["dc:creator"];
    $epub_language = $meta["dc:language"];
    $epub_unique_identifier = $meta["dc:identifier"];
    $epub_publisher = $meta["dc:publisher"];
    $epub_date = $meta["dc:date"];
    $epub_subject = $meta["dc:subject"];

    $desc = '';
    if ($epub_title != '') {
        $desc = " {$epub_title} .";
    }
    if ($epub_author != '') {
        $desc .= " {$epub_author} ";
    }
    if ($epub_language != '') {
        $desc .= " {$epub_language} ";
    }
    if ($epub_unique_identifier != '') {
        $desc .= " URN-" . $epub_unique_identifier . ".";
    }
    if ($epub_publisher != '') {
        $desc .= " {$epub_publisher} ";
    }
    if ($epub_date != '') {
        $desc .= " {$epub_date} ";
    }
    if ($epub_subject != '') {
        $desc .= " {$epub_subject} ";
    }
    $desc .= $htmlcontent;

    // restrict the length of the description to maximum description length
    if (strlen($desc) > self::$max_description_len) {
        $desc = substr($desc, 0, self::$max_description_len);
    }

    $summary[self::TITLE] = $epub_title;
    $summary[self::DESCRIPTION] = $desc;
    $summary[self::LANG] = $epub_language;
    $summary[self::LINKS] = $epub_url;
    $summary[self::PAGE] = $page;
    return $summary;
}