/**
 * Exercises PageParser::DOMFind() followed by save().
 *
 * The fixture contains a <div id='main'> wrapping nested <div> elements and
 * a trailing sibling <div>. The test expects the saved text to be the markup
 * found between the opening "main" tag and its matching closing tag
 * (compared after trim()).
 */
public function testDOMFindMethod()
{
    // Input markup handed to the parser.
    $markup = "\n\t\t\t<div id='main'>\n\t\t\t\t<div>\n\t\t\t\t\t<div>lala</div>\n\t\t\t\t</div>\n\t\t\t\t<div>lala</div>\n\t\t\t</div>\n\t\t\t<div>lala</div>\n\t\t";

    // Markup the test expects DOMFind()/save() to produce.
    $expected = "\n\t\t\t\t<div>\n\t\t\t\t\t<div>lala</div>\n\t\t\t\t</div>\n\t\t\t\t<div>lala</div>\n\t\t";

    // The three patterns match, in order: the opening <div ... id=main ...>
    // tag, any closing </div> tag, and any opening <div ...> tag.
    $mainOpenPattern = "/<div[^<>]*id\\s*=\\s*[\"']?main[\"']?[^<>]*>/";
    $divClosePattern = "/<\\/div>/";
    $divOpenPattern = "/<div[^<>]*>/";

    // save() receives the variable that is compared below
    // (presumably filled by reference; confirm against PageParser).
    $captured = array();
    $parser = new PageParser($markup);
    $parser->DOMFind($mainOpenPattern, $divClosePattern, $divOpenPattern)->save($captured);

    $this->assertEqual(trim($captured), trim($expected));
}
/**
 * Fetches the 'MediaWiki:Wsexport_about' page and stores it as the temp
 * file 'about.xhtml'.
 *
 * The page is first requested through $this->api. If that throws, the same
 * page is requested through a fresh Api('') instance. Nothing is written
 * when both requests throw, when the fetched content is the empty string,
 * or when the content cannot be parsed as XML.
 *
 * Before writing, protocol-relative links (href="//...) in the serialized
 * document are rewritten to use http://.
 */
protected function getAboutXhtmlWikisource()
{
    $aboutPage = 'MediaWiki:Wsexport_about';

    try {
        $content = $this->api->getPageAsync($aboutPage)->wait();
    } catch (Exception $primaryFailure) {
        try {
            $fallbackApi = new Api('');
            $content = $fallbackApi->getPageAsync($aboutPage)->wait();
        } catch (Exception $fallbackFailure) {
            // Best effort: without an about page the file is simply not produced.
            $content = '';
        }
    }

    if ($content === '') {
        return;
    }

    $document = new DOMDocument('1.0', 'UTF-8');
    if (!$document->loadXML($content)) {
        // loadXML() returns false on malformed input and leaves the document
        // empty; do not hand an empty document to the parser.
        return;
    }

    $parser = new PageParser($document);
    $document = $parser->getContent(true);

    $this->setTempFileContent(
        'about.xhtml',
        str_replace('href="//', 'href="http://', $document->saveXML())
    );
}
/**
 * Download a gallery page and collect scan information about it.
 *
 * The page at $gallery['gallery_url'] is fetched with the Http class and the
 * body is parsed with PageParser. The returned array starts from a set of
 * defaults, is merged with $http->request_info, and then receives:
 * 'page_hash', 'gallery_ip', 'bytes', 'html', 'headers', 'status',
 * 'success', 'errstr', 'end_url', 'has_recip', 'has_2257', 'links',
 * 'format', 'thumbnails', 'thumbs', 'server_match' and (when thumbnails or
 * images were selected) 'preview'.
 *
 * If the HTTP request fails, the array is returned right after the request
 * details have been recorded; no parsing is attempted.
 *
 * @param array $gallery     Reads 'gallery_url'.
 * @param array $category    Reads 'pics_extensions' and 'movies_extensions'.
 * @param array $whitelisted Reads 'allow_redirect'.
 * @param bool  $all_images  When the parser reports no content links, fall
 *                           back to every image the parser collected.
 * @return array Scan results as described above (returned by reference).
 */
function &ScanGallery(&$gallery, &$category, &$whitelisted, $all_images = FALSE)
{
    require_once "{$GLOBALS['BASE_DIR']}/includes/http.class.php";
    require_once "{$GLOBALS['BASE_DIR']}/includes/htmlparser.class.php";

    // Setup default values
    $results = array('thumbnails' => 0,
                     'links' => 0,
                     'format' => FMT_PICTURES,
                     'has_recip' => FALSE,
                     'has_2257' => FALSE,
                     'thumbs' => array(),
                     'server_match' => TRUE);

    // Download the gallery page
    $http = new Http();
    $http_result = $http->Get($gallery['gallery_url'], $whitelisted['allow_redirect']);

    // Record the request results
    $results = array_merge($results, $http->request_info);
    $results['page_hash'] = md5($http->body);
    $results['gallery_ip'] = GetIpFromUrl($http->end_url);
    $results['bytes'] = intval($results['size_download']);
    $results['html'] = $http->body;
    $results['headers'] = trim($http->raw_response_headers);
    $results['status'] = $http->response_headers['status'];
    $results['success'] = $http_result;
    $results['errstr'] = $http->errstr;
    $results['end_url'] = $http->end_url;

    if (!$http_result) {
        return $results;
    }

    // Check if reciprocal link and 2257 code are present
    $results['has_recip'] = CheckReciprocal($http->body);
    $results['has_2257'] = Check2257($http->body);

    // Extract information from the gallery HTML
    $parser = new PageParser($http->end_url, $category['pics_extensions'], $category['movies_extensions']);
    $parser->parse($http->body);

    $results['links'] = $parser->num_links;

    if ($parser->num_content_links > 0) {
        // The more numerous kind of content link decides the gallery format;
        // a tie is treated as movies.
        if ($parser->num_picture_links > $parser->num_movie_links) {
            $results['format'] = FMT_PICTURES;
            $results['thumbnails'] = $parser->num_picture_links;
            $results['preview'] = $parser->thumbs['pictures'][array_rand($parser->thumbs['pictures'])]['full'];
            $results['thumbs'] = array_values($parser->thumbs['pictures']);
        } else {
            $results['format'] = FMT_MOVIES;
            $results['thumbnails'] = $parser->num_movie_links;
            $results['preview'] = $parser->thumbs['movies'][array_rand($parser->thumbs['movies'])]['full'];
            $results['thumbs'] = array_values($parser->thumbs['movies']);
        }
    } else if ($all_images) {
        $results['thumbnails'] = count($parser->images);
        $results['thumbs'] = array_values($parser->images);

        // array_rand() rejects an empty array (warning on older PHP,
        // ValueError on PHP 8), so only pick a preview when there is one.
        if (empty($parser->images)) {
            $results['preview'] = null;
        } else {
            $results['preview'] = $parser->images[array_rand($parser->images)]['full'];
        }
    }

    // Check that gallery content is hosted on same server as the gallery itself
    $parsed_gallery_url = parse_url($results['end_url']);
    $gallery_host = isset($parsed_gallery_url['host']) ? $parsed_gallery_url['host'] : '';

    // Quote with the same delimiter the pattern uses so a stray '~' in the
    // host cannot terminate the expression early.
    // NOTE(review): the match is an unanchored substring test, so any content
    // host that merely contains the gallery host passes - confirm intended.
    $host_pattern = '~' . preg_quote(preg_replace('~^www\\.~', '', $gallery_host), '~') . '~';

    foreach ($results['thumbs'] as $thumb) {
        $parsed_content_url = parse_url($thumb['content']);

        // parse_url() may return false or omit 'host' (e.g. relative URLs);
        // treat that as an empty host instead of reading a missing index.
        $content_host = isset($parsed_content_url['host']) ? $parsed_content_url['host'] : '';

        if (!preg_match($host_pattern, $content_host)) {
            $results['server_match'] = FALSE;
            break;
        }
    }

    $parser->Cleanup();
    unset($parser);

    $http->Cleanup();
    unset($http);

    return $results;
}
/**
 * Builds a Book object for the page $title from its parsed document.
 *
 * Metadata fields are read from the page itself or, when the page declares
 * a 'ws-metadata' source, from that other page (loaded via getDocument()).
 * Unless $isMetadata is set, the page content, its chapters, their subpages,
 * pictures and (if enabled in the options) credits are gathered as well.
 * Pages, chapters and subpages flagged with 'ws-noinclude' are left out.
 *
 * @param string      $title      Title of the main page of the book.
 * @param bool        $isMetadata When true, only metadata (and the cover) is
 *                                collected; content, chapters and credits
 *                                are skipped.
 * @param DOMDocument $doc        Parsed document of the page $title.
 * @return Book
 */
public function getMetadata($title, $isMetadata, DOMDocument $doc)
{
    $page_list = [$title];
    $parser = new PageParser($doc);

    $book = new Book();
    $book->options = $this->options;
    $book->title = $title;
    $book->lang = $this->api->lang;

    // Metadata may live on a different page than the one being exported.
    $metadataSrc = $parser->getMetadata('ws-metadata');
    if ($metadataSrc == '') {
        $metadataSrc = $title;
        $metadataParser = $parser;
    } else {
        $metadataParser = new PageParser($this->getDocument($metadataSrc));
    }

    $book->type = $metadataParser->getMetadata('ws-type');
    $book->name = htmlspecialchars($metadataParser->getMetadata('ws-title'));
    if ($book->name == '') {
        // No explicit title: fall back to the page name with spaces restored.
        $book->name = str_replace('_', ' ', $metadataSrc);
    }
    $book->periodical = htmlspecialchars($metadataParser->getMetadata('ws-periodical'));
    $book->author = htmlspecialchars($metadataParser->getMetadata('ws-author'));
    $book->translator = htmlspecialchars($metadataParser->getMetadata('ws-translator'));
    $book->illustrator = htmlspecialchars($metadataParser->getMetadata('ws-illustrator'));
    $book->school = htmlspecialchars($metadataParser->getMetadata('ws-school'));
    $book->publisher = htmlspecialchars($metadataParser->getMetadata('ws-publisher'));
    $book->year = htmlspecialchars($metadataParser->getMetadata('ws-year'));
    $book->place = htmlspecialchars($metadataParser->getMetadata('ws-place'));
    $book->key = $metadataParser->getMetadata('ws-key');
    $book->progress = $metadataParser->getMetadata('ws-progress');
    $book->volume = $metadataParser->getMetadata('ws-volume');
    $book->scan = str_replace(' ', '_', $metadataParser->getMetadata('ws-scan'));

    $pictures = [];
    if ($this->options['images'] || $isMetadata) {
        $book->cover = $metadataParser->getMetadata('ws-cover');
        if ($book->cover != '') {
            $pictures[$book->cover] = $this->getCover($book->cover, $book->lang);
            if ($pictures[$book->cover]->url == '') {
                // The cover could not be resolved to a URL: drop the reference.
                $book->cover = '';
            }
        }
    }

    if ($this->options['categories']) {
        $book->categories = $this->getCategories($metadataSrc);
    }

    $pageTitles = $parser->getPagesList();
    $namespaces = $this->getNamespaces();

    if (!$isMetadata) {
        if (!$parser->metadataIsSet('ws-noinclude')) {
            $book->content = $parser->getContent(true);
            if ($this->options['images']) {
                $pictures = array_merge($pictures, $parser->getPicturesList());
            }
        }

        $chapterTitles = $parser->getFullChaptersList($title, $page_list, $namespaces);
        $chapters = $this->getPages($chapterTitles);

        foreach ($chapters as $chapter_key => $chapter) {
            $chapterParser = new PageParser($chapter->content);
            if ($chapterParser->metadataIsSet('ws-noinclude')) {
                unset($chapters[$chapter_key]);
                continue;
            }

            $pageTitles = array_merge($pageTitles, $chapterParser->getPagesList());
            $chapter->content = $chapterParser->getContent(false);
            if ($this->options['images']) {
                $pictures = array_merge($pictures, $chapterParser->getPicturesList());
            }

            $subpagesTitles = $chapterParser->getChaptersList($chapter, $page_list, $namespaces);
            if (empty($subpagesTitles)) {
                continue;
            }

            $subpages = $this->getPages($subpagesTitles);
            foreach ($subpages as $subpage_key => $subpage) {
                $subpageParser = new PageParser($subpage->content);
                if ($subpageParser->metadataIsSet('ws-noinclude')) {
                    // Remove the excluded subpage from its own list. Unsetting
                    // $chapters here would drop an unrelated chapter and still
                    // leave the subpage attached to this chapter.
                    unset($subpages[$subpage_key]);
                    continue;
                }

                $pageTitles = array_merge($pageTitles, $subpageParser->getPagesList());
                $subpage->content = $subpageParser->getContent(false);
                if ($this->options['images']) {
                    $pictures = array_merge($pictures, $subpageParser->getPicturesList());
                }
            }

            $chapterTitles = array_merge($chapterTitles, $subpagesTitles);
            $chapter->chapters = $subpages;
        }
        $book->chapters = $chapters;

        // Credits are started before the picture data is fetched and
        // finished afterwards.
        $creditPromises = [];
        if ($this->options['credits']) {
            $creditPromises = $this->startCredits($book, $chapterTitles, $pageTitles, $pictures);
        }

        $pictures = $this->getPicturesData($pictures);

        if (!empty($creditPromises)) {
            $book->credits = $this->finishCredit($creditPromises);
        }
    }

    $book->pictures = $pictures;

    return $book;
}
/**
 * Extract the site title, description and keywords from a page's HTML and
 * echo them as a JSON response.
 *
 * The URL is taken from $_REQUEST['url'] and scanned with ScanLink(). When
 * the scan reports the site URL as working, its HTML is parsed with
 * PageParser; each extracted field is converted to ISO-8859-1, entity-decoded
 * and trimmed (keywords are additionally passed through FormatKeywords()),
 * then echoed with status JSON_SUCCESS. Otherwise only status JSON_FAILURE
 * is echoed.
 */
function lxExtractSiteInfo()
{
    global $json, $DB, $C;

    require_once "{$GLOBALS['BASE_DIR']}/includes/htmlparser.class.php";

    $link = array('site_url' => isset($_REQUEST['url']) ? $_REQUEST['url'] : null,
                  'allow_redirect' => TRUE,
                  'recip_url' => null);

    $result = ScanLink($link);

    if (!$result['site_url']['working']) {
        echo $json->encode(array('status' => JSON_FAILURE));
        return;
    }

    $parser = new PageParser();
    $parser->parse($result['site_url']['html']);

    $extracted = array('title' => $parser->title,
                       'description' => $parser->description,
                       'keywords' => $parser->keywords);

    foreach ($extracted as $field => $text) {
        // mb_detect_encoding() returns FALSE when it cannot identify the
        // encoding; passing that on as the source encoding is an error
        // (ValueError on PHP 8). In that case the text is left unconverted.
        $source_encoding = mb_detect_encoding($text, 'auto');
        if ($source_encoding !== FALSE) {
            $extracted[$field] = mb_convert_encoding($text, 'ISO-8859-1', $source_encoding);
        }
    }

    echo $json->encode(array('status' => JSON_SUCCESS,
                             'title' => html_entity_decode(trim($extracted['title'])),
                             'description' => html_entity_decode(trim($extracted['description'])),
                             'keywords' => trim(FormatKeywords(html_entity_decode($extracted['keywords'])))));
}
$exception = $exceptions['broken']; } } else { $exception = $exceptions['connect']; } } else { // No reciprocal link found if ($link['recip_required'] && !$scan_result['site_url']['has_recip'] && !$scan_result['recip_url']['has_recip']) { $exception |= $exceptions['norecip']; } // Check the blacklist if (($blacklisted = CheckBlacklistLink($link)) !== FALSE) { $exception |= $exceptions['blacklist']; $scan_result['blacklist_item'] = $blacklisted[0]['match']; } $parser = new PageParser(); $parser->parse($scan_result['site_url']['html']); $extracted_title = html_entity_decode(trim($parser->title)); $extracted_description = html_entity_decode(trim($parser->description)); $extracted_keywords = trim(FormatKeywords(html_entity_decode($parser->keywords))); if ($configuration['process_get_title'] && IsEmptyString($link['title'])) { $updates['placeholders'][] = '#=?'; $updates['binds'][] = 'title'; $updates['binds'][] = $extracted_title; } if ($configuration['process_get_description'] && IsEmptyString($link['description'])) { $updates['placeholders'][] = '#=?'; $updates['binds'][] = 'description'; $updates['binds'][] = $extracted_description; } if ($configuration['process_get_keywords'] && IsEmptyString($link['keywords'])) {