PageParser PHP Code Examples

Example #1

0

Show file

File: run.php Project: justangel/pageparser

 /**
  * Exercises PageParser::DOMFind() followed by save() on nested <div> markup.
  *
  * The fixture wraps two <div> children (one of them nested a level deeper)
  * inside <div id='main'>, followed by a sibling <div> outside of it. The
  * expected string is the markup between the opening <div id='main'> tag and
  * its matching closing tag; both sides are trimmed before comparison.
  *
  * NOTE(review): the three DOMFind() patterns look like "opening tag",
  * "closing tag" and "nested opening tag" matchers — confirm against
  * PageParser. save() presumably overwrites its argument by reference, since
  * the value is initialised as an array but trimmed as a string — verify.
  */
 public function testDOMFindMethod()
 {
     // Input document handed to the parser.
     $markup = "\n\t\t\t<div id='main'>\n\t\t\t\t<div>\n\t\t\t\t\t<div>lala</div>\n\t\t\t\t</div>\n\t\t\t\t<div>lala</div>\n\t\t\t</div>\n\t\t\t<div>lala</div>\n\t\t";
     // Markup the selection is expected to contain.
     $expected = "\n\t\t\t\t<div>\n\t\t\t\t\t<div>lala</div>\n\t\t\t\t</div>\n\t\t\t\t<div>lala</div>\n\t\t";

     $captured = array();
     $parser = new PageParser($markup);

     $selection = $parser->DOMFind("/<div[^<>]*id\\s*=\\s*[\"']?main[\"']?[^<>]*>/", "/<\\/div>/", "/<div[^<>]*>/");
     $selection->save($captured);

     $this->assertEqual(trim($captured), trim($expected));
 }

Example #2

0

Show file

File: Refresh.php Project: jberkel/tool

 /**
  * Fetches the 'MediaWiki:Wsexport_about' page and stores it as 'about.xhtml'.
  *
  * The page is requested through $this->api first. If that throws, a second
  * attempt is made through a fresh Api constructed with an empty string; if
  * that throws as well, the content is treated as empty and nothing is
  * written. Non-empty content is loaded into a DOMDocument, passed through
  * PageParser::getContent(true), serialised, and protocol-relative links
  * (href="//…") are rewritten to href="http://…" before being saved.
  *
  * NOTE(review): the return value of loadXML() is not checked, so malformed
  * content is still handed to PageParser — confirm this is acceptable.
  */
 protected function getAboutXhtmlWikisource()
 {
     $pageName = 'MediaWiki:Wsexport_about';

     try {
         $content = $this->api->getPageAsync($pageName)->wait();
     } catch (Exception $primaryFailure) {
         // Primary API failed: retry through an Api built with an empty argument.
         try {
             $fallbackApi = new Api('');
             $content = $fallbackApi->getPageAsync($pageName)->wait();
         } catch (Exception $fallbackFailure) {
             $content = '';
         }
     }

     // Nothing to store when the page is empty or both lookups failed.
     if ($content === '') {
         return;
     }

     $source = new DOMDocument('1.0', 'UTF-8');
     $source->loadXML($content);

     $parser = new PageParser($source);
     $cleaned = $parser->getContent(true);

     $xhtml = str_replace('href="//', 'href="http://', $cleaned->saveXML());
     $this->setTempFileContent('about.xhtml', $xhtml);
 }

Example #3

0

Show file

File: common.php Project: Cyberspace-Networks/TGPX

/**
 * Downloads a gallery page and gathers scan results about it.
 *
 * The returned array starts from a set of defaults, is merged with the HTTP
 * client's request info and is then filled with: page hash, resolved IP,
 * downloaded byte count, raw HTML and headers, HTTP status, success flag,
 * error string, final URL, reciprocal-link and 2257 flags, link and thumbnail
 * counts, detected format, a randomly chosen preview and the thumbnail list.
 * 'server_match' is set to FALSE as soon as one thumbnail's content host does
 * not match the gallery host.
 *
 * @param array $gallery      Read for 'gallery_url'.
 * @param array $category     Read for 'pics_extensions' and 'movies_extensions'.
 * @param array $whitelisted  Read for 'allow_redirect'.
 * @param bool  $all_images   When no content links were found, fall back to
 *                            every image the parser collected.
 *
 * @return array Scan results (returned by reference). When the download
 *               fails, only the HTTP-related keys and the defaults are set.
 */
function &ScanGallery(&$gallery, &$category, &$whitelisted, $all_images = FALSE)
{
    require_once "{$GLOBALS['BASE_DIR']}/includes/http.class.php";
    require_once "{$GLOBALS['BASE_DIR']}/includes/htmlparser.class.php";
    // Setup default values
    $results = array('thumbnails' => 0, 'links' => 0, 'format' => FMT_PICTURES, 'has_recip' => FALSE, 'has_2257' => FALSE, 'thumbs' => array(), 'server_match' => TRUE);
    // Download the gallery page
    $http = new Http();
    $http_result = $http->Get($gallery['gallery_url'], $whitelisted['allow_redirect']);
    // Record the request results
    $results = array_merge($results, $http->request_info);
    $results['page_hash'] = md5($http->body);
    $results['gallery_ip'] = GetIpFromUrl($http->end_url);
    $results['bytes'] = intval($results['size_download']);
    $results['html'] = $http->body;
    $results['headers'] = trim($http->raw_response_headers);
    $results['status'] = $http->response_headers['status'];
    $results['success'] = $http_result;
    $results['errstr'] = $http->errstr;
    $results['end_url'] = $http->end_url;
    if (!$http_result) {
        return $results;
    }
    // Check if reciprocal link and 2257 code are present
    $results['has_recip'] = CheckReciprocal($http->body);
    $results['has_2257'] = Check2257($http->body);
    // Extract information from the gallery HTML
    $parser = new PageParser($http->end_url, $category['pics_extensions'], $category['movies_extensions']);
    $parser->parse($http->body);
    $results['links'] = $parser->num_links;
    // Pick the thumbnail set to report; null means "leave the defaults alone"
    $thumb_pool = null;
    if ($parser->num_content_links > 0) {
        if ($parser->num_picture_links > $parser->num_movie_links) {
            $results['format'] = FMT_PICTURES;
            $results['thumbnails'] = $parser->num_picture_links;
            $thumb_pool = $parser->thumbs['pictures'];
        } else {
            $results['format'] = FMT_MOVIES;
            $results['thumbnails'] = $parser->num_movie_links;
            $thumb_pool = $parser->thumbs['movies'];
        }
    } else if ($all_images) {
        $results['thumbnails'] = count($parser->images);
        $thumb_pool = $parser->images;
    }
    if ($thumb_pool !== null) {
        // array_rand() rejects an empty array (ValueError on PHP 8, warning
        // before), so only draw a preview when there is something to draw from
        $results['preview'] = empty($thumb_pool) ? null : $thumb_pool[array_rand($thumb_pool)]['full'];
        $results['thumbs'] = array_values($thumb_pool);
    }
    // Check that gallery content is hosted on same server as the gallery itself
    $parsed_gallery_url = parse_url($results['end_url']);
    $gallery_host = is_array($parsed_gallery_url) && isset($parsed_gallery_url['host']) ? $parsed_gallery_url['host'] : '';
    // Quote with the same delimiter the pattern uses so the host cannot terminate it
    $gallery_host_pattern = '~' . preg_quote(preg_replace('~^www\\.~', '', $gallery_host), '~') . '~';
    // NOTE(review): this is an unanchored substring match, so a content host that
    // merely contains the gallery host also passes — confirm that is intended
    foreach ($results['thumbs'] as $thumb) {
        $parsed_content_url = parse_url($thumb['content']);
        // A URL without a host (or an unparsable one) is compared as an empty string
        $content_host = is_array($parsed_content_url) && isset($parsed_content_url['host']) ? $parsed_content_url['host'] : '';
        if (!preg_match($gallery_host_pattern, $content_host)) {
            $results['server_match'] = FALSE;
            break;
        }
    }
    $parser->Cleanup();
    unset($parser);
    $http->Cleanup();
    unset($http);
    return $results;
}

Example #4

0

Show file

File: BookProvider.php Project: jberkel/tool

 /**
  * Builds a Book for the given page: metadata, content, chapters, pictures
  * and (optionally) credits.
  *
  * Metadata is read from the page named by the 'ws-metadata' field when that
  * field is non-empty, otherwise from $doc itself. Unless $isMetadata is set,
  * the page content, its chapters and one level of chapter sub-pages are
  * collected as well; any page, chapter or sub-page flagged 'ws-noinclude'
  * is left out.
  *
  * @param string      $title      Title of the page the book is built from.
  * @param bool        $isMetadata When true, only metadata (and the cover, if
  *                                any) is collected; content and chapters are
  *                                skipped.
  * @param DOMDocument $doc        Parsed document of the page.
  *
  * @return Book
  */
 public function getMetadata($title, $isMetadata, DOMDocument $doc)
 {
     $page_list = [$title];
     $parser = new PageParser($doc);
     $book = new Book();
     $book->options = $this->options;
     $book->title = $title;
     $book->lang = $this->api->lang;
     // Metadata may live on a separate page referenced by 'ws-metadata'
     $metadataSrc = $parser->getMetadata('ws-metadata');
     if ($metadataSrc == '') {
         $metadataSrc = $title;
         $metadataParser = $parser;
     } else {
         $doc = $this->getDocument($metadataSrc);
         $metadataParser = new PageParser($doc);
     }
     $book->type = $metadataParser->getMetadata('ws-type');
     $book->name = htmlspecialchars($metadataParser->getMetadata('ws-title'));
     if ($book->name == '') {
         // No explicit title: derive one from the metadata page name
         $book->name = str_replace('_', ' ', $metadataSrc);
     }
     $book->periodical = htmlspecialchars($metadataParser->getMetadata('ws-periodical'));
     $book->author = htmlspecialchars($metadataParser->getMetadata('ws-author'));
     $book->translator = htmlspecialchars($metadataParser->getMetadata('ws-translator'));
     $book->illustrator = htmlspecialchars($metadataParser->getMetadata('ws-illustrator'));
     $book->school = htmlspecialchars($metadataParser->getMetadata('ws-school'));
     $book->publisher = htmlspecialchars($metadataParser->getMetadata('ws-publisher'));
     $book->year = htmlspecialchars($metadataParser->getMetadata('ws-year'));
     $book->place = htmlspecialchars($metadataParser->getMetadata('ws-place'));
     $book->key = $metadataParser->getMetadata('ws-key');
     $book->progress = $metadataParser->getMetadata('ws-progress');
     $book->volume = $metadataParser->getMetadata('ws-volume');
     $book->scan = str_replace(' ', '_', $metadataParser->getMetadata('ws-scan'));
     $pictures = [];
     if ($this->options['images'] || $isMetadata) {
         $book->cover = $metadataParser->getMetadata('ws-cover');
         if ($book->cover != '') {
             $pictures[$book->cover] = $this->getCover($book->cover, $book->lang);
             // Drop the cover reference when no URL could be resolved for it
             if ($pictures[$book->cover]->url == '') {
                 $book->cover = '';
             }
         }
     }
     if ($this->options['categories']) {
         $book->categories = $this->getCategories($metadataSrc);
     }
     $pageTitles = $parser->getPagesList();
     $namespaces = $this->getNamespaces();
     if (!$isMetadata) {
         if (!$parser->metadataIsSet('ws-noinclude')) {
             $book->content = $parser->getContent(true);
             if ($this->options['images']) {
                 $pictures = array_merge($pictures, $parser->getPicturesList());
             }
         }
         $chapterTitles = $parser->getFullChaptersList($title, $page_list, $namespaces);
         $chapters = $this->getPages($chapterTitles);
         foreach ($chapters as $chapter_key => $chapter) {
             $chapterParser = new PageParser($chapter->content);
             if ($chapterParser->metadataIsSet('ws-noinclude')) {
                 unset($chapters[$chapter_key]);
                 continue;
             }
             $pageTitles = array_merge($pageTitles, $chapterParser->getPagesList());
             $chapter->content = $chapterParser->getContent(false);
             if ($this->options['images']) {
                 $pictures = array_merge($pictures, $chapterParser->getPicturesList());
             }
             $subpagesTitles = $chapterParser->getChaptersList($chapter, $page_list, $namespaces);
             if (!empty($subpagesTitles)) {
                 $subpages = $this->getPages($subpagesTitles);
                 foreach ($subpages as $subpage_key => $subpage) {
                     $subpageParser = new PageParser($subpage->content);
                     if ($subpageParser->metadataIsSet('ws-noinclude')) {
                         // Remove the sub-page itself; the key belongs to
                         // $subpages, not to the outer $chapters list
                         unset($subpages[$subpage_key]);
                         continue;
                     }
                     $pageTitles = array_merge($pageTitles, $subpageParser->getPagesList());
                     $subpage->content = $subpageParser->getContent(false);
                     if ($this->options['images']) {
                         $pictures = array_merge($pictures, $subpageParser->getPicturesList());
                     }
                 }
                 $chapterTitles = array_merge($chapterTitles, $subpagesTitles);
                 $chapter->chapters = $subpages;
             }
         }
         $book->chapters = $chapters;
         // Credits are started before the picture data is fetched and
         // collected afterwards
         $creditPromises = [];
         if ($this->options['credits']) {
             $creditPromises = $this->startCredits($book, $chapterTitles, $pageTitles, $pictures);
         }
         $pictures = $this->getPicturesData($pictures);
         if (!empty($creditPromises)) {
             $book->credits = $this->finishCredit($creditPromises);
         }
     }
     $book->pictures = $pictures;
     return $book;
 }

Example #5

0

Show file

File: ajax.php Project: hackingman/LinkX

/**
 * Extract the site title, description and keywords from a page's HTML and
 * echo them as a JSON response.
 *
 * The URL is taken from $_REQUEST['url'] and scanned with ScanLink(). When
 * the site is reported as working, its HTML is parsed and each extracted
 * field is converted to ISO-8859-1 from its detected encoding, entity-decoded
 * and trimmed (keywords are additionally passed through FormatKeywords()).
 * Otherwise a JSON_FAILURE status is echoed.
 *
 * A field whose encoding cannot be detected is emitted unconverted instead of
 * passing FALSE to mb_convert_encoding().
 */
function lxExtractSiteInfo()
{
    global $json, $DB, $C;
    require_once "{$GLOBALS['BASE_DIR']}/includes/htmlparser.class.php";
    $link = array('site_url' => $_REQUEST['url'], 'allow_redirect' => TRUE, 'recip_url' => null);
    $result = ScanLink($link);
    if (!$result['site_url']['working']) {
        echo $json->encode(array('status' => JSON_FAILURE));
        return;
    }
    $parser = new PageParser();
    $parser->parse($result['site_url']['html']);
    $extracted = array('title' => $parser->title, 'description' => $parser->description, 'keywords' => $parser->keywords);
    foreach ($extracted as $field => $text) {
        $text = (string) $text;
        // mb_detect_encoding() returns FALSE when no candidate encoding fits;
        // that value is not a valid source encoding for mb_convert_encoding()
        $source_encoding = mb_detect_encoding($text, 'auto');
        if ($source_encoding !== false) {
            $text = mb_convert_encoding($text, 'ISO-8859-1', $source_encoding);
        }
        $extracted[$field] = $text;
    }
    echo $json->encode(array(
        'status' => JSON_SUCCESS,
        'title' => html_entity_decode(trim($extracted['title'])),
        'description' => html_entity_decode(trim($extracted['description'])),
        'keywords' => trim(FormatKeywords(html_entity_decode($extracted['keywords']))),
    ));
}

Example #6

0

Show file

File: scanner.php Project: hackingman/LinkX

             $exception = $exceptions['broken'];
         }
     } else {
         $exception = $exceptions['connect'];
     }
 } else {
     // No reciprocal link found
     if ($link['recip_required'] && !$scan_result['site_url']['has_recip'] && !$scan_result['recip_url']['has_recip']) {
         $exception |= $exceptions['norecip'];
     }
     // Check the blacklist
     if (($blacklisted = CheckBlacklistLink($link)) !== FALSE) {
         $exception |= $exceptions['blacklist'];
         $scan_result['blacklist_item'] = $blacklisted[0]['match'];
     }
     $parser = new PageParser();
     $parser->parse($scan_result['site_url']['html']);
     $extracted_title = html_entity_decode(trim($parser->title));
     $extracted_description = html_entity_decode(trim($parser->description));
     $extracted_keywords = trim(FormatKeywords(html_entity_decode($parser->keywords)));
     if ($configuration['process_get_title'] && IsEmptyString($link['title'])) {
         $updates['placeholders'][] = '#=?';
         $updates['binds'][] = 'title';
         $updates['binds'][] = $extracted_title;
     }
     if ($configuration['process_get_description'] && IsEmptyString($link['description'])) {
         $updates['placeholders'][] = '#=?';
         $updates['binds'][] = 'description';
         $updates['binds'][] = $extracted_description;
     }
     if ($configuration['process_get_keywords'] && IsEmptyString($link['keywords'])) {

PHP PageParser Examples