/**
 * Crawl a hierarchy of outgoing links and merge the leaf pages into one DOM.
 *
 * `$this->xpathOutgoing` holds a comma-separated list of XPath expressions,
 * one per hierarchy level. Starting from `$session->url`, every page of the
 * current level is fetched and parsed, the level's XPath is evaluated on it,
 * and the text content of each matching node is taken as a URL for the next
 * level. After the last level, the `<body>` of every resulting page is
 * imported into `$session->dom` and appended to its document element.
 *
 * @param Session $session Session of the entry page; its `dom` receives the
 *                         concatenated bodies.
 *
 * @return void
 */
public function parse(Session $session)
{
    if (!$this->xpathOutgoing) {
        Utils::throw400("Must set 'xpathOutgoing'");
    }

    // One XPath per hierarchy level, ordered from the entry page downwards.
    $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);

    // Let the parent parser run first so that $session has a DOM to merge into.
    parent::parse($session);

    // URLs to visit on the current hierarchy level.
    $crawlUrls = array($session->url);

    // preg_split() returns a 0-based list, so the key is the level number.
    foreach ($xpathOutgoingList as $level => $thisLevelXpath) {
        $nextLevelUrls = array();

        foreach ($crawlUrls as $url) {
            $subsession = $this->fetchAndParseSubsession($url);

            // NOTE(review): Utils::throw400() is called both with and without
            // a leading `throw` in this method; the call forms are kept as
            // they were because Utils is not visible here - confirm and unify.
            $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
            if ($outLinkNodes === false) {
                throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$level}]");
            }
            if ($outLinkNodes->length === 0) {
                throw Utils::throw400("No results for query '{$thisLevelXpath}' for '{$url}' [Level: {$level}]");
            }

            foreach ($outLinkNodes as $outLinkNode) {
                $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
            }
        }

        $crawlUrls = $nextLevelUrls;
    }

    // Concatenate the <body> elements of all leaf pages into the original document.
    foreach ($crawlUrls as $url) {
        $subsession = $this->fetchAndParseSubsession($url);

        $body = $subsession->dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // A page without a <body> has nothing to contribute; passing
            // null on to importNode() would fail.
            continue;
        }

        $newBody = $session->dom->importNode($body, true);
        $session->dom->documentElement->appendChild($newBody);
    }

    // NOTE(review): looks like a debugging dump to a hard-coded path - confirm
    // whether anything reads this file before removing it.
    $session->dom->save('/tmp/test3.html');
}

/**
 * Fetch a URL with a CachingHttpFetcher and parse it with a plain HTMLParser.
 *
 * @param string $url URL of the page to load.
 *
 * @return Session The session created for $url, after fetch and parse.
 */
private function fetchAndParseSubsession($url)
{
    $subsession = new Session($url);

    $fetcher = new CachingHttpFetcher();
    $fetcher->fetch($subsession);

    // A plain HTMLParser is used here so sub-pages are parsed without crawling.
    $parser = new HTMLParser();
    $parser->parse($subsession);

    return $subsession;
}
/**
 * Fetch the page with the `_fb_noscript=1` flag and strip inline scripts.
 *
 * The flag is added to `$session->url` (which is modified in place), the
 * parent fetcher is invoked, and every `<script>...</script>` block is then
 * removed from `$session->bytes`.
 *
 * @param Session $session Session to fetch; `url` and `bytes` are updated.
 *
 * @return void
 */
public function fetch(Session $session)
{
    $flag = '_fb_noscript=1';
    $url = $session->url;

    // Do not add the flag again when the same session is fetched twice.
    if (strpos($url, $flag) === false) {
        // The query string must precede any fragment, so split it off first.
        $fragment = '';
        $hashPos = strpos($url, '#');
        if ($hashPos !== false) {
            $fragment = substr($url, $hashPos);
            $url = substr($url, 0, $hashPos);
        }

        // Use '&' when the URL already carries a query string.
        $separator = (strpos($url, '?') === false) ? '?' : '&';
        $session->url = $url . $separator . $flag . $fragment;
    }

    parent::fetch($session);

    // 's' lets '.' span newlines so multi-line scripts are removed as well;
    // 'i' also catches upper-case <SCRIPT> tags.
    $stripped = preg_replace('/<script.*?<\\/script>/is', '', $session->bytes);
    if ($stripped !== null) {
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
        // in that case the fetched bytes are kept rather than discarded.
        $session->bytes = $stripped;
    }

    // Stripping of the HTML comment markers ('<!--' / '-->') is currently
    // disabled; only scripts are removed.

    // NOTE(review): looks like a debugging dump to a hard-coded path - confirm
    // whether anything reads this file before removing it.
    file_put_contents('/tmp/fb-clean.html', $session->bytes);
}