/**
 * Scrape the given topic url for all the posts in a topic and return as xml.
 *
 * Fetches $link with Scraper, passes the fetched HTML to
 * GoogleGroupsTopicScraper, and serialises every returned post as a
 * <post idx="N"> element, where N counts from 0 in iteration order.
 *
 * Text fields are XML-escaped so a raw "&" or "<" in scraped data cannot
 * produce malformed XML; sequences that already are valid XML entities are
 * left untouched so pre-escaped values are not double-encoded.
 *
 * @param string $link URL of the topic page to scrape.
 *
 * @return string The concatenated <post> elements, or an empty string when
 *                the topic scraper does not return an array (an error line
 *                is printed in that case).
 */
private function scrape_topic($link)
{
    $scraper = new Scraper($link);
    $scraper->run();

    $topic_scraper = new GoogleGroupsTopicScraper($scraper->html);
    $topic = $topic_scraper->run();

    if (!is_array($topic)) {
        print "ERROR: bad topic (url={$link})\n";

        return '';
    }

    // Read a field from a post, treating a missing key as an empty string.
    $field_of = function ($detail, $key) {
        return isset($detail[$key]) ? (string) $detail[$key] : '';
    };

    // Escape a value for use as XML element text. double_encode is disabled
    // so output that was already well-formed stays byte-identical.
    $escape = function ($value) {
        return htmlspecialchars($value, ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE, 'UTF-8', false);
    };

    $xml = '';
    $i = 0;

    foreach ($topic as $detail) {
        $xml .= " <post idx=\"{$i}\">\n";

        foreach (array('author', 'email', 'date', 'timestamp') as $tag) {
            $xml .= " <{$tag}>" . $escape($field_of($detail, $tag)) . "</{$tag}>\n";
        }

        // "]]>" would terminate the CDATA section early; split it across two
        // adjacent sections so the parsed text is unchanged.
        $body = str_replace(']]>', ']]]]><![CDATA[>', $field_of($detail, 'body'));

        $xml .= " <body>\n";
        $xml .= "<![CDATA[\n" . $body . "\n]]>\n";
        $xml .= " </body>\n";
        $xml .= " </post>\n";

        $i++;
    }

    return $xml;
}
// curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        // curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, true);
        // Bound the redirect chain that FOLLOWLOCATION is allowed to follow.
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        $this->result = curl_exec($ch);
        curl_close($ch);
    }

    /**
     * Fetch $this->url with file_get_contents() and store whatever it
     * returns (false on failure, per file_get_contents()) in $this->result.
     */
    private function exec_FGC()
    {
        $this->result = file_get_contents($this->url);
    }

    /**
     * Perform the fetch: via cURL when CURL_ENABLED is truthy, otherwise via
     * file_get_contents(). The outcome is left in $this->result.
     */
    public function run()
    {
        if (CURL_ENABLED) {
            $this->exec_CURL();
        } else {
            $this->exec_FGC();
        }
    }
}

// Demo entry point: fetch the page named by the "url" query parameter and
// display the raw response as preformatted text.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));

if ($scheme !== 'http' && $scheme !== 'https') {
    // The value comes straight from the request; refusing anything that is
    // not an absolute http(s) URL keeps it from naming local files or PHP
    // stream wrappers when the fetch goes through file_get_contents().
    http_response_code(400);
    echo 'ERROR: the "url" parameter must be an absolute http or https URL';
} else {
    $scraper = new Scraper($url);
    $scraper->run();

    // Escape the fetched content so remote markup is shown as text instead
    // of being interpreted by the browser.
    echo '<pre>';
    echo htmlspecialchars(print_r($scraper->result, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo '</pre>';
}