/**
 * Scrape notification pages and index them until the database contains at
 * least one entry for the cut-off date ($daysBack days before now).
 *
 * Nothing is returned. Progress is written to STDOUT, a constant that only
 * exists under the CLI SAPI, so this method is meant to be run from the
 * command line.
 *
 * NOTE(review): the loop has no upper bound on the number of pages; it only
 * stops once entriesInDatabase() reports a non-zero count for the cut-off
 * date. Confirm the remote site always reaches back that far.
 *
 * @param int|float $daysBack Number of days to go back from the current date.
 * @param int|float $delay    Minimum time in seconds that indexing plus sleeping
 *                            takes per page, used to throttle page requests.
 *
 * @return void
 */
private function getAndIndexNotifications($daysBack = 14, $delay = 0.5)
{
    // Cut-off date: now minus $daysBack days.
    $date = new DateTime();
    $date->sub(DateInterval::createFromDateString($daysBack . ' days'));

    // Remove database entries older than the cut-off date (per the original
    // author's note; the helper itself is defined elsewhere in the class).
    $this->deleteEntriesInDatabase($date);

    $scraper = new P2000Scraper("http://www.p2000-online.net/alleregiosf.html");
    $pagesScraped = 0;
    $delayMs = $delay * 1000.0;

    while ($this->entriesInDatabase($date) == 0) {
        $scraper->scrapePage();

        // Timing starts after the page was fetched, so the throttle covers
        // indexing time plus the sleep below.
        $startMs = round(microtime(true) * 1000);
        $this->indexNotifications($scraper->getRawNotifications());
        $elapsedMs = round(microtime(true) * 1000) - $startMs;

        if ($elapsedMs < $delayMs) {
            // usleep() expects an integer number of microseconds; cast
            // explicitly instead of relying on implicit float-to-int
            // conversion (deprecated for fractional values since PHP 8.1).
            usleep((int) (($delay - $elapsedMs / 1000.0) * 1000000));
        }
        $totalMs = round(microtime(true) * 1000) - $startMs;

        $scraper->clearRawNotifications();
        $scraper->loadNextPage();
        $pagesScraped++;

        fwrite(STDOUT, "\n\tScraped " . $pagesScraped . " pages - Time elapsed: " . $totalMs . "[ms]\n");
        $amount = $this->entriesInDatabase($date);
        fwrite(STDOUT, $amount . " pages indexed of date: " . $date->format('d-m-Y') . "\n");
    }
}
<?php

// Demo script: scrape a few pages and show the first raw notification
// (HTML-escaped) together with the total number of notifications collected.
include_once 'P2000Scraper.php';

$Scraper = new P2000Scraper("http://www.p2000-online.net/alleregiosf.html");
// Arguments are passed through unchanged; presumably (page count, delay in
// seconds) - confirm against P2000Scraper::scrapePages().
$Scraper->scrapePages(10, 60 / 100.0);

// Fetch the list once and guard the first element: when nothing was scraped
// the original indexed [0] unconditionally, raising an undefined-offset
// warning and passing null to htmlspecialchars().
$notifications = $Scraper->getRawNotifications();
$firstNotification = isset($notifications[0]) ? $notifications[0] : '';

echo htmlspecialchars($firstNotification);
echo "<br/>";
echo "Count: " . count($notifications);