/**
 * Smoke test for the crawler: logs in, takes the first collection of the
 * account and runs a crawl capped at 10 pages.
 *
 * Makes no assertions; it only exercises Crawler::start() end to end.
 */
public function testCrawl()
{
    $account = Account::login("searchzen.org", "test");
    $firstCollection = $account->collections[0];

    $crawler = new Crawler($firstCollection);
    $crawler->pageLimit = 10; // keep the run short
    $crawler->start();
}
}
      foreach ($links as $link) {
        $this->processLink($link);
      }
    }

    // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
    $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
    Applog::log("Sleeping for {$sleepTime} seconds");
    sleep($sleepTime);
  }
}

  /**
   * Seeds the Link table with every URL from the 'crawler.whiteList' config
   * entry, then enters the crawl loop.
   *
   * Seed URLs are not discovered on any crawled page, so they are stored with
   * a crawledPageId of 0.
   */
  function start() {
    Applog::log("Crawler started");

    foreach (Config::get('crawler.whiteList') as $startUrl) {
      // Normalize the URL before storing it.
      $startUrl = StringUtil::urlCleanup($startUrl, $this->directoryIndexFile, $this->indexFileExt);
      $rec = StringUtil::parseUtf8Url($startUrl);
      Link::saveLink2DB($startUrl, $rec['host'], 0);
    }

    $this->crawlLoop();
  }
}

/*
 * Start a crawl only when this file is itself the script being executed.
 * The script name is compared exactly (by basename): a substring test would
 * also fire for any script whose name merely contains "Crawler.php"
 * (e.g. "AbstractCrawler.php") and that happens to load this file.
 */
if (basename($_SERVER['SCRIPT_NAME']) === 'Crawler.php') {
  $obj = new Crawler();
  $obj->start();
}
/**
 * Runs a crawl for the given domain.
 *
 * The domain is first mapped through getAccountId(); the resulting value is
 * what the Crawler is constructed with.
 *
 * @param mixed $domain Domain to crawl, passed as-is to getAccountId().
 */
public function crawl($domain)
{
    $accountId = $this->getAccountId($domain);

    $crawler = new Crawler($accountId);
    $crawler->start();
}
<?php
/**
 * Author: Will Smelser
 * Date: 1/10/14
 * Time: 11:09 PM
 * Project: openProjects
 *
 * Manual driver script: builds a Crawler with hard-coded local URLs, runs it
 * and dumps whatever start() returns.
 */
error_reporting(E_ALL);

// require (not include): without the class file nothing below can work, so
// fail right here with a clear "failed opening required" error instead of a
// warning followed by a class-not-found fatal on the next statement.
require 'required/class/Crawler.php';

// The first two arguments are the site URL and the URL of the Links loader.
// The meaning of the remaining positional arguments is defined by Crawler's
// constructor, which is not part of this script.
$crawler = new Crawler(
    'http://simple-seo-api.local',
    'http://openprojects.local/crawler/required/loaders/Links.php',
    true,
    999,
    10,
    10,
    30
);

$result = $crawler->start();
var_dump($result);
/**
 * Runs the crawler for the current settings and merges what it found into $FILE.
 *
 * The URL to crawl is $SETTINGS[PSNG_CRAWLER_URL] when it is set and differs
 * from $SETTINGS[PSNG_WEBSITE]; otherwise it is $SETTINGS[PSNG_WEBSITE].
 *
 * If the crawler stops before it has finished, its todo/done/files state is
 * saved into $SETTINGS so a later call can resume, and $FILE is returned
 * unchanged. If it finishes, every crawled file is normalized, merged with a
 * possibly existing entry for the same URL and stored in $FILE.
 *
 * TODO update to new stuff
 *
 * @param array $FILE        Known files, keyed by the value stored under PSNG_FILE_URL.
 * @param mixed $FILES_CACHE Cache data, handed through to handleURLCached().
 * @return array $FILE, extended/updated with the crawled files.
 */
function runCrawler($FILE, $FILES_CACHE) {
    global $SETTINGS, $LAYOUT;

    $urlToCrawl = isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE]
        ? $SETTINGS[PSNG_CRAWLER_URL]
        : $SETTINGS[PSNG_WEBSITE];

    // parse_url() leaves out components that are absent from the URL (and
    // returns false for a malformed one), so 'path' must not be read blindly:
    // "http://example.com" has no 'path' key at all.
    $url = parse_url($urlToCrawl);
    $path = (is_array($url) && isset($url['path'])) ? $url['path'] : '';

    // A URL without any path is crawled from its root.
    if (substr($urlToCrawl, -1) != '/' && $path == '') {
        $path .= '/';
        $urlToCrawl .= '/';
    }

    // check if we have a already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }

    # !!! 'repair' may not be correct mk/2005-11-08
    if ($SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        // we're running in TIMEOUT mode: continue from the saved state
        debug('', "Running crawler engine from last point");
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // we are not in timeout mode, no rerun: start from the crawl URL
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo(array($urlToCrawl));
    }

    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);

    // Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);

    $crawler->start();

    if (!$crawler->hasFinished()) {
        // store current data into $SETTINGS so the scan can be resumed
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;

        return $FILE;
    }

    // Fields read from each crawler result; missing (or null) ones become ''.
    $expectedKeys = array('http_status', 'file', 'lastmod', 'changefreq', 'priority');

    while ($crawler->hasNext()) {
        $fileinfo = $crawler->getNext(); // returns an array

        foreach ($expectedKeys as $key) {
            if (!isset($fileinfo[$key])) {
                $fileinfo[$key] = '';
            }
        }

        // Remember the status now: $fileinfo is replaced right below.
        $http_status = $fileinfo['http_status'];

        // create and setup valid values
        $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
        $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);

        // handle some website specific stuff: pages that were not found are
        // disabled and flagged for display (loose comparison kept on purpose,
        // the status may be an int or a string)
        if ($http_status == "404") {
            $fileinfo[PSNG_FILE_ENABLED] = '';
            $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
        }

        // handle if the file exists on filesystem and on website
        if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
            $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
        }

        // Tag the origin: an entry already known from the filesystem becomes
        // "filesystem + website", everything else is "website".
        if (isset($fileinfo[PSNG_HTML_SOURCE]) && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS) {
            $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_FS_WEBSITE;
        } else {
            $fileinfo[PSNG_HTML_SOURCE] = PSNG_HTML_SOURCE_WEBSITE;
        }

        $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
    }

    // The crawl is complete, no pending timeout action remains.
    $SETTINGS[PSNG_TIMEOUT_ACTION] = '';

    return $FILE;
}