Exemple #1
0
 /**
  * Runs the crawler against the first collection of the account returned by
  * Account::login(), with the crawler's pageLimit set to 10.
  */
 public function testCrawl()
 {
     $testAccount = Account::login("searchzen.org", "test");
     $crawler = new Crawler($testAccount->collections[0]);
     $crawler->pageLimit = 10;
     $crawler->start();
 }
Exemple #2
0
                }
                foreach ($links as $link) {
                    $this->processLink($link);
                }
            }
            // Sleep until we're guaranteed to have something to crawl, but no less than 1 second.
            $sleepTime = 1 + max(0, min($this->accessTimes) + Config::get('crawler.t_wait') - time());
            Applog::log("Sleeping for {$sleepTime} seconds");
            sleep($sleepTime);
        }
    }
    /**
     * Seeds the Link table with every URL of the configured whitelist and then
     * hands control to crawlLoop().
     */
    function start()
    {
        Applog::log("Crawler started");
        // Save the entire whitelist into the Link table so extraction can begin.
        // These URLs have no page they were discovered from, so crawledPageId is 0.
        $seedUrls = Config::get('crawler.whiteList');
        foreach ($seedUrls as $seedUrl) {
            $cleanUrl = StringUtil::urlCleanup($seedUrl, $this->directoryIndexFile, $this->indexFileExt);
            $urlParts = StringUtil::parseUtf8Url($cleanUrl);
            Link::saveLink2DB($cleanUrl, $urlParts['host'], 0);
        }
        $this->crawlLoop();
    }
}
/*
 * The crawler object is only created (and started) when the requested script
 * name contains 'Crawler.php', i.e. not when this file is merely included.
 */
if (strpos($_SERVER['SCRIPT_NAME'], 'Crawler.php') !== false) {
    $obj = new Crawler();
    $obj->start();
}
Exemple #3
0
 /**
  * Creates a Crawler for the account id looked up from $domain and starts it.
  *
  * @param mixed $domain value passed to getAccountId()
  */
 public function crawl($domain)
 {
     $accountId = $this->getAccountId($domain);
     $crawler = new Crawler($accountId);
     $crawler->start();
 }
Exemple #4
0
<?php

/**
 * Author: Will Smelser
 * Date: 1/10/14
 * Time: 11:09 PM
 * Project: openProjects
 */
error_reporting(E_ALL);
// The script cannot do anything without the Crawler class: use require_once so a
// missing file stops execution right here with a clear fatal error, instead of
// include's warning followed by a "class not found" failure on the next line.
require_once 'required/class/Crawler.php';
// NOTE(review): the meaning of the positional arguments after the two URLs is
// defined by Crawler's constructor, which is not visible from this script.
$crawler = new Crawler('http://simple-seo-api.local', 'http://openprojects.local/crawler/required/loaders/Links.php', true, 999, 10, 10, 30);
$result = $crawler->start();
var_dump($result);
/**
 * Runs the crawler for the current settings and merges what it found into $FILE.
 *
 * Reads the global $SETTINGS array for the URL to crawl, the timeout state and
 * the disallow lists. If the crawler stops before finishing, its todo/done/file
 * state is stored back into $SETTINGS so a later call can resume; otherwise
 * every crawled entry is normalised and written into $FILE keyed by its URL.
 *
 * TODO update to new stuff
 *
 * @param array $FILE        entries keyed by URL; crawler results are merged into it
 * @param mixed $FILES_CACHE passed through unchanged to handleURLCached()
 * @return array $FILE, unchanged if the crawler did not finish, otherwise
 *               extended with the crawled entries
 */
function runCrawler($FILE, $FILES_CACHE)
{
    global $SETTINGS, $LAYOUT;
    $urlToCrawl = isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE] ? $SETTINGS[PSNG_CRAWLER_URL] : $SETTINGS[PSNG_WEBSITE];
    $url = parse_url($urlToCrawl);
    // parse_url() leaves out the 'path' key for URLs such as "http://example.com"
    // (and returns false for seriously malformed URLs), so read it defensively
    // instead of triggering an undefined-index notice on the very case handled below.
    $path = isset($url['path']) ? $url['path'] : '';
    if (substr($urlToCrawl, -1) != '/' && $path == '') {
        $path .= '/';
        $urlToCrawl .= '/';
    }
    // check if we have a already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }
    # !!! 'repair' may not be correct mk/2005-11-08
    // We resume a previous run only in TIMEOUT mode with a stored todo list.
    $resuming = $SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO]);
    if ($resuming) {
        debug('', "Running crawler engine from last point");
    }
    $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
    if ($resuming) {
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // we are not in timeout mode, no rerun
        $crawler->setTodo(array($urlToCrawl));
    }
    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);
    //Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);
    $crawler->start();

    if (!$crawler->hasFinished()) {
        // store current data into session
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;
        return $FILE;
    }

    // Keys every crawler entry is expected to carry; missing (or null) ones become ''.
    $expectedKeys = array('http_status', 'file', 'lastmod', 'changefreq', 'priority');
    while ($crawler->hasNext()) {
        // returns an array
        $fileinfo = $crawler->getNext();
        foreach ($expectedKeys as $key) {
            if (!isset($fileinfo[$key])) {
                $fileinfo[$key] = '';
            }
        }
        // Remember the status now: $fileinfo is replaced by handleURL() below.
        $http_status = $fileinfo['http_status'];
        // create and setup valid values
        $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
        $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);
        // handle some website specific stuff
        if ($http_status == "404") {
            $fileinfo[PSNG_FILE_ENABLED] = '';
            $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
        }
        // handle if the file exists on filesystem and on website
        if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
            $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
        }
        // An entry already tagged as coming from the filesystem is now known from
        // both sources; everything else is tagged as website-only.
        $alsoOnFilesystem = isset($fileinfo[PSNG_HTML_SOURCE]) && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS;
        $fileinfo[PSNG_HTML_SOURCE] = $alsoOnFilesystem ? PSNG_HTML_SOURCE_FS_WEBSITE : PSNG_HTML_SOURCE_WEBSITE;
        $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
    }
    $SETTINGS[PSNG_TIMEOUT_ACTION] = '';
    return $FILE;
}