コード例 #1
0
ファイル: parsing.inc.php プロジェクト: Tipkin-Commons/tipkin
/**
 * Runs the crawler for the current settings and merges what it found into $FILE.
 *
 * The start URL is $SETTINGS[PSNG_CRAWLER_URL] when that entry is set and differs
 * from $SETTINGS[PSNG_WEBSITE]; otherwise $SETTINGS[PSNG_WEBSITE] is used.
 *
 * If the crawler stops before it has finished, its todo/done/files lists are
 * written to $SETTINGS (PSNG_TIMEOUT_TODO / _DONE / _FILE) so that a later call
 * can continue from that point, and $FILE is returned without crawler results.
 * If the crawler finishes, every crawled entry is normalised and stored in
 * $FILE under its PSNG_FILE_URL key.
 *
 * TODO update to new stuff
 *
 * @param array $FILE        entries collected so far, keyed by PSNG_FILE_URL
 * @param mixed $FILES_CACHE cache data, passed through unchanged to handleURLCached()
 *
 * @return array $FILE, extended with the crawler results when the crawl finished
 */
function runCrawler($FILE, $FILES_CACHE)
{
    global $SETTINGS;

    $urlToCrawl = isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE]
        ? $SETTINGS[PSNG_CRAWLER_URL]
        : $SETTINGS[PSNG_WEBSITE];

    // parse_url() leaves out the 'path' key when the URL has no path component
    // (and returns false for seriously malformed URLs), so read it defensively
    // instead of indexing the result directly.
    $url = parse_url($urlToCrawl);
    $path = isset($url['path']) ? $url['path'] : '';

    // A URL without any path gets a trailing slash for both the crawl start
    // and the directory restriction.
    if ($path === '' && substr($urlToCrawl, -1) != '/') {
        $path .= '/';
        $urlToCrawl .= '/';
    }

    // check if we have an already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }

    # !!! 'repair' may not be correct mk/2005-11-08
    if ($SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        // timeout mode with stored state: continue the previous run
        debug('', "Running crawler engine from last point");
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // not in timeout mode (or nothing stored): start a fresh run
        $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
        $crawler->setTodo(array($urlToCrawl));
    }

    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);
    // Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);
    $crawler->start();

    if (!$crawler->hasFinished()) {
        // Crawl is incomplete: keep its state in $SETTINGS for the next call.
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;

        return $FILE;
    }

    // Keys every crawler entry must provide; missing (or null) ones become ''.
    $expectedKeys = array('http_status', 'file', 'lastmod', 'changefreq', 'priority');

    while ($crawler->hasNext()) {
        $fileinfo = $crawler->getNext();
        foreach ($expectedKeys as $key) {
            if (!isset($fileinfo[$key])) {
                $fileinfo[$key] = '';
            }
        }

        // Remember the status now: $fileinfo is replaced by handleURL() below.
        $http_status = $fileinfo['http_status'];

        // create and setup valid values
        $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
        $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);

        // handle some website specific stuff
        if ($http_status == "404") {
            $fileinfo[PSNG_FILE_ENABLED] = '';
            $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
        }

        // handle if the file exists on filesystem and on website
        if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
            $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
        }

        // An entry already marked as filesystem-sourced is now known from both
        // sources; anything else is marked as website-sourced.
        $knownFromFilesystem = isset($fileinfo[PSNG_HTML_SOURCE])
            && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS;
        $fileinfo[PSNG_HTML_SOURCE] = $knownFromFilesystem
            ? PSNG_HTML_SOURCE_FS_WEBSITE
            : PSNG_HTML_SOURCE_WEBSITE;

        $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
    }

    // Crawl completed: clear the pending timeout action.
    $SETTINGS[PSNG_TIMEOUT_ACTION] = '';

    return $FILE;
}