/**
 * Runs the crawler for the current settings and merges the crawled files into $FILE.
 *
 * The start URL is $SETTINGS[PSNG_CRAWLER_URL] when it is set and differs from
 * $SETTINGS[PSNG_WEBSITE]; otherwise $SETTINGS[PSNG_WEBSITE] is used.
 *
 * If the crawler stops before it has finished, its todo/done/file lists are
 * stored in $SETTINGS (PSNG_TIMEOUT_TODO, PSNG_TIMEOUT_DONE, PSNG_TIMEOUT_FILE),
 * PSNG_TIMEOUT_ACTION is set to PSNG_TIMEOUT_ACTION_WEBSITE and $FILE is returned
 * unchanged. A later call picks the crawl up from that stored state.
 *
 * TODO update to new stuff
 *
 * @param array $FILE        already known file entries, keyed by file URL
 * @param mixed $FILES_CACHE cache data handed through to handleURLCached()
 *
 * @return array $FILE, extended/updated with one entry per crawled file
 *               (keyed by the entry's PSNG_FILE_URL value)
 */
function runCrawler($FILE, $FILES_CACHE)
{
    global $SETTINGS;

    $urlToCrawl = isset($SETTINGS[PSNG_CRAWLER_URL]) && $SETTINGS[PSNG_CRAWLER_URL] != $SETTINGS[PSNG_WEBSITE]
        ? $SETTINGS[PSNG_CRAWLER_URL]
        : $SETTINGS[PSNG_WEBSITE];

    // parse_url() omits the 'path' key when the URL has no path component and
    // returns false for seriously malformed URLs, so read the path defensively.
    $url = parse_url($urlToCrawl);
    $path = (is_array($url) && isset($url['path'])) ? $url['path'] : '';
    if (substr($urlToCrawl, -1) != '/' && $path == '') {
        // bare host without trailing slash: crawl from the document root
        $path .= '/';
        $urlToCrawl .= '/';
    }

    // check if we have an already started scan
    debug($SETTINGS[PSNG_TIMEOUT], 'PSNG_TIMEOUT');
    if (isset($SETTINGS[PSNG_TIMEOUT_TODO])) {
        debug($SETTINGS[PSNG_TIMEOUT_TODO], 'PSNG_TIMEOUT_TODO');
    }

    # !!! 'repair' may not be correct mk/2005-11-08
    // we resume only when timeout mode is active and a todo list was stored
    $resumeCrawl = $SETTINGS[PSNG_TIMEOUT] != PSNG_TIMEOUT_NONE && isset($SETTINGS[PSNG_TIMEOUT_TODO]);
    if ($resumeCrawl) {
        debug('', "Running crawler engine from last point");
    }

    $crawler = new Crawler($urlToCrawl, $SETTINGS[PSNG_TIMEOUT_TIME_DEADLINE]);
    if ($resumeCrawl) {
        // continue with the state saved by the previous, unfinished run
        $crawler->setTodo($SETTINGS[PSNG_TIMEOUT_TODO]);
        $crawler->setDone($SETTINGS[PSNG_TIMEOUT_DONE]);
        $crawler->setFiles($SETTINGS[PSNG_TIMEOUT_FILE]);
    } else {
        // we are not in timeout mode, no rerun: start at the entry URL
        $crawler->setTodo(array($urlToCrawl));
    }

    $crawler->setForbiddenKeys($SETTINGS[PSNG_DISALLOW_KEY]);
    $crawler->setForbiddenDirectories($SETTINGS[PSNG_DISALLOW_DIR]);
    $crawler->setForbiddenFiles($SETTINGS[PSNG_DISALLOW_FILE]);

    // Set the directory to forbid the crawler to follow below it
    $crawler->setDirectory($path);

    $crawler->start();

    if (!$crawler->hasFinished()) {
        // store current crawler state so the next call can continue from here
        $SETTINGS[PSNG_TIMEOUT_TODO] = $crawler->getTodo();
        $SETTINGS[PSNG_TIMEOUT_DONE] = $crawler->getDone();
        $SETTINGS[PSNG_TIMEOUT_FILE] = $crawler->getFiles();
        $SETTINGS[PSNG_TIMEOUT_ACTION] = PSNG_TIMEOUT_ACTION_WEBSITE;

        return $FILE;
    }

    while ($crawler->hasNext()) {
        $fileinfo = $crawler->getNext(); // returns an array

        // make sure every field read below exists (unset or null becomes '')
        foreach (array('http_status', 'file', 'lastmod', 'changefreq', 'priority') as $field) {
            if (!isset($fileinfo[$field])) {
                $fileinfo[$field] = '';
            }
        }
        // remember the status now; $fileinfo is replaced by handleURL() below
        $http_status = $fileinfo['http_status'];

        // create and setup valid values
        $fileinfo = handleURL($fileinfo['file'], $fileinfo['lastmod'], $fileinfo['changefreq'], $fileinfo['priority']);
        $fileinfo = handleURLCached($FILES_CACHE, $fileinfo);

        // handle some website specific stuff
        // (loose comparison on purpose: matches both 404 and "404")
        if ($http_status == "404") {
            $fileinfo[PSNG_FILE_ENABLED] = '';
            $fileinfo[PSNG_HTML_STATUS] = 'class="notfound"';
        }
        // info($fileinfo, 'Fileinfo from crawler');

        // handle if the file exists on filesystem and on website
        if (array_key_exists($fileinfo[PSNG_FILE_URL], $FILE)) {
            $fileinfo = handleDoubleEntryFilesystemWebsite($FILE[$fileinfo[PSNG_FILE_URL]], $fileinfo);
        }
        // info($fileinfo, 'Fileinfo after handle double entry');

        // an entry previously marked as filesystem-only is now known from both sources
        $knownFromFilesystem = isset($fileinfo[PSNG_HTML_SOURCE])
            && $fileinfo[PSNG_HTML_SOURCE] == PSNG_HTML_SOURCE_FS;
        $fileinfo[PSNG_HTML_SOURCE] = $knownFromFilesystem
            ? PSNG_HTML_SOURCE_FS_WEBSITE
            : PSNG_HTML_SOURCE_WEBSITE;

        $FILE[$fileinfo[PSNG_FILE_URL]] = $fileinfo;
    }

    // crawl completed: nothing left to resume
    $SETTINGS[PSNG_TIMEOUT_ACTION] = '';

    return $FILE;
}