/**
 * Perform one cron 'tick' of crawl processing
 *
 * Has limits on both how many urls to crawl
 * and a soft time limit on the total crawl time.
 */
function local_linkchecker_robot_crawl($verbose = false)
{
    global $CFG, $DB;
    $robot = new \local_linkchecker_robot\robot\crawler();
    $config = $robot::get_config();
    $crawlstart = $config->crawlstart;
    $crawlend = $config->crawlend;
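    // Both values are unix timestamps; a crawl is considered in progress
    // while crawlstart is more recent than crawlend.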
    // If there is no crawl in progress, start a new one and push the seed
    // url into the crawl queue; otherwise pick up the history record of
    // the crawl that is already under way.
    if (!$crawlstart || $crawlstart <= $crawlend) {
        $start = time();
        set_config('crawlstart', $start, 'local_linkchecker_robot');
        $robot->mark_for_crawl($CFG->wwwroot . '/', $config->seedurl);
        // Create a new history record.
        $history = new stdClass();
        $history->startcrawl = $start;
        $history->urls = 0;
        $history->links = 0;
        $history->broken = 0;
        $history->oversize = 0;
        $history->cronticks = 0;
        $history->id = $DB->insert_record('linkchecker_history', $history);
    } else {
        $history = $DB->get_record('linkchecker_history', array('startcrawl' => $crawlstart));
    }
    // While we are not exceeding the maxcron time, and the queue is not empty
    // find the next url in the queue and crawl it.
    // If the queue is empty then mark the crawl as ended.
    $cronstart = time();
    $cronstop = $cronstart + $config->maxcrontime;
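    // This is a soft limit: the clock is only checked between queue items,
    // so a slow url can push the tick slightly past $cronstop.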
    $hasmore = true;
    $hastime = true;
    while ($hasmore && $hastime) {
        $hasmore = $robot->process_queue($verbose);
        $hastime = time() < $cronstop;
        set_config('crawltick', time(), 'local_linkchecker_robot');
    }
    if ($hastime) {
        // Time left over, which means the queue is empty!
        // Mark the crawl as ended.
        $history->endcrawl = time();
        set_config('crawlend', time(), 'local_linkchecker_robot');
    }
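    // Update the running totals on every tick, not only when the crawl ends,
    // so the history record reflects progress mid-crawl.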
    $history->urls = $robot->get_processed();
    $history->links = $robot->get_num_links();
    $history->broken = $robot->get_num_broken_urls();
    $history->oversize = $robot->get_num_oversize();
    $history->cronticks++;
    $DB->update_record('linkchecker_history', $history);
}
/**
 * CLI script to crawl a single url as the robot and parse it.
 *
 * @copyright  Catalyst IT
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
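// CLI_SCRIPT must be defined before config.php is included so that Moodle
// treats this as a command line script.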
define('CLI_SCRIPT', true);
require dirname(dirname(dirname(dirname(__FILE__)))) . '/config.php';
require_once $CFG->libdir . '/clilib.php';
require_once $CFG->dirroot . '/local/linkchecker_robot/lib.php';
list($options, $unrecognized) = cli_get_params(array('help' => false, 'url' => null), array('h' => 'help', 'u' => 'url'));
if ($unrecognized) {
    $unrecognized = implode("\n  ", $unrecognized);
    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
}
$help = "Crawl a url as the robot and parse it.

Useful for when a page has been corrected and you want to instantly reflect this.

Options:
-h, --help      Print out this help
-u, --url       Url to crawl and process

Example:
\$ sudo -u www-data php crawl-as.php --url=http://ford.com/
";
if ($options['help']) {
    echo $help;
    die;
}
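// Create the robot and bail out early if it is not in a valid state to crawl.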
$robot = new \local_linkchecker_robot\robot\crawler();
$error = $robot->is_bot_valid();
if ($error) {
    print "Error: {$error}\n";
    exit(1);
}
if (!$options['url']) {
    echo $help;
    die;
}
$url = $options['url'];
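// Queue the url as if the robot had discovered it, fetch it, then crawl
// and parse the result.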
$node = $robot->mark_for_crawl($url, $url);
$res = $robot->scrape($url);
$result = $robot->crawl($node, 2);