/**
 * Perform one cron 'tick' of crawl processing.
 *
 * Has limits of both how many urls to crawl and a soft time limit on
 * total crawl time.
 *
 * @param bool $verbose if true the crawler echoes progress as it works
 */
function local_linkchecker_robot_crawl($verbose = false) {
    global $CFG, $DB;

    $robot = new \local_linkchecker_robot\robot\crawler();
    $config = $robot::get_config();

    $crawlstart = $config->crawlstart;
    $crawlend = $config->crawlend;

    // Check if valid, otherwise bail quickly.
    // If we need to start a new crawl, push the seed url into the crawl queue.
    if (!$crawlstart || $crawlstart <= $crawlend) {
        $start = time();
        set_config('crawlstart', $start, 'local_linkchecker_robot');
        $robot->mark_for_crawl($CFG->wwwroot . '/', $config->seedurl);

        // Create a new history record for this crawl.
        $history = new stdClass();
        $history->startcrawl = $start;
        $history->urls = 0;
        $history->links = 0;
        $history->broken = 0;
        $history->oversize = 0;
        $history->cronticks = 0;
        $history->id = $DB->insert_record('linkchecker_history', $history);
    } else {
        $history = $DB->get_record('linkchecker_history', array('startcrawl' => $crawlstart));
        if (!$history) {
            // The history record for the in-progress crawl is missing (eg the
            // table was purged mid-crawl). get_record() returns false in that
            // case; recreate the record rather than fataling below when its
            // properties are written and cronticks is incremented.
            $history = new stdClass();
            $history->startcrawl = $crawlstart;
            $history->urls = 0;
            $history->links = 0;
            $history->broken = 0;
            $history->oversize = 0;
            $history->cronticks = 0;
            $history->id = $DB->insert_record('linkchecker_history', $history);
        }
    }

    // While we are not exceeding the maxcron time, and the queue is not empty
    // find the next url in the queue and crawl it.
    // If the queue is empty then mark the crawl as ended.
    $cronstart = time();
    $cronstop = $cronstart + $config->maxcrontime;

    $hasmore = true;
    $hastime = true;
    while ($hasmore && $hastime) {
        $hasmore = $robot->process_queue($verbose);
        $hastime = time() < $cronstop;
        // Record a heartbeat so the admin UI can show the crawl is alive.
        set_config('crawltick', time(), 'local_linkchecker_robot');
    }

    if ($hastime) {
        // Time left over, which means the queue is empty!
        // Mark the crawl as ended.
        $history->endcrawl = time();
        set_config('crawlend', time(), 'local_linkchecker_robot');
    }

    // Snapshot the crawler's aggregate counters into the history record.
    $history->urls = $robot->get_processed();
    $history->links = $robot->get_num_links();
    $history->broken = $robot->get_num_broken_urls();
    $history->oversize = $robot->get_num_oversize();
    $history->cronticks++;
    $DB->update_record('linkchecker_history', $history);
}
 */

// CLI diagnostic script: fetch one url exactly as the robot would see it,
// print the raw response body and the parsed node metadata. The url is NOT
// queued or processed - compare with crawl-as.php which does process it.
define('CLI_SCRIPT', true);
require dirname(dirname(dirname(dirname(__FILE__)))) . '/config.php';
require_once $CFG->libdir . '/clilib.php';
require_once $CFG->dirroot . '/local/linkchecker_robot/lib.php';

// Parse CLI options: -h/--help and -u/--url.
list($options, $unrecognized) = cli_get_params(
    array('help' => false, 'url' => null),
    array('h' => 'help', 'u' => 'url')
);

if ($unrecognized) {
    $unrecognized = implode("\n ", $unrecognized);
    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
}

$help = "Scrape the url as the robot would see it, but do not process/queue it.\n\nOptions:\n-h, --help Print out this help\n-u, --url Url to scrape\n\nExample:\n\$sudo -u www-data php scrape-as.php --url=http://ford.com/\n";

if ($options['help']) {
    echo $help;
    die;
}

// Bail out early if the robot user / configuration is not usable.
$robot = new \local_linkchecker_robot\robot\crawler();
$error = $robot->is_bot_valid();
if ($error) {
    print "Error: {$error}";
    exit;
}

// A url is mandatory; without it just show the usage text.
if (!$options['url']) {
    echo $help;
    die;
}

$url = $options['url'];
$node = $robot->scrape($url);

// Print the fetched body separately, then dump the remaining metadata
// (http code, timings, etc.) so the contents do not swamp the var_dump.
$dump = $node->contents;
unset($node->contents);
print $dump;
var_dump($node);
// (Continues the course-specific branch of an "if" opened above this chunk.)
// Course-level report: require update capability on that course and set the
// page up manually; the admin branch below uses admin_externalpage_setup().
    require_capability('moodle/course:update', $coursecontext);
    $PAGE->set_context($coursecontext);
    $PAGE->set_url($navurl);
    $PAGE->set_pagelayout('admin');
    $PAGE->set_title(get_string($report, 'local_linkchecker_robot'));
    // NOTE(review): $courseid is concatenated directly into SQL. Presumably it
    // was cleaned upstream as PARAM_INT - confirm; a bound placeholder would
    // be safer regardless.
    $sqlfilter = ' AND c.id = ' . $courseid;
} else {
    // If no course then this is an admin only report.
    require_capability('moodle/site:config', context_system::instance());
    admin_externalpage_setup('local_linkchecker_robot_' . $report);
}

echo $OUTPUT->header();
require 'tabs.php';
echo $tabs;

// A retry id means: reset that url so the crawler will fetch it again.
if ($retryid) {
    $robot = new \local_linkchecker_robot\robot\crawler();
    $robot->reset_for_recrawl($retryid);
}

if ($report == 'broken') {
    // Shared FROM/WHERE fragment used by both the paged data query and the
    // total count query; $sqlfilter optionally narrows to one course.
    $sql = " FROM {linkchecker_url} b\n LEFT JOIN {linkchecker_edge} l ON l.b = b.id\n LEFT JOIN {linkchecker_url} a ON l.a = a.id\n LEFT JOIN {course} c ON c.id = a.courseid\n WHERE b.httpcode != ? {$sqlfilter}";
    $opts = array('200');
    // b = the broken (target) url, l = the link edge, a = the page linking to it.
    $data = $DB->get_records_sql("SELECT concat(b.id, '-', l.id, '-', a.id) AS id,\n b.url target,\n b.httpcode,\n b.httpmsg,\n b.lastcrawled,\n b.id AS toid,\n l.id linkid,\n l.text,\n a.url,\n a.title,\n a.redirect,\n a.courseid,\n c.shortname {$sql}\n ORDER BY httpcode DESC,\n c.shortname ASC", $opts, $start, $perpage);
    $count = $DB->get_field_sql("SELECT count(*) AS count" . $sql, $opts);
    // Length of the site root, presumably used below to shorten displayed
    // urls to site-relative form - confirm against the loop body.
    $mdlw = strlen($CFG->wwwroot);

    $table = new html_table();
    $table->head = array('', get_string('lastcrawledtime', 'local_linkchecker_robot'), get_string('response', 'local_linkchecker_robot'), get_string('broken', 'local_linkchecker_robot'), get_string('frompage', 'local_linkchecker_robot'));
    // The course column is only useful on the site-wide (admin) report.
    if (!$courseid) {
        array_push($table->head, get_string('course', 'local_linkchecker_robot'));
    }
    $table->data = array();
    foreach ($data as $row) {
// along with Moodle. If not, see <http://www.gnu.org/licenses/>.

/**
 * A link checker robot
 *
 * Admin status page: shows crawl progress and health statistics, and can
 * auto-create the robot user via ?action=makebot.
 *
 * @package    local_linkchecker_robot
 * @copyright  2016 Brendan Heywood <*****@*****.**>
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
require_once dirname(dirname(dirname(__FILE__))) . '/config.php';
require_once $CFG->libdir . '/adminlib.php';

// Admin-only page.
require_login();
require_capability('moodle/site:config', context_system::instance());
admin_externalpage_setup('local_linkchecker_robot_status');
echo $OUTPUT->header();

$action = optional_param('action', '', PARAM_ALPHANUMEXT);

$robot = new \local_linkchecker_robot\robot\crawler();
$config = $robot::get_config();

// Optionally create the bot user account on demand.
if ($action == 'makebot') {
    $botuser = $robot->auto_create_bot();
}

// Gather the crawl state and statistics, presumably rendered further down
// this page (past this chunk) - confirm against the rest of the file.
$crawlstart = $config->crawlstart;
$crawlend = $config->crawlend;
$crawltick = $config->crawltick;
$boterror = $robot->is_bot_valid();
$queuesize = $robot->get_queue_size();
$recent = $robot->get_processed();
$numlinks = $robot->get_num_links();
$oldqueuesize = $robot->get_old_queue_size();
$numurlsbroken = $robot->get_num_broken_urls();
$numpageswithurlsbroken = $robot->get_pages_withbroken_links();
$oversize = $robot->get_num_oversize();
 * @copyright Catalyst IT
 * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */

// CLI script: crawl one url as the robot AND process/queue it, unlike
// scrape-as.php which only fetches. Useful when a page has been corrected
// and you want its state reflected immediately.
define('CLI_SCRIPT', true);
require dirname(dirname(dirname(dirname(__FILE__)))) . '/config.php';
require_once $CFG->libdir . '/clilib.php';
require_once $CFG->dirroot . '/local/linkchecker_robot/lib.php';

// Parse CLI options: -h/--help and -u/--url.
list($options, $unrecognized) = cli_get_params(
    array('help' => false, 'url' => null),
    array('h' => 'help', 'u' => 'url')
);

if ($unrecognized) {
    $unrecognized = implode("\n ", $unrecognized);
    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
}

$help = "Crawl a url as the robot and parse it.\n\nUseful for when a page has been corrected and you want to instantly reflect this.\n\nOptions:\n-h, --help Print out this help\n-u, --url Url to crawl and process\n\nExample:\n\$sudo -u www-data php crawl-as.php --url=http://ford.com/\n";

if ($options['help']) {
    echo $help;
    die;
}

// Bail out early if the robot user / configuration is not usable.
$robot = new \local_linkchecker_robot\robot\crawler();
$error = $robot->is_bot_valid();
if ($error) {
    print "Error: {$error}";
    exit;
}

// A url is mandatory; without it just show the usage text.
if (!$options['url']) {
    echo $help;
    die;
}

$url = $options['url'];

// Register the url in the queue, then crawl it.
$node = $robot->mark_for_crawl($url, $url);
// NOTE(review): $res is never used and crawl() below takes only $node -
// this scrape() call looks redundant unless crawl() relies on a side effect
// of the earlier fetch; confirm before removing.
$res = $robot->scrape($url);
$result = $robot->crawl($node, 2);