Code example #1
/**
 * Perform one cron 'tick' of crawl processing
 *
 * Has limits of both how many urls to crawl
 * and a soft time limit on total crawl time.
 */
function local_linkchecker_robot_crawl($verbose = false) {
    global $CFG, $DB;
    $robot = new \local_linkchecker_robot\robot\crawler();
    $config = $robot::get_config();
    $crawlstart = $config->crawlstart;
    $crawlend = $config->crawlend;
    // Check if valid, otherwise bail quickly.
    // If we need to start a new crawl, push the seed url into the crawl queue.
    if (!$crawlstart || $crawlstart <= $crawlend) {
        $start = time();
        set_config('crawlstart', $start, 'local_linkchecker_robot');
        $robot->mark_for_crawl($CFG->wwwroot . '/', $config->seedurl);
        // Create a new history record.
        $history = new stdClass();
        $history->startcrawl = $start;
        $history->urls = 0;
        $history->links = 0;
        $history->broken = 0;
        $history->oversize = 0;
        $history->cronticks = 0;
        $history->id = $DB->insert_record('linkchecker_history', $history);
    } else {
        $history = $DB->get_record('linkchecker_history', array('startcrawl' => $crawlstart));
    }
    // While we are not exceeding the maxcron time, and the queue is not empty
    // find the next url in the queue and crawl it.
    // If the queue is empty then mark the crawl as ended.
    $cronstart = time();
    $cronstop = $cronstart + $config->maxcrontime;
    $hasmore = true;
    $hastime = true;
    while ($hasmore && $hastime) {
        $hasmore = $robot->process_queue($verbose);
        $hastime = time() < $cronstop;
        set_config('crawltick', time(), 'local_linkchecker_robot');
    }
    if ($hastime) {
        // Time left over, which means the queue is empty!
        // Mark the crawl as ended.
        $history->endcrawl = time();
        set_config('crawlend', time(), 'local_linkchecker_robot');
    }
    $history->urls = $robot->get_processed();
    $history->links = $robot->get_num_links();
    $history->broken = $robot->get_num_broken_urls();
    $history->oversize = $robot->get_num_oversize();
    $history->cronticks++;
    $DB->update_record('linkchecker_history', $history);
}
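
The function above performs one cron 'tick' of crawl processing. As a hedged illustration only, the sketch below shows one way such a tick could be wired into a Moodle scheduled task; the class name crawl_task and the 'crawltask' language string are assumptions for illustration, not the plugin's actual task definition.

namespace local_linkchecker_robot\task;

defined('MOODLE_INTERNAL') || die();

// Illustrative sketch only: wires the cron tick above into a scheduled task.
class crawl_task extends \core\task\scheduled_task {

    public function get_name() {
        // Hypothetical language string identifier.
        return get_string('crawltask', 'local_linkchecker_robot');
    }

    public function execute() {
        global $CFG;
        require_once($CFG->dirroot . '/local/linkchecker_robot/lib.php');
        // One bounded tick: the soft time limit (maxcrontime) and the queue
        // handling are enforced inside local_linkchecker_robot_crawl() above.
        local_linkchecker_robot_crawl(false);
    }
}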
Code example #2
define('CLI_SCRIPT', true);
require dirname(dirname(dirname(dirname(__FILE__)))) . '/config.php';
require_once $CFG->libdir . '/clilib.php';
require_once $CFG->dirroot . '/local/linkchecker_robot/lib.php';
list($options, $unrecognized) = cli_get_params(array('help' => false, 'url' => null), array('h' => 'help', 'u' => 'url'));
if ($unrecognized) {
    $unrecognized = implode("\n  ", $unrecognized);
    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
}
$help = "Scrape the url as the robot would see it, but do not process/queue it.\n\nOptions:\n-h, --help      Print out this help\n-u, --url       Url to scrape\n\nExample:\n\$sudo -u www-data php scrape-as.php --url=http://ford.com/\n";
if ($options['help']) {
    echo $help;
    die;
}
$robot = new \local_linkchecker_robot\robot\crawler();
$error = $robot->is_bot_valid();
if ($error) {
    print "Error: {$error}";
    exit;
}
if (!$options['url']) {
    echo $help;
    die;
}
$url = $options['url'];
$node = $robot->scrape($url);
$dump = $node->contents;
unset($node->contents);
print $dump;
var_dump($node);
Code example #3
    require_capability('moodle/course:update', $coursecontext);
    $PAGE->set_context($coursecontext);
    $PAGE->set_url($navurl);
    $PAGE->set_pagelayout('admin');
    $PAGE->set_title(get_string($report, 'local_linkchecker_robot'));
    $sqlfilter = ' AND c.id = ' . $courseid;
} else {
    // If no course then this is an admin only report.
    require_capability('moodle/site:config', context_system::instance());
    admin_externalpage_setup('local_linkchecker_robot_' . $report);
}
echo $OUTPUT->header();
require 'tabs.php';
echo $tabs;
if ($retryid) {
    $robot = new \local_linkchecker_robot\robot\crawler();
    $robot->reset_for_recrawl($retryid);
}
if ($report == 'broken') {
    $sql = " FROM {linkchecker_url}  b\n       LEFT JOIN {linkchecker_edge} l ON l.b = b.id\n       LEFT JOIN {linkchecker_url}  a ON l.a = a.id\n       LEFT JOIN {course} c ON c.id = a.courseid\n           WHERE b.httpcode != ? {$sqlfilter}";
    $opts = array('200');
    $data = $DB->get_records_sql("SELECT concat(b.id, '-', l.id, '-', a.id) AS id,
                                          b.url target,
                                          b.httpcode,
                                          b.httpmsg,
                                          b.lastcrawled,
                                          b.id AS toid,
                                          l.id linkid,
                                          l.text,
                                          a.url,
                                          a.title,
                                          a.redirect,
                                          a.courseid,
                                          c.shortname {$sql}
                                 ORDER BY httpcode DESC,
                                          c.shortname ASC", $opts, $start, $perpage);
    $count = $DB->get_field_sql("SELECT count(*) AS count" . $sql, $opts);
    $mdlw = strlen($CFG->wwwroot);
    $table = new html_table();
    $table->head = array('',
        get_string('lastcrawledtime', 'local_linkchecker_robot'),
        get_string('response', 'local_linkchecker_robot'),
        get_string('broken', 'local_linkchecker_robot'),
        get_string('frompage', 'local_linkchecker_robot'));
    if (!$courseid) {
        array_push($table->head, get_string('course', 'local_linkchecker_robot'));
    }
    $table->data = array();
    foreach ($data as $row) {
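        // The listing is truncated at this point. What follows is a hedged sketch,
        // not the plugin's actual loop body: it fills the table in the same column
        // order as $table->head above; the 'retry' string id is an assumption.
        $cells = array(
            html_writer::link(new moodle_url($PAGE->url, array('retryid' => $row->toid)),
                get_string('retry', 'local_linkchecker_robot')),
            userdate($row->lastcrawled),
            $row->httpcode . ' ' . s($row->httpmsg),
            html_writer::link($row->target, s($row->text)),
            html_writer::link($row->url, s($row->title)),
        );
        if (!$courseid) {
            // Extra cell matching the 'course' heading added above.
            $cells[] = s($row->shortname);
        }
        $table->data[] = $cells;
    }
    echo html_writer::table($table);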
Code example #4
/**
 * A link checker robot
 *
 * @package    local_linkchecker_robot
 * @copyright  2016 Brendan Heywood <*****@*****.**>
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
 */
require_once dirname(dirname(dirname(__FILE__))) . '/config.php';
require_once $CFG->libdir . '/adminlib.php';
require_login();
require_capability('moodle/site:config', context_system::instance());
admin_externalpage_setup('local_linkchecker_robot_status');
echo $OUTPUT->header();
$action = optional_param('action', '', PARAM_ALPHANUMEXT);
$robot = new \local_linkchecker_robot\robot\crawler();
$config = $robot::get_config();
if ($action == 'makebot') {
    $botuser = $robot->auto_create_bot();
}
$crawlstart = $config->crawlstart;
$crawlend = $config->crawlend;
$crawltick = $config->crawltick;
$boterror = $robot->is_bot_valid();
$queuesize = $robot->get_queue_size();
$recent = $robot->get_processed();
$numlinks = $robot->get_num_links();
$oldqueuesize = $robot->get_old_queue_size();
$numurlsbroken = $robot->get_num_broken_urls();
$numpageswithurlsbroken = $robot->get_pages_withbroken_links();
$oversize = $robot->get_num_oversize();
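// The listing is truncated here. A hedged sketch, not the plugin's actual markup,
// of how the collected status values could be rendered on this admin page; the
// language string identifiers below are assumptions for illustration.
$table = new html_table();
$table->data = array(
    array(get_string('crawlstart', 'local_linkchecker_robot'), $crawlstart ? userdate($crawlstart) : '-'),
    array(get_string('crawlend', 'local_linkchecker_robot'), $crawlend ? userdate($crawlend) : '-'),
    array(get_string('queuesize', 'local_linkchecker_robot'), $queuesize),
    array(get_string('numlinks', 'local_linkchecker_robot'), $numlinks),
    array(get_string('brokenurls', 'local_linkchecker_robot'), $numurlsbroken),
    array(get_string('oversize', 'local_linkchecker_robot'), $oversize),
);
echo html_writer::table($table);
echo $OUTPUT->footer();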
Code example #5
define('CLI_SCRIPT', true);
require dirname(dirname(dirname(dirname(__FILE__)))) . '/config.php';
require_once $CFG->libdir . '/clilib.php';
require_once $CFG->dirroot . '/local/linkchecker_robot/lib.php';
list($options, $unrecognized) = cli_get_params(array('help' => false, 'url' => null), array('h' => 'help', 'u' => 'url'));
if ($unrecognized) {
    $unrecognized = implode("\n  ", $unrecognized);
    cli_error(get_string('cliunknowoption', 'admin', $unrecognized));
}
$help = "Crawl a url as the robot and parse it.\n\nUseful for when a page has been corrected and you want to instantly reflect this.\n\nOptions:\n-h, --help      Print out this help\n-u, --url       Url to crawl and process\n\nExample:\n\$sudo -u www-data php crawl-as.php --url=http://ford.com/\n";
if ($options['help']) {
    echo $help;
    die;
}
$robot = new \local_linkchecker_robot\robot\crawler();
$error = $robot->is_bot_valid();
if ($error) {
    print "Error: {$error}";
    exit;
}
if (!$options['url']) {
    echo $help;
    die;
}
$url = $options['url'];
$node = $robot->mark_for_crawl($url, $url);
$res = $robot->scrape($url);
$result = $robot->crawl($node, 2);
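// Hedged addition, not shown in the source listing above: dump the scrape and crawl
// results so the CLI run gives visible feedback, mirroring scrape-as.php.
var_dump($res);
var_dump($result);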