crawl.php
#!/usr/bin/php -q
<?php
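/**
 * crawl.php - a command-line tool for crawling a website, built on MiCrawler.
 *
 * Run without arguments to print the usage text below.
 */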
include dirname(__FILE__) . '/mi_crawler.php';
function crawlHelp() {
?>
crawl - a testing tool for crawling a website
Usage: crawl [OPTION] ... Uri
If you're doing any kind of server-side browser sniffing, this tool helps you
find out what the user sees. It also measures (individual page) response times
to help focus attention where it's needed, and stores each page for
further/later analysis.
arguments:
 -cache                  Use cached results if they exist? Defaults to true
 -depth                  How deep to follow the links. Defaults to 2, set to 0
                         to disable
 -exclude                A partial regex of URLs to exclude. Defaults to
                         "(/css/|/js/)"
 -limit                  Maximum number of pages to request. Defaults to no limit
 -restricttodomain       Don't leave the domain? Defaults to true
 -restricttodomainstrict Don't consider subdomains part of the domain?
                         Defaults to false
 -no-parent              Don't go above the start URL? Defaults to true
 -loglevel               How verbose/informative to be. Defaults to 2
 -continue               Continue from where you left off? Defaults to false
                         If true and an index from a previous crawl is found,
                         no crawling will be performed.
                         Set this to true if you've already crawled a site
                         and only want to process the results.
 -useragent              The user agent string to use. Defaults to
                         "MiCrawler Version X.X"
                         Can either be a full string, or one of the existing
                         presets: android, chrome, googlebot, firefox, ie6,
                         ie7, ie8, lynx, opera
 -processor              The name of the processor class to run results through.
                         Defaults to the (normalized) name of the domain.
                         If the class doesn't exist it only crawls.
 -wait                   How many seconds to wait between requests. Defaults to 0
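
Example:
 crawl -depth 3 -useragent googlebot -wait 1 example.com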
<?php
}
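
/**
 * Split CLI arguments into the script name and an options array.
 * "-key value" pairs become $return['key'] = 'value', a bare "-key"
 * becomes $return['key'] = true, and anything else is appended numerically.
 *
 * e.g. parseParams(array('crawl.php', '-depth', '3', 'example.com'))
 *      returns array('crawl.php', array('depth' => '3', 0 => 'example.com'))
 */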
function parseParams($params = array()) {
	$function = array_shift($params);
	$return = array();
	$count = count($params);
	for ($i = 0; $i < $count; $i++) {
		if (isset($params[$i])) {
			if ($params[$i][0] === '-') {
				$key = substr($params[$i], 1);
				$return[$key] = true;
				unset($params[$i]);
				if (isset($params[++$i])) {
					if ($params[$i][0] !== '-') {
						$return[$key] = str_replace('"', '', $params[$i]);
						unset($params[$i]);
					}
				}
			} else {
				$return[] = $params[$i];
			}
		}
	}
	return array($function, $return);
}
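
/**
 * Crawl $uri and report how many pages were found. If $processorFile exists
 * next to this script, the results are instead passed through
 * {Processor}::process($results, $uri) and each returned bucket is reported.
 */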
function crawl($uri, $params = array(), $processor = '', $processorFile = '') {
	$params = array_merge(array('continue' => false), $params);
	if (strpos($uri, '://') === false) {
		$uri = 'http://' . $uri;
	}
	$results = MiCrawler::crawl($uri, $params);
	if ($results && $processorFile && is_file(dirname(__FILE__) . '/' . $processorFile)) {
		require_once dirname(__FILE__) . '/' . $processorFile;
		$processor = ucwords($processor) . 'Processor';
		$results = call_user_func_array(array($processor, 'process'), array($results, $uri));
		echo "\n";
		foreach ((array)$results as $key => $values) {
			echo $key . ' ' . count($values) . " found\n";
		}
		return;
	}
	echo count($results) . " found\n";
}
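
// Main entry point: parse argv and dispatch only when invoked as crawl.php.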
list($function, $params) = parseParams($argv);
if (basename($function) === 'crawl.php') {
	if (empty($params[0]) && empty($params['uri'])) {
		return crawlHelp();
	}
	$processor = $processorFile = null;
	$uri = empty($params[0]) ? $params['uri'] : $params[0];
	if (strpos($uri, '://') === false) {
		$uri = 'http://' . $uri;
	}
	extract($params);
	if (!$processor) {
		// Derive the processor name from the host, e.g. www.example.com => example
		$parts = parse_url($uri);
		$processor = strtolower(str_replace(array('www.', '.com'), '', $parts['host']));
		$processorFile = $processor . '_processor.php';
		if (!file_exists(dirname(__FILE__) . '/' . $processorFile)) {
			$processor = 'Generic';
			$processorFile = 'generic_processor.php';
		}
	} elseif (!$processorFile) {
		$processorFile = $processor . '_processor.php';
	}
	crawl($uri, $params, $processor, $processorFile);
}
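
/*
 * A processor is any class exposing a static process($results, $uri) method
 * (see crawl() above). A minimal sketch - the class name ExampleProcessor and
 * the shape of $results here are illustrative assumptions, not part of the
 * MiCrawler API:
 *
 *     // example_processor.php
 *     class ExampleProcessor {
 *         public static function process($results, $uri) {
 *             // crawl() prints "<key> <count> found" for each bucket returned
 *             return array('pages' => (array)$results);
 *         }
 *     }
 */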