-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.php
36 lines (28 loc) · 869 Bytes
/
crawler.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
<?php
// Entry point: start crawling from the URL submitted in the "url" POST field.
// The field is validated first: a missing value used to raise an "undefined index"
// notice and pass null to the crawler, and a non-string value (e.g. url[]=x)
// is not something the crawler can fetch.
$url = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';
if ($url !== '') {
    recurrsive($url);
} else {
    echo 'Missing or invalid "url" POST parameter.';
}
/**
 * Crawl a page: print its URL followed by its <title>, then follow its links.
 *
 * For every page fetched the function echoes the (HTML-escaped) URL, the
 * (HTML-escaped) title text and a "<br/>", and then calls itself for each
 * <a href> found in the document.
 *
 * Safeguards:
 *  - Only absolute http(s) URLs are fetched. Relative links, "mailto:",
 *    "javascript:", fragments and plain paths are skipped, because handing them
 *    to DOMDocument::loadHTMLFile() would open them as local files or other
 *    stream wrappers. (Relative links are therefore not followed.)
 *  - Each URL is fetched at most once per request, so pages that link to each
 *    other cannot cause endless recursion.
 *  - Recursion stops $maxDepth link levels below the start page.
 *
 * NOTE(review): the start URL is user supplied, so this can still be pointed at
 * internal hosts (SSRF). Restrict the allowed hosts if the script is exposed.
 *
 * @param string $http_url URL of the page to crawl.
 * @param int    $maxDepth Number of link levels below the start page to follow.
 * @param int    $depth    Current recursion level; external callers leave it at 0.
 * @return void
 */
function recurrsive($http_url, $maxDepth = 2, $depth = 0)
{
    // URLs already fetched during this request (fragment stripped), used as a set.
    static $visited = array();

    if (!is_string($http_url) || $depth > $maxDepth) {
        return;
    }

    // Same normalisation as before: force plain http, then decode HTML entities.
    $url = html_entity_decode(str_replace("https:", "http:", $http_url));

    // Refuse anything that is not an absolute http(s) URL (see docblock).
    if (!preg_match('#^https?://#i', $url)) {
        return;
    }

    // "page#a" and "page#b" are the same document; de-duplicate on the part before '#'.
    $key = preg_replace('/#.*/s', '', $url);
    if (isset($visited[$key])) {
        return;
    }
    $visited[$key] = true;

    $page = new DOMDocument();
    // '@' keeps malformed-markup and network warnings out of the output;
    // failure is detected through the return value instead of being ignored.
    $loaded = @$page->loadHTMLFile($url);

    $title = '';
    if ($loaded) {
        $title = (string) getTextBetweenTags($page->saveHTML(), 'title');
    }

    // Both values come from untrusted sources, so escape them before printing.
    // double_encode is off for the title because saveHTML() already emits entities.
    echo htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false), "<br/>";

    if (!$loaded || $depth >= $maxDepth) {
        return;
    }

    // Collect the hrefs first so the recursion does not iterate a live node list.
    $hrefs = array();
    foreach ($page->getElementsByTagName('a') as $link) {
        $hrefs[] = $link->getAttribute('href');
    }

    foreach ($hrefs as $href) {
        recurrsive($href, $maxDepth, $depth + 1);
    }
}
/**
 * Return the text between the first <$tagname ...> and its closing </$tagname>.
 *
 * The match is case-insensitive, spans newlines and tolerates attributes on the
 * opening tag. Nested tags of the same name are not balanced: the content ends
 * at the first closing tag.
 *
 * @param string $string  Markup to search.
 * @param string $tagname Tag name without angle brackets, e.g. "title".
 * @return string|null The inner text of the first match, or null when the tag
 *                     is not present (or the regex engine reports an error).
 */
function getTextBetweenTags($string, $tagname)
{
    // Quote the tag name so regex metacharacters in it cannot alter the pattern.
    $tag = preg_quote((string) $tagname, '/');

    // "s" lets "." match newlines (replacing the backtracking-heavy "(.|\n)*?"),
    // "i" accepts <TITLE> as well as <title>.
    $pattern = "/<{$tag}(?:\\s[^>]*)?>(.*?)<\\/{$tag}\\s*>/is";

    // Only read the capture group when a match was actually found.
    if (preg_match($pattern, (string) $string, $matches) !== 1) {
        return null;
    }

    return $matches[1];
}
?>