/
parser.php
83 lines (70 loc) · 2.59 KB
/
parser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
<?php
/**
 * Extracts a simplified view of an HTML article: its headline, candidate
 * images and paragraph text, each pre-rendered as a small HTML fragment.
 *
 * All text and attribute values taken from the parsed page are HTML-escaped
 * before being stored, so the fragments can be echoed directly.
 */
class HTMLParser
{
    /** @var string URL the page was requested from. */
    public $htmlurl = "";

    /** @var string "<h1>…</h1> <br>" fragment, or "" when the page has no <h1>. */
    public $pageHeader = "";

    /** @var string[] "<img …></br>" fragments for every accepted image, in document order. */
    public $imageHolder = array();

    /** @var string[] "…<br>" fragments, one per <p> element, in document order. */
    public $bodyHolder = array();

    /**
     * @param string      $url  Location of the page; passed to file_get_contents() when $html is null.
     * @param string|null $html Optional already-fetched markup. When given, no request is made
     *                          and $url is only recorded.
     */
    public function __construct($url, $html = null)
    {
        $this->htmlurl = $url; // kept so callers can retrieve it via getHtmlUrl()

        if ($html === null) {
            $html = file_get_contents($url);
        }
        // A failed fetch (false) or an empty document leaves every holder empty.
        if (!is_string($html) || $html === '') {
            return;
        }

        // Real-world pages are rarely valid; collect libxml warnings internally
        // instead of emitting them, then restore the caller's setting.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        // The XML declaration prefix makes libxml decode the markup as UTF-8.
        $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $this->extractHeader($doc);
        $this->extractImages($doc);
        $this->extractBody($doc);
    }

    /** Stores the headline fragment; when several <h1> exist the last one wins. */
    private function extractHeader(DOMDocument $doc)
    {
        foreach ($doc->getElementsByTagName('h1') as $heading) {
            $this->pageHeader = "<h1>" . $this->escape($heading->nodeValue) . "</h1> <br>";
        }
    }

    /** Collects <img> elements whose src holds an http(s) URL with an accepted extension. */
    private function extractImages(DOMDocument $doc)
    {
        // PNGs are skipped on purpose because they are usually logos.
        $imgFormats = array("jpg", "jpeg", "gif", "tiff");

        foreach ($doc->getElementsByTagName('img') as $image) {
            // Keep everything from the first "http" onwards; skip sources without one.
            if (preg_match('/http.*/', $image->getAttribute('src'), $matches) !== 1) {
                continue;
            }
            $urlPath = $matches[0];

            // Read the extension from the path component only, so a query
            // string ("a.jpg?w=100") or a bare host name cannot distort it.
            $path = parse_url($urlPath, PHP_URL_PATH);
            if (!is_string($path)) {
                continue;
            }
            $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

            if (in_array($extension, $imgFormats, true)) {
                $this->imageHolder[] = "<img src=\"" . $this->escape($urlPath) . "\">" . "</br>";
            }
        }
    }

    /** Stores the text of every <p> element followed by a line break. */
    private function extractBody(DOMDocument $doc)
    {
        foreach ($doc->getElementsByTagName('p') as $paragraph) {
            $this->bodyHolder[] = $this->escape($paragraph->nodeValue) . "<br>";
        }
    }

    /** Escapes a value for safe use in HTML text or a double-quoted attribute. */
    private function escape($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }

    /** @return string Headline fragment, or "" when none was found. */
    public function getHeader()
    {
        return $this->pageHeader;
    }

    /** @return string URL given to the constructor. */
    public function getHtmlUrl()
    {
        return $this->htmlurl;
    }

    /** @return string[] Image fragments in document order. */
    public function getImageHolder()
    {
        return $this->imageHolder;
    }

    /** @return string[] Paragraph fragments in document order. */
    public function getBodyHolder()
    {
        return $this->bodyHolder;
    }
}
// Entry point: render a simplified view of the article whose URL was POSTed as "cnnurl".
$requestedUrl = (isset($_POST["cnnurl"]) && is_string($_POST["cnnurl"])) ? trim($_POST["cnnurl"]) : '';

// The URL is user input and ends up in file_get_contents(); only allow http(s)
// so local paths and other stream wrappers (file://, php://, ...) are rejected.
// NOTE(review): this does not stop requests to internal hosts; add a host
// allow-list if this script is reachable from untrusted networks.
$requestedScheme = strtolower((string) parse_url($requestedUrl, PHP_URL_SCHEME));

if ($requestedUrl === '' || !in_array($requestedScheme, array('http', 'https'), true)) {
    http_response_code(400);
    echo "Please provide a valid http(s) article URL.";
} else {
    $htmlParser = new HTMLParser($requestedUrl);
    echo $htmlParser->getHeader();

    // Only the first image is relevant in most articles, so the rest are
    // ignored; a page without any accepted image prints nothing here.
    $imagePlaceHolder = $htmlParser->getImageHolder();
    if (isset($imagePlaceHolder[0])) {
        echo $imagePlaceHolder[0];
    }

    $bodyHolder = $htmlParser->getBodyHolder();
    foreach ($bodyHolder as $content) {
        echo $content;
    }
}
?>