/**
 * Recursively crawl $url up to $depth levels, collecting same-host links
 * keyed by page, with each link's anchor title, then persist the map via
 * the project helper saveUrls(). HTML is fetched through get_url().
 *
 * @param string $url   Absolute URL to crawl.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    // Shared across recursive calls so a URL is never fetched twice.
    static $seen = array();

    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument();
    $dt = get_url($url);

    // Suppress markup warnings while parsing, then restore the previous
    // libxml error mode instead of unconditionally forcing it to false.
    $prevErrorMode = libxml_use_internal_errors(true);
    $dom->loadHTML($dt);
    libxml_use_internal_errors($prevErrorMode);

    $hrefs = array(); // BUGFIX: was never initialised (undefined-variable notice)

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');

        // Skip fragment links and the bare site root.
        if (strpos($href, '#') !== FALSE || $href == "/") {
            continue;
        }

        if (0 !== strpos($href, 'http')) {
            // Non-http schemes (mailto:, ftp:, ...) are ignored entirely.
            if (strpos($href, '://') !== FALSE) {
                continue;
            }
            // Resolve the relative href against the current page URL.
            $path = '/' . ltrim($href, '/');
            $parts = parse_url($url);
            $href = $parts['scheme'] . '://';
            if (isset($parts['user']) && isset($parts['pass'])) {
                $href .= $parts['user'] . ':' . $parts['pass'] . '@';
            }
            $href .= $parts['host'];
            if (isset($parts['port'])) {
                $href .= ':' . $parts['port'];
            }
            $href .= $path;
        }

        // Stay on the original host only (case-insensitive compare).
        $original_domain = parse_url($url);
        $extracted_domain = parse_url($href);
        if (strtolower($original_domain['host']) != strtolower($extracted_domain['host'])) {
            continue;
        }

        // BUGFIX: reset per anchor; previously a title from an earlier
        // anchor leaked into anchors whose child nodes were all empty.
        $title = '';
        foreach ($element->childNodes as $node) {
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $title = strip_tags(trim($node->nodeValue . ''));
            }
        }

        $hrefs[$url][$href] = $title;
        crawl_page($href, $depth - 1);
    }

    saveUrls($hrefs);
}
/**
 * Crawl $url, matching each anchor's text nodes against the job titles in
 * job_titles.csv, echoing every scanned URL as live progress output.
 *
 * Accumulates all discovered links into the global $hrefs map and returns
 * it once recursion bottoms out ($depth === 0). HTML is fetched through
 * the project helper get_url().
 *
 * @param string $url   Page to scan.
 * @param int    $depth Remaining recursion depth.
 * @return array|void   $hrefs at the recursion floor; nothing otherwise.
 */
function crawl_page($url, $depth = 1)
{
    static $seen = array();
    global $hrefs;

    $output = array();

    // Push progress output to the browser immediately.
    ob_flush();
    flush();

    if ($depth === 0) {
        return $hrefs;
    }

    // BUGFIX: $seen was written but never consulted, so link cycles caused
    // the same page to be scanned repeatedly.
    if (isset($seen[$url])) {
        return;
    }

    echo 'Scanned URL : ' . $url . '<br />';
    $seen[$url] = true;

    $dom = new DOMDocument();
    @$dom->loadHTML(get_url($url)); // @: malformed HTML warnings are expected

    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = $element->getAttribute('href');

        foreach ($element->childNodes as $node) {
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $o_job_title = '';
                // BUGFIX: the handle was opened once per anchor and only
                // closed when a title matched — it leaked on non-matches,
                // and later nodes read from an exhausted or closed stream.
                // Open fresh per node and always close it.
                $job_title_file = fopen('job_titles.csv', 'r');
                while ($row = fgetcsv($job_title_file)) {
                    // Whole-word, case-insensitive match against the CSV title.
                    if (preg_match("/\\b" . $row[0] . "\\b/i", $node->nodeValue)) {
                        $o_job_title = $node->nodeValue . '';
                        break;
                    }
                }
                fclose($job_title_file);

                if (!empty($o_job_title)) {
                    $output[] = array($o_job_title, $href);
                }
                $hrefs[$url][] = $href;
                crawl_page($href, $depth - 1);
            }
        }
    }
}
/**
 * Locate this week's "_FMI_" menu PDF on the canteen site and return its
 * extracted text.
 *
 * Depends on crawl_page() for link discovery and on the Smalot PDF parser.
 *
 * @return string|null Extracted PDF text, or null when no matching link exists.
 */
function pdfToString()
{
    $links = crawl_page("http://www.betriebsrestaurant-gmbh.de/index.php?id=91");

    $pdfLink = "";
    $weekNumber = date("W"); // hoisted: loop-invariant
    foreach ($links as $file) {
        if (strpos(strtolower($file), '.pdf') !== FALSE && strpos($file, '_FMI_') !== FALSE) {
            // The weekly menu filename embeds the ISO week number at offset 16
            // — NOTE(review): fragile, confirm against real filenames.
            if ($weekNumber === substr($file, 16, 2)) {
                // current link is MI pdf
                $pdfLink = "http://www.betriebsrestaurant-gmbh.de/" . $file;
            }
        }
    }

    // BUGFIX: previously an empty $pdfLink was passed straight to the
    // parser, which fails on a missing file path.
    if ($pdfLink === "") {
        return;
    }

    // Parse pdf file and build necessary objects.
    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($pdfLink);
    $text = $pdf->getText();
    return $text;
}
/**
 * Debug-instrumented recursive crawler: follows every <a> on $url up to
 * $depth levels, echoing per-page counters when a page is done.
 *
 * Counters: B = anchors followed, C = relative hrefs rewritten to
 * absolute, D = rewritten hrefs that carried user:pass credentials.
 *
 * @param string $url   Absolute URL to crawl.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    // NOTE(review): the original also kept an $a counter (ports appended)
    // that was never reported anywhere; dropped as dead code.
    $b = 0;
    $c = 0;
    $d = 0;

    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url); // @: malformed HTML warnings are expected

    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = $element->getAttribute('href');

        // Rewrite relative links to absolute using the page URL as base.
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                // pecl_http can rebuild the URL directly.
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    $d++;
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
            $c++;
        }

        crawl_page($href, $depth - 1);
        $b++;
    }

    echo "B:{$b} C: {$c} D: {$d} URL:" . $url . "\n";
}
/**
 * cURL-based recursive crawler: fetch $url with a browser user agent,
 * regex out every anchor, resolve relative hrefs against the page URL,
 * and recurse up to $depth levels.
 *
 * @param string $url   Page to fetch.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    // NOTE(review): $seen is re-created per call and never written to, so
    // the in_array() guard can never fire; recursion is bounded by $depth only.
    $seen = array();
    if ($depth == 0 or in_array($url, $seen)) {
        return;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    if ($result) {
        // Keep only <a> tags before matching hrefs.
        $stripped_file = strip_tags($result, "<a>");
        preg_match_all("/<a[\\s]+[^>]*?href[\\s]?=[\\s\"\\']+" . "(.*?)[\"\\']+.*?>" . "([^<]+|.*?)?<\\/a>/", $stripped_file, $matches, PREG_SET_ORDER);

        foreach ($matches as $match) {
            $href = $match[1];
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    // BUGFIX: the base must be the page URL, not the
                    // relative href itself.
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    // BUGFIX: parse_url() was called on the relative href,
                    // which has no scheme/host/port to read back; parse the
                    // page URL instead so the rebuilt link is valid.
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
    }

    // BUGFIX: guarded — $href is undefined when the fetch failed or the
    // page yielded no anchors.
    if (isset($href)) {
        echo "Crawled {$href}\n";
    }
}
<?php
// Front controller for the scraper: bootstraps the session, silences
// error output and lifts the execution time limit before dispatching on
// $_POST['action'].
session_start();
error_reporting(0); // NOTE(review): suppresses ALL error reporting
ini_set('max_execution_time', 0);
$site_url = 'https://' . $_SERVER['HTTP_HOST'] . '/~demoserver/grouplocator/';
if ($_POST['action'] == 'scrape') {
    // Scrape request: crawl the posted URL to the posted depth and render
    // the results template.
    $url = trim($_POST['url']);
    $depth = $_POST['level_deep'];
    //$match_percent = $_POST['match_percent'];
    $time_start = microtime(true);
    $output = crawl_page($url, $depth);
    require_once 'html/output.php';
    exit;
    // NOTE(review): everything below is unreachable — exit above fires first.
    $time_end = microtime(true);
    $time = $time_end - $time_start;
    //echo $time . ' secs';
} else {
    require_once 'html/home.php';
}
// Crawl $url up to $depth levels using the Arachnid crawler library.
// NOTE(review): this definition is truncated in this snippet — the rest of
// the body (and closing brace) lies outside the visible source.
function crawl_page($url, $depth = 1)
{
    //echo '<PRE>';
    //if (ob_get_level() == 0) ob_start();
    require_once 'vendor/autoload.php';
    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, $depth);
    $crawler->traverse();
    // Get link data
    $links = $crawler->getLinks();
    //print_r($links);
<?php
// Front controller (CSV variant): bootstraps the session, silences error
// output and lifts the execution time limit before dispatching on
// $_POST['action'].
session_start();
error_reporting(0); // NOTE(review): suppresses ALL error reporting
ini_set('max_execution_time', 0);
$site_url = 'https://' . $_SERVER['HTTP_HOST'] . '/~demoserver/grouplocator/';
if ($_POST['action'] == 'scrape') {
    $url = trim($_POST['url']);
    // Hard-code the CSV source that crawl_page() reads back out of $_POST.
    $_POST['csv_file'] = "job_titles.csv";
    //$match_percent = $_POST['match_percent'];
    $time_start = microtime(true);
    $output = crawl_page($url, 1);
    require_once 'html/output.php';
    exit;
    // NOTE(review): everything below is unreachable — exit above fires first.
    $time_end = microtime(true);
    $time = $time_end - $time_start;
    //echo $time . ' secs';
} else {
    require_once 'html/home.php';
}
// Crawl $url, matching anchor text against the configured job-title CSV.
// NOTE(review): this definition is truncated in this snippet — the rest of
// the body (and closing brace) lies outside the visible source.
function crawl_page($url, $depth = 1)
{
    $csv_file = $_POST['csv_file'];
    //if (ob_get_level() == 0) ob_start();
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    //echo 'Scanned URL : ' . $url. '<br /><br /><br />';
    $seen[$url] = true;
    $output = array();
/**
 * cURL-based recursive page crawler: fetch $url, regex out every anchor,
 * resolve relative hrefs against the page URL and recurse up to $depth.
 *
 * @param string $url   Page to fetch.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    // crawls a page for links
    // BUGFIX: $seen was a fresh local array on every call and was never
    // populated, so the in_array() guard could never fire; made static and
    // appended to so already-visited URLs are actually skipped.
    static $seen = array();
    if ($depth == 0 or in_array($url, $seen)) {
        return;
    }
    $seen[] = $url;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    if ($result) {
        // Keep only <a> tags before matching hrefs.
        $stripped_file = strip_tags($result, "<a>");
        preg_match_all("/<a[\\s]+[^>]*?href[\\s]?=[\\s\"\\']+" . "(.*?)[\"\\']+.*?>" . "([^<]+|.*?)?<\\/a>/", $stripped_file, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $href = $match[1];
            // Resolve relative links against the page URL.
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
    }

    // BUGFIX: guarded — $href is undefined when the fetch failed or the
    // page yielded no anchors.
    if (isset($href)) {
        echo "Crawled {$href}" . " <br>";
    }
}
// NOTE(review): this fragment starts mid-function; the enclosing
// definition (an Amazon search-result scraper) begins outside this snippet.
$price[$i] = $matches[0][0]; // price list of books
$dis = substr($dis, 0, strpos($dis, $matches[0][0]));
$auth = substr($dis, 3, strlen($dis) - 16); // auther of book
$author[$i] = $auth; // Author List
}
}
// Parallel arrays, one entry per book found.
return array($nameList, $author, $pubDate, $isbnNumbers, $price, $links);
}
if (isset($_GET['query'])) {
    // Normalise the query: collapse runs of whitespace, then encode
    // spaces as '+' for the search URL.
    $query = $_GET['query'];
    $query = preg_replace('/\\s+/', ' ', $query);
    $query = str_replace(" ", "+", $query);
    $books = crawl_page($query);
    $col = sizeof($books); // number of rows
    $row = sizeof($books[0]); // number of columns
} else {
    $books = 0;
}
?>
<!DOCTYPE html>
<html>
<head>
    <title>Search Results (Amazon)</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet">
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
/**
 * Debug crawler: collects every non-fragment href on $url (normalising
 * relative links to absolute), dumps the collected map, then recurses into
 * the single link at index $depth-1 only.
 *
 * HTML is fetched through the project helper get_url().
 *
 * @param string $url   Page to crawl.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return void
 */
function crawl_page($url, $depth = 5)
{
    echo $depth . '<br />';

    // Shared across recursive calls so a URL is never fetched twice.
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument();
    $dt = get_url($url);
    @$dom->loadHTML($dt); // @: malformed HTML warnings are expected

    $hrefs = array(); // BUGFIX: was never initialised (undefined-variable notice)

    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = $element->getAttribute('href');

        // Skip fragment links and the bare site root.
        if (strpos($href, '#') !== FALSE || $href == "/") {
            continue;
        }

        if (0 !== strpos($href, 'http')) {
            echo $href . '<br />SSS';
            // Resolve the relative href against the current page URL.
            $path = '/' . ltrim($href, '/');
            $parts = parse_url($url);
            $href = $parts['scheme'] . '://';
            if (isset($parts['user']) && isset($parts['pass'])) {
                $href .= $parts['user'] . ':' . $parts['pass'] . '@';
            }
            $href .= $parts['host'];
            if (isset($parts['port'])) {
                $href .= ':' . $parts['port'];
            }
            $href .= $path;
        } else {
            echo 'includehttp : ' . $href . ' <br />';
        }

        $hrefs[$url][] = $href;
    }

    echo '<PRE>';
    print_r($hrefs);

    // BUGFIX: guard the lookup — on pages with fewer than $depth links the
    // index is missing and crawl_page() was invoked with null.
    if (isset($hrefs[$url][$depth - 1])) {
        crawl_page($hrefs[$url][$depth - 1], $depth - 1);
    }
}
</p>
</div>
<?php } elseif ($_GET['pun'] == 'yes') { ?>
    <!-- Flash message: pun submission succeeded -->
    <div class="well pop-up"><p>Pun post successful!</p>
    </div>
<?php } elseif ($_GET['pun'] == 'no') { ?>
    <!-- Flash message: pun submission failed -->
    <div class="well pop-up"><p>Pun post un-successful</p>
    </div>
<?php } ?>
<!-- Daily pun fetched via crawl_page() from punoftheday.com -->
<div class="pun-of-the-day">
    <h2>Pun of the day</h2>
    <p><?php echo crawl_page("http://www.punoftheday.com/"); ?>
    </p>
</div>
<hr class="hr-fade">
<!-- Current topic challenge header plus its three latest puns -->
<div class="topic challenge">
    <h2><a href="admin.php?action=topic">Topic Challenge #<?php echo $databaseQueries->getChallenge("topic_challenge")['topic_id']; ?>
        <br><?php echo $databaseQueries->getChallenge("topic_challenge")['topic']; ?>
    </a></h2>
    <?php $databaseQueries->getCurrentPuns(3, 'topic'); ?>
<?php
// Body of a minimal recursive crawler: fetch $url, follow every extracted
// href up to $depth levels, and append the page dump to results.txt.
// NOTE(review): $url and $depth are defined outside this fragment.
if ($depth > 0) {
    $html = file_get_contents($url);
    // Naive tag match; only handles double-quoted href attributes.
    preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);
    foreach ($matches[1] as $newurl) {
        crawl_page($newurl, $depth - 1);
    }
    // NOTE(review): runs after the loop, so only the LAST extracted URL is
    // recorded alongside the page HTML — confirm this is intended.
    file_put_contents('results.txt', $newurl . "\n\n" . $html . "\n\n", FILE_APPEND);
}
/**
 * Crawl $url, matching each anchor's first non-empty text node against the
 * job titles in "<csv_file>.csv" (base name taken from $_POST['csv_file']),
 * then recurse into every link with relative hrefs resolved to absolute.
 *
 * @param string $url   Page to scan.
 * @param int    $depth Remaining recursion depth; stops at 0.
 * @return array|void   array($seen, $output) where $output holds
 *                      (title, href) pairs; void when the page was already
 *                      seen or depth ran out.
 */
function crawl_page($url, $depth = 1)
{
    $csv_file = $_POST['csv_file'];

    // Shared across recursive calls so a URL is never fetched twice.
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $output = array();

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url); // @: malformed HTML warnings are expected

    foreach ($dom->getElementsByTagName('a') as $element) {
        $href = $element->getAttribute('href');

        $job_title_file = fopen("{$csv_file}" . ".csv", 'r');

        // Only the first non-empty, non-<img> child text node is examined.
        foreach ($element->childNodes as $node) {
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $o_job_title = '';
                while ($row = fgetcsv($job_title_file)) {
                    // Whole-word, case-insensitive title match.
                    if (preg_match("/\\b" . $row[0] . "\\b/i", $node->nodeValue)) {
                        $o_job_title = $node->nodeValue . '';
                        break;
                    }
                }
                if (!empty($o_job_title)) {
                    $output[] = array($o_job_title, $href);
                }
                break;
            }
        }

        // BUGFIX: the handle was only closed when a title matched, leaking
        // one descriptor per anchor without a match.
        fclose($job_title_file);

        // Resolve relative links against the page URL before recursing.
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
        }
        crawl_page($href, $depth - 1);
    }

    return array($seen, $output);
}
/**
 * Return the text of this week's "_FMI_" menu PDF, caching the extracted
 * text in APC for two days keyed by ISO week number.
 *
 * Depends on crawl_page() for link discovery, the URL_PAGE_WITH_LINKS /
 * URL_MAIN constants, and the Smalot PDF parser.
 *
 * @return string|null PDF text, or null when no matching link was found.
 */
function pdfToString()
{
    $week = date('W');
    $cacheKey = 'hungertext' . $week;

    // Serve from cache when this week's text was already extracted.
    $cached = apc_fetch($cacheKey);
    if ($cached !== false) {
        return $cached;
    }

    // Otherwise scan the links page for the current week's FMI menu PDF.
    $pdfLink = '';
    foreach (crawl_page(URL_PAGE_WITH_LINKS) as $file) {
        $isPdf = strpos(strtolower($file), '.pdf') !== FALSE;
        $isFmi = strpos($file, '_FMI_') !== FALSE;
        if ($isPdf && $isFmi && $week === substr($file, 16, 2)) {
            $pdfLink = URL_MAIN . $file;
        }
    }

    // Bail out when the page offered no matching PDF.
    if (empty($pdfLink)) {
        return;
    }

    // Extract the text and cache it for two days.
    $parser = new \Smalot\PdfParser\Parser();
    $text = $parser->parseFile($pdfLink)->getText();
    apc_store($cacheKey, $text, 2 * 24 * 3600);

    return $text;
}