示例#1
0
function crawl_page($url, $depth = 5)
{
    // Recursively crawl $url up to $depth levels, collecting same-host link
    // titles into $hrefs and persisting them via the project's saveUrls().
    static $seen = array();        // recursion guard: URLs already visited
    static $final_urls = array();  // kept for interface compatibility; unused here
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    // BUGFIX: $hrefs was never initialized, raising notices and passing null
    // to saveUrls() for pages without any usable links.
    $hrefs = array();

    $dom = new DOMDocument();
    $dt = get_url($url);  // project helper returning the page HTML — TODO confirm fetch semantics
    libxml_use_internal_errors(true);  // real-world HTML is rarely well-formed
    $dom->loadHTML($dt);
    libxml_use_internal_errors(false);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        // Skip in-page fragments and the bare site root.
        if (strpos($href, '#') !== FALSE || $href == "/") {
            continue;
        }
        if (0 !== strpos($href, 'http')) {
            // Non-http scheme (mailto:, ftp:, javascript:) — skip entirely.
            if (strpos($href, '://') !== FALSE) {
                continue;
            }
            // Relative link: rebuild an absolute URL from the current page's parts.
            $path = '/' . ltrim($href, '/');
            $parts = parse_url($url);
            $href = $parts['scheme'] . '://';
            if (isset($parts['user']) && isset($parts['pass'])) {
                $href .= $parts['user'] . ':' . $parts['pass'] . '@';
            }
            $href .= $parts['host'];
            if (isset($parts['port'])) {
                $href .= ':' . $parts['port'];
            }
            $href .= $path;
        }
        // Stay on the same host as the page being crawled.
        $original_domain = parse_url($url);
        $extracted_domain = parse_url($href);
        if (strtolower($original_domain['host']) != strtolower($extracted_domain['host'])) {
            continue;
        }
        // Use the last non-empty, non-image child node as the link title.
        // BUGFIX: $title was undefined for the first text-free anchor and
        // carried a stale value from the previous anchor afterwards.
        $title = '';
        foreach ($element->childNodes as $node) {
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $title = strip_tags(trim($node->nodeValue . ''));
            }
        }
        $hrefs[$url][$href] = $title;
        crawl_page($href, $depth - 1);
    }
    saveUrls($hrefs);
}
示例#2
0
function crawl_page($url, $depth = 1)
{
    // Crawl $url, echoing progress, matching anchor text against job titles
    // from the CSV named in $_POST['csv_file'], and recursing $depth levels.
    // Returns the accumulated global $hrefs map when the depth budget hits 0.
    $csv_file = $_POST['csv_file'];  // NOTE(review): raw POST value; validate upstream before using as a file name
    static $seen = array();
    global $hrefs;
    $output = array();
    ob_flush();
    flush();
    if ($depth === 0) {
        return $hrefs;
    }
    // BUGFIX: $seen was written below but never checked, so cyclic links
    // recursed until the depth budget ran out on every branch.
    if (isset($seen[$url])) {
        return;
    }
    echo 'Scanned URL : ' . $url . '<br />';
    $seen[$url] = true;

    // BUGFIX: the CSV handle was reopened per anchor and fclose()d only when
    // a match occurred — leaking handles and reading from a closed handle on
    // the next node. Load the title column once per call instead.
    $job_titles = array();
    $job_title_file = fopen('job_titles.csv', 'r');
    if ($job_title_file !== false) {
        while (($row = fgetcsv($job_title_file)) !== false) {
            $job_titles[] = $row[0];
        }
        fclose($job_title_file);
    }

    $dom = new DOMDocument();
    @$dom->loadHTML(get_url($url));  // get_url(): project helper returning page HTML
    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        foreach ($element->childNodes as $node) {
            // Only non-empty, non-image text nodes count as anchor text.
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $o_job_title = '';
                foreach ($job_titles as $job_title) {
                    // preg_quote guards against regex metacharacters in CSV data.
                    if (preg_match('/\b' . preg_quote($job_title, '/') . '\b/i', $node->nodeValue)) {
                        $o_job_title = $node->nodeValue . '';
                        break;
                    }
                }
                if (!empty($o_job_title)) {
                    $output[] = array($o_job_title, $href);
                }
                $hrefs[$url][] = $href;
                crawl_page($href, $depth - 1);
            }
        }
    }
}
示例#3
0
function pdfToString()
{
    // Locate the current ISO week's "_FMI_" menu PDF among the crawled links
    // and return its extracted text (null when no matching link exists).
    $links = crawl_page("http://www.betriebsrestaurant-gmbh.de/index.php?id=91");
    $pdfLink = "";
    // Hoisted out of the loop: date("W") is loop-invariant. It is the
    // zero-padded ISO week number, compared to the two chars at offset 16
    // of the file name — TODO confirm that offset against real link format.
    $weekNumber = date("W");
    foreach ($links as $file) {
        if (strpos(strtolower($file), '.pdf') !== FALSE && strpos($file, '_FMI_') !== FALSE) {
            if ($weekNumber === substr($file, 16, 2)) {
                // current link is the MI pdf
                $pdfLink = "http://www.betriebsrestaurant-gmbh.de/" . $file;
            }
        }
    }
    // BUGFIX: without this guard, parseFile("") is attempted when no link
    // matched the current week.
    if (empty($pdfLink)) {
        return;
    }
    // Parse pdf file and build necessary objects.
    $parser = new \Smalot\PdfParser\Parser();
    $pdf = $parser->parseFile($pdfLink);
    $text = $pdf->getText();
    return $text;
}
示例#4
0
function crawl_page($url, $depth = 5)
{
    // Recursively follow every <a href> found on $url, down to $depth levels,
    // printing per-page counter diagnostics on the way out.
    $anchorCount = 0;    // anchors processed on this page
    $relativeCount = 0;  // relative hrefs rewritten to absolute
    $authCount = 0;      // rewritten URLs carrying user:pass credentials
    $portCount = 0;      // rewritten URLs carrying an explicit port (never printed)
    static $seen = array();  // recursion guard across all calls
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);  // errors suppressed: real-world HTML is messy

    foreach ($dom->getElementsByTagName('a') as $anchor) {
        $href = $anchor->getAttribute('href');
        if (strpos($href, 'http') !== 0) {
            // Relative link: make it absolute against the current page.
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                // pecl_http can rebuild the URL in a single call.
                $href = http_build_url($url, array('path' => $path));
            } else {
                // Manual reassembly from the page URL's components.
                $parts = parse_url($url);
                $rebuilt = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $rebuilt .= $parts['user'] . ':' . $parts['pass'] . '@';
                    $authCount++;
                }
                $rebuilt .= $parts['host'];
                if (isset($parts['port'])) {
                    $rebuilt .= ':' . $parts['port'];
                    $portCount++;
                }
                $href = $rebuilt . $path;
            }
            $relativeCount++;
        }
        crawl_page($href, $depth - 1);
        $anchorCount++;
    }
    printf("B:%d C: %d D: %d URL:%s\n", $anchorCount, $relativeCount, $authCount, $url);
}
示例#5
0
function crawl_page($url, $depth = 5)
{
    // Fetch $url via cURL, extract anchors with a regex, resolve relative
    // hrefs against the page URL, and recurse up to $depth levels.
    // BUGFIX: $seen was a fresh local array on every call, so the visited
    // check could never fire; it must be static and actually populated.
    static $seen = array();
    if ($depth == 0 or in_array($url, $seen)) {
        return;
    }
    $seen[] = $url;  // BUGFIX: the visit was never recorded

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10');
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    if ($result) {
        // Keep only anchor tags before matching, to simplify the regex input.
        $stripped_file = strip_tags($result, "<a>");
        preg_match_all("/<a[\\s]+[^>]*?href[\\s]?=[\\s\"\\']+" . "(.*?)[\"\\']+.*?>" . "([^<]+|.*?)?<\\/a>/", $stripped_file, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $href = $match[1];
            if (0 !== strpos($href, 'http')) {
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    // BUGFIX: the base must be the page URL, not the relative href.
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    // BUGFIX: a relative href has no scheme/host — parse the page
                    // URL instead (matches the sibling implementations in this file).
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
    }
    // BUGFIX: $href was undefined here when the fetch failed or no anchors
    // matched; report the page that was actually crawled.
    echo "Crawled {$url}\n";
}
示例#6
0
<?php

// Front controller: on POST action=scrape, crawl the submitted URL and render
// the output view; otherwise show the home page.
session_start();
error_reporting(0);  // NOTE(review): hides all errors, including fatals during the crawl — prefer logging
ini_set('max_execution_time', 0);  // crawls can run long; disable the time limit
$site_url = 'https://' . $_SERVER['HTTP_HOST'] . '/~demoserver/grouplocator/';
// BUGFIX: guard the 'action' key to avoid an undefined-index notice on plain GET hits.
if (isset($_POST['action']) && $_POST['action'] == 'scrape') {
    $url = isset($_POST['url']) ? trim($_POST['url']) : '';
    // Cast: POST values are strings, and crawl_page() compares depth strictly.
    $depth = isset($_POST['level_deep']) ? (int) $_POST['level_deep'] : 1;
    //$match_percent = $_POST['match_percent'];
    $time_start = microtime(true);
    $output = crawl_page($url, $depth);
    // BUGFIX: the timing lines originally sat after `exit;` and never ran.
    $time_end = microtime(true);
    $time = $time_end - $time_start;
    //echo $time . ' secs';
    require_once 'html/output.php';
    exit;
} else {
    require_once 'html/home.php';
}
function crawl_page($url, $depth = 1)
{
    //echo '<PRE>';
    //if (ob_get_level() == 0) ob_start();
    require_once 'vendor/autoload.php';
    // Initiate crawl
    $crawler = new \Arachnid\Crawler($url, $depth);
    $crawler->traverse();
    // Get link data
    $links = $crawler->getLinks();
    //print_r($links);
示例#7
0
<?php

// Front controller: on POST action=scrape, crawl the submitted URL at depth 1
// and render the output view; otherwise show the home page.
session_start();
error_reporting(0);  // NOTE(review): suppresses every error; prefer logging in production
ini_set('max_execution_time', 0);  // allow long-running crawls
$site_url = 'https://' . $_SERVER['HTTP_HOST'] . '/~demoserver/grouplocator/';
// BUGFIX: guard the 'action' key to avoid an undefined-index notice on plain GET hits.
if (isset($_POST['action']) && $_POST['action'] == 'scrape') {
    $url = isset($_POST['url']) ? trim($_POST['url']) : '';
    // crawl_page() reads the CSV base name from $_POST.
    // NOTE(review): the crawler may append ".csv" again — confirm the expected value.
    $_POST['csv_file'] = "job_titles.csv";
    //$match_percent = $_POST['match_percent'];
    $time_start = microtime(true);
    $output = crawl_page($url, 1);
    // BUGFIX: the timing lines originally sat after `exit;` and never ran.
    $time_end = microtime(true);
    $time = $time_end - $time_start;
    //echo $time . ' secs';
    require_once 'html/output.php';
    exit;
} else {
    require_once 'html/home.php';
}
function crawl_page($url, $depth = 1)
{
    $csv_file = $_POST['csv_file'];
    //if (ob_get_level() == 0) ob_start();
    static $seen = array();
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    //echo 'Scanned URL : ' . $url. '<br /><br /><br />';
    $seen[$url] = true;
    $output = array();
示例#8
0
function crawl_page($url, $depth = 5)
{
    // Crawls a page for links: fetch via cURL, regex out the anchors,
    // absolutise relative hrefs, and recurse up to $depth levels.
    // BUGFIX: $seen was re-created locally on every call, so the in_array()
    // check could never see previously visited URLs; make it static and
    // record each visit.
    static $seen = array();
    if ($depth == 0 or in_array($url, $seen)) {
        return;
    }
    $seen[] = $url;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    $result = curl_exec($ch);
    curl_close($ch);

    if ($result) {
        // Keep only anchor tags before matching, to simplify the regex input.
        $stripped_file = strip_tags($result, "<a>");
        preg_match_all("/<a[\\s]+[^>]*?href[\\s]?=[\\s\"\\']+" . "(.*?)[\"\\']+.*?>" . "([^<]+|.*?)?<\\/a>/", $stripped_file, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $href = $match[1];
            if (0 !== strpos($href, 'http')) {
                // Relative link: rebuild an absolute URL from the page's parts.
                $path = '/' . ltrim($href, '/');
                if (extension_loaded('http')) {
                    $href = http_build_url($url, array('path' => $path));
                } else {
                    $parts = parse_url($url);
                    $href = $parts['scheme'] . '://';
                    if (isset($parts['user']) && isset($parts['pass'])) {
                        $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                    }
                    $href .= $parts['host'];
                    if (isset($parts['port'])) {
                        $href .= ':' . $parts['port'];
                    }
                    $href .= $path;
                }
            }
            crawl_page($href, $depth - 1);
        }
    }
    // BUGFIX: $href is undefined when the fetch fails or no anchors matched;
    // report the page URL that was actually crawled.
    echo "Crawled {$url}" . " <br>";
}
            $price[$i] = $matches[0][0];
            // price list of books
            $dis = substr($dis, 0, strpos($dis, $matches[0][0]));
            $auth = substr($dis, 3, strlen($dis) - 16);
            // auther of book
            $author[$i] = $auth;
            // Author List
        }
    }
    return array($nameList, $author, $pubDate, $isbnNumbers, $price, $links);
}
if (isset($_GET['query'])) {
    // Normalise whitespace in the search query, then make it URL-friendly.
    $query = $_GET['query'];
    $query = preg_replace('/\\s+/', ' ', $query);
    $query = str_replace(" ", "+", $query);
    $books = crawl_page($query);
    $col = sizeof($books);
    // number of field arrays returned (names, authors, dates, ISBNs, prices, links)
    // NOTE(review): original comment said "rows" — swapped with the one below
    $row = sizeof($books[0]);
    // number of entries per field, i.e. one per book found
} else {
    $books = 0;
}
?>

<!DOCTYPE html>
<html>
<head>
    <title>Search Results (Amazon)</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet">
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
function crawl_page($url, $depth = 5)
{
    // Collect every usable link from $url into $hrefs (with debug echoes),
    // dump the map, then recurse into the ($depth - 1)-th collected link.
    echo $depth . '<br />';
    static $seen = array();  // recursion guard across all calls
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;

    // BUGFIX: $hrefs was never initialized; print_r() and the recursion index
    // below raised notices on pages without links.
    $hrefs = array();

    $dom = new DOMDocument();
    $dt = get_url($url);  // project helper returning the page HTML
    @$dom->loadHTML($dt);

    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        // Skip in-page fragments and the bare root link.
        if (strpos($href, '#') !== FALSE || $href == "/") {
            continue;
        }
        if (0 !== strpos($href, 'http')) {
            echo $href . '<br />SSS';
            // Relative link: rebuild an absolute URL from the page's parts.
            $path = '/' . ltrim($href, '/');
            $parts = parse_url($url);
            $href = $parts['scheme'] . '://';
            if (isset($parts['user']) && isset($parts['pass'])) {
                $href .= $parts['user'] . ':' . $parts['pass'] . '@';
            }
            $href .= $parts['host'];
            if (isset($parts['port'])) {
                $href .= ':' . $parts['port'];
            }
            $href .= $path;
        } else {
            echo 'includehttp : ' . $href . ' <br />';
        }
        $hrefs[$url][] = $href;
    }

    echo '<PRE>';
    print_r($hrefs);
    // BUGFIX: the original recursed into $hrefs[$url][$depth - 1] even when
    // that index (or the whole page entry) did not exist.
    if (isset($hrefs[$url][$depth - 1])) {
        crawl_page($hrefs[$url][$depth - 1], $depth - 1);
    }
}
示例#11
0
</p> </div>
      <?php 
} elseif ($_GET['pun'] == 'yes') {
    ?>
        <div class="well pop-up"><p>Pun post successful!</p> </div>
      <?php 
} elseif ($_GET['pun'] == 'no') {
    ?>
        <div class="well pop-up"><p>Pun post un-successful</p> </div>
      <?php 
}
?>
      <div class="pun-of-the-day">
        <h2>Pun of the day</h2>
        <p><?php 
echo crawl_page("http://www.punoftheday.com/");
?>
</p>
      </div>
      <hr class="hr-fade">
      <div class="topic challenge">
        <h2><a href="admin.php?action=topic">Topic Challenge #<?php 
echo $databaseQueries->getChallenge("topic_challenge")['topic_id'];
?>
<br><?php 
echo $databaseQueries->getChallenge("topic_challenge")['topic'];
?>
</a></h2>
        <?php 
$databaseQueries->getCurrentPuns(3, 'topic');
?>
示例#12
0
<?php

// Depth-limited crawl fragment: expects $url and $depth to be defined by the
// including script — TODO confirm the caller's contract.
if ($depth > 0) {
    $html = file_get_contents($url);
    preg_match_all('~<a.*?href="(.*?)".*?>~', $html, $matches);
    // BUGFIX: $newurl was undefined after the loop when the page had no links,
    // raising a notice in the file_put_contents() call below.
    $newurl = '';
    foreach ($matches[1] as $newurl) {
        crawl_page($newurl, $depth - 1);
    }
    // Record the last discovered URL plus the fetched page.
    file_put_contents('results.txt', $newurl . "\n\n" . $html . "\n\n", FILE_APPEND);
}
示例#13
0
function crawl_page($url, $depth = 1)
{
    // Crawl $url to $depth levels, matching anchor text against the job
    // titles in "<csv_file>.csv" (base name taken from $_POST['csv_file']).
    // Returns array($seen, $output): visited URLs and (title, href) matches.
    $csv_file = $_POST['csv_file'];
    static $seen = array();  // recursion guard across all calls
    if (isset($seen[$url]) || $depth === 0) {
        return;
    }
    $seen[$url] = true;
    $output = array();

    // BUGFIX: the CSV was reopened for every anchor and fclose()d only when a
    // match occurred — leaking handles and later reading from a closed handle.
    // Load the title column once per call instead.
    // NOTE(review): callers may already pass a name ending in ".csv", which
    // makes this "<name>.csv.csv" — confirm the expected file name.
    $job_titles = array();
    $job_title_file = fopen("{$csv_file}" . ".csv", 'r');
    if ($job_title_file !== false) {
        while (($row = fgetcsv($job_title_file)) !== false) {
            $job_titles[] = $row[0];
        }
        fclose($job_title_file);
    }

    $dom = new DOMDocument('1.0');
    @$dom->loadHTMLFile($url);
    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        // Use only the first non-empty, non-image child node as anchor text.
        foreach ($element->childNodes as $node) {
            if (!empty($node->nodeValue) && trim($node->nodeValue) != '' && $node->nodeName != 'img') {
                $o_job_title = '';
                foreach ($job_titles as $job_title) {
                    // preg_quote prevents CSV values from being read as regex syntax.
                    if (preg_match('/\b' . preg_quote($job_title, '/') . '\b/i', $node->nodeValue)) {
                        $o_job_title = $node->nodeValue . '';
                        break;
                    }
                }
                if (!empty($o_job_title)) {
                    $output[] = array($o_job_title, $href);
                }
                break;
            }
        }
        // Rebuild relative links into absolute URLs before recursing.
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
        }
        crawl_page($href, $depth - 1);
    }
    return array($seen, $output);
}
示例#14
-1
function pdfToString()
{
    // Return the text of the current ISO week's "_FMI_" menu PDF, serving an
    // APC-cached copy when one exists for this week.
    $weekNumber = date('W');
    $cacheKey = 'hungertext' . $weekNumber;

    // Cache hit: return the stored text immediately.
    $cached = apc_fetch($cacheKey);
    if ($cached !== false) {
        return $cached;
    }

    // Cache miss: scan all crawled links for this week's PDF.
    $pdfLink = '';
    foreach (crawl_page(URL_PAGE_WITH_LINKS) as $file) {
        $isPdf = strpos(strtolower($file), '.pdf') !== FALSE;
        $isMenu = strpos($file, '_FMI_') !== FALSE;
        if ($isPdf && $isMenu && $weekNumber === substr($file, 16, 2)) {
            $pdfLink = URL_MAIN . $file;
        }
    }

    // No matching link for this week: bail out without parsing.
    if (empty($pdfLink)) {
        return;
    }

    // Parse the PDF and extract its text content.
    $parser = new \Smalot\PdfParser\Parser();
    $text = $parser->parseFile($pdfLink)->getText();

    // Keep it cached for two days, then hand it back.
    apc_store($cacheKey, $text, 2 * 24 * 3600);
    return $text;
}