Esempio n. 1
1
// Assigning the URL we want to scrape to the variable $url
$pages = 1;
$page = 1;
// While $continue is TRUE, i.e. there are more search results pages
while ($continue == TRUE) {
    echo "page " . $page++ . "   ";
    $results_page = curl($url);
    // Downloading the results page using our curl() function
    $results_page = scrape_between($results_page, "<div class=\"col-sm-9\">", "</main>");
    // Scraping out only the middle section of the results page that contains our results
    $separate_results = explode("<a class=\"news-release\" title=\"", $results_page);
    // Exploding the results into separate parts into an array
    // For each separate result, scrape the URL
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $results_urls = scrape_between($separate_result, "href=\"", ".html\">") . ".html";
            $results_urls = (string) $results_urls;
        }
        $servername = "localhost";
        $username = "******";
        $password = "";
        $dbname = "prnewswire";
        try {
            $conn = new PDO("mysql:host={$servername};dbname={$dbname}", $username, $password);
            // set the PDO error mode to exception
            $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
            $sql = "INSERT INTO urls (url)\n                VALUES ('{$results_urls}')";
            // use exec() because no results are returned
            $conn->exec($sql);
            echo "New record created successfully" . "\n";
        } catch (PDOException $e) {
Esempio n. 2
0
function geturl($query)
{
    // Build a Goodreads search URL for $query and return the href of the
    // first book-result anchor, or "booknotfound" when nothing usable matched.
    $query = str_replace(' ', '+', $query);
    // str_replace does exactly what the old preg_replace('/\ /', '+', ...) did,
    // without the regex machinery.
    $data = curl("http://www.goodreads.com/search?utf8=%E2%9C%93&query=" . $query);
    $data = scrape_between($data, "class=\"tableList", "/table");
    if ($data != "") {
        $doc = new DOMDocument();
        $doc->loadHTML($data);
        $div = $doc->getElementsByTagName("a");
        // Guard: item(1) is NULL when fewer than two anchors exist; the
        // original called ->getAttribute() on it unconditionally (fatal error).
        if ($div->item(1)) {
            return $div->item(1)->getAttribute("href");
        }
    }
    return "booknotfound";
}
function check_e_outage()
{
    // Scrape the UW outage page. Returns the markup of the status <div> when
    // the page reports it is "not operating normally", FALSE otherwise.
    $url = "https://www.washington.edu/cac/outages";
    $options = array(CURLOPT_RETURNTRANSFER => TRUE, CURLOPT_FOLLOWLOCATION => TRUE, CURLOPT_AUTOREFERER => TRUE, CURLOPT_CONNECTTIMEOUT => 120, CURLOPT_TIMEOUT => 120, CURLOPT_MAXREDIRS => 10, CURLOPT_USERAGENT => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8", CURLOPT_URL => $url);
    $ch = curl_init();
    curl_setopt_array($ch, $options);
    // Download the page body and release the handle.
    $data = curl_exec($ch);
    curl_close($ch);
    $status = scrape_between($data, "<div class=\"status\">", "</body>");
    // BUG FIX: strpos() returns 0 (falsy) when the needle sits at offset 0,
    // so the original silently dropped that case; compare with false instead.
    if (strpos($data, "not operating normally") !== false) {
        return $status;
    }
    return false;
}
Esempio n. 4
0
// Scrape targets: each entry names a Spanish traffic-sign Wikipedia page plus
// the XPath queries for the caption text and thumbnail image of every sign,
// and the numeric subclass the signs belong to. Entries with "is_table" use
// wikitable markup instead of gallery markup (see the loop below).
$sources = [["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 1], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][2]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][2]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 2], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][3]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][3]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 2], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][4]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][4]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 2], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][5]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][5]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 4], ["url" => 
"https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][6]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][6]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 6], ["url" => "https://es.wikipedia.org/wiki/Anexo:Señales_de_tráfico_de_peligro_de_España", "text_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='gallerytext']", "image_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src", "subclass" => 3], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_indicaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//table[@class='wikitable'][1]/tr[position()>1]/td[2]", "image_query" => "//table[@class='wikitable'][1]/tr/td[1]//a/img/@src", "subclass" => 5, "is_table" => true], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_indicaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//table[@class='wikitable'][2]/tr[position()>1]/td[2]", "image_query" => "//table[@class='wikitable'][2]/tr/td[1]//a/img/@src", "subclass" => 5, "is_table" => true], ["url" => "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_indicaci%C3%B3n_de_Espa%C3%B1a", "text_query" => "//table[@class='wikitable'][3]/tr[position()>1]/td[2]", "image_query" => "//table[@class='wikitable'][3]/tr/td[1]//a/img/@src", "subclass" => 5, "is_table" => true]];
foreach ($sources as $source) {
    // Parse the page; @ silences libxml warnings on real-world HTML.
    $document = new DOMDocument();
    @$document->loadHtml(file_get_contents($source['url']));
    $finder = new DOMXPath($document);
    $textNodes = $finder->query($source['text_query']);
    $imageNodes = $finder->query($source['image_query']);
    // The id/text markers differ between wikitable markup and gallery markup.
    if (isset($source['is_table']) && $source['is_table']) {
        $idMarkers = array('<br>', '</b>');
        $textMarkers = array('<b>', '<br>');
    } else {
        $idMarkers = array('<b>', '<br>');
        $textMarkers = array('<br>', '</center>');
    }
    $classes = array();
    for ($i = 0; $i < $imageNodes->length; $i++) {
        $inner = getInnerHTML($textNodes->item($i));
        $signId = str_replace(' ', '', trim(strip_tags(html_entity_decode(scrape_between($inner, $idMarkers[0], $idMarkers[1])))));
        $classes[$i] = array(
            'spain_id' => empty($signId) ? NULL : $signId,
            'text' => trim(strip_tags(html_entity_decode(scrape_between($inner, $textMarkers[0], $textMarkers[1])))),
            'image' => "https:" . $imageNodes->item($i)->nodeValue,
        );
    }
    saveClasses($classes, $source['subclass']);
}
function saveClasses($classes, $subclass)
{
    foreach ($classes as $class) {
<?php

// Defining the basic cURL function
function curl($url)
{
    // Fetch $url over HTTP and hand the response body back as a string.
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL => $url,            // target to download
        CURLOPT_RETURNTRANSFER => TRUE, // return the body instead of printing it
    ));
    $body = curl_exec($handle);
    curl_close($handle);
    return $body;
}
$scraped_website = curl("http://www.snapdeal.com/product/intex-10000-mah-itpb11-power/1403089232");
// BUG FIX: the original passed the undefined variable $scraped_page here;
// use $scraped_website, where the download was actually stored.
$scraped_data = scrape_between($scraped_website, "payBlkBig", "</span>");
// Cut the content between the "payBlkBig" price marker and the closing </span>
echo $scraped_data;
// Echoing $scraped_data — expected to show the product price
Esempio n. 6
0
 $results_date = date("h:i:sa");
 // Try a series of increasingly generic markers until an email/contact is found.
 // BUG FIX: the original's "$x == \"\" or \"0\" or NULL or FALSE or 0" chains
 // collapse to a plain $x == "" test ("0", NULL, FALSE and 0 are all falsy
 // operands of "or"); empty() expresses the intended "any falsy value" check.
 // The original's innermost else also overwrote a successful "e: " match with
 // the not-found message; the flattened chain below applies the fallback only
 // when every attempt failed.
 if (empty($results_email)) {
     $results_email = scrape_between($separate_result, "Email: ", " ");
 }
 if (empty($results_email)) {
     $results_email = scrape_between($separate_result, "email: ", " ");
 }
 if (empty($results_email)) {
     $results_email = scrape_between($separate_result, "e: ", " ");
 }
 if (empty($results_email)) {
     $results_email = scrape_between($separate_result, "Contact ", "</p>");
 }
 if (empty($results_email)) {
     $results_email = "no Contact info found";
 }
 $results_website = scrape_between($separate_result, "RELATED LINKS", "<!--startclickprintexclude-->");
 $servername = "localhost";
 $username = "******";
 $password = "";
 $dbname = "prnewswire";
 try {
     $conn = new PDO("mysql:host={$servername};dbname={$dbname}", $username, $password);
     // set the PDO error mode to exception
     $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
     // prepare sql and bind parameters
     $stmt = $conn->prepare("INSERT INTO article (title,url,person,email,website,date)\n                                           VALUES (:results_title,:results_url,:results_name,:results_email,:results_website,:results_date)");
     $stmt->bindParam(':results_title', $results_title);
     $stmt->bindParam(':results_url', $results_url);
     $stmt->bindParam(':results_name', $results_name);
     $stmt->bindParam(':results_email', $results_email);
     $stmt->bindParam(':results_website', $results_website);
Esempio n. 7
0
 }
 // Parse the WHOIS text in $scraped_page into individual fields.
 // NOTE(review): scrape_between is called with only TWO arguments throughout
 // this example; the 3-arg ($data, $start, $end) variants elsewhere in this
 // project would fatal here, so this snippet must rely on a different
 // scrape_between($data, $marker) helper — confirm before reusing.
 $state = scrape_between($scraped_page, "state:");
 $creation = scrape_between($scraped_page, "created:");
 if ($creation == '') {
     $creation = scrape_between($scraped_page, "Creation Date:");
 }
 $reg = scrape_between($scraped_page, "registrar:");
 $deactivation = scrape_between($scraped_page, "deactivationdate:");
 $delete = scrape_between($scraped_page, "date_to_delete:");
 $release = scrape_between($scraped_page, "date_to_release:");
 $modified = scrape_between($scraped_page, "modified:");
 $expires = scrape_between($scraped_page, "expires:");
 if ($expires == '') {
     $expires = scrape_between($scraped_page, "Registrar Registration Expiration Date:");
 }
 // Registry "not found" banner means the domain is available.
 $notfound = scrape_between($scraped_page, '"' . $d . '" not found.');
 if ($notfound != '') {
     $state = "free";
 }
 // Check if empty and, when present, normalise each date to Y-m-d
 $creation = $creation != "" ? date_format(date_create($creation), 'Y-m-d') : "";
 $deactivation = $deactivation != "" ? date_format(date_create($deactivation), 'Y-m-d') : "";
 $delete = $delete != "" ? date_format(date_create($delete), 'Y-m-d') : "";
 $release = $release != "" ? date_format(date_create($release), 'Y-m-d') : "";
 $modified = $modified != "" ? date_format(date_create($modified), 'Y-m-d') : "";
 $expires = $expires != "" ? date_format(date_create($expires), 'Y-m-d') : "";
 // NOTE(review): $release is formatted above but never added to $export —
 // looks like an oversight; confirm against the consumer of $export.
 array_push($export, array("domain" => $d, "ns1" => $ns1, "ns2" => $ns2, "rname" => $rname, "state" => $state, "creation" => $creation, "reg" => $reg, "deactivation" => $deactivation, "delete" => $delete, "modified" => $modified, "expires" => $expires));
 // Emit one HTML table row per domain (row is continued outside this view).
 echo '<tr>';
 echo '<td>' . $d . '</td>';
 echo '<td>' . $ns1 . '</td>';
 echo '<td>' . $ns2 . '</td>';
Esempio n. 8
0
function getOfferPrice($result)
{
    // Extract the offer price from product-page HTML in $result.
    preg_match_all('/"selling-price-id" itemprop="price">(.*)</i', $result, $offer);
    // Guard: the original indexed $offer[0][0] unconditionally, raising an
    // undefined-index notice (and passing NULL on) when nothing matched.
    if (empty($offer[0])) {
        return "";
    }
    $ret = scrape_between($offer[0][0], '"selling-price-id" itemprop="price">', '</span><');
    return $ret;
}
Esempio n. 9
0
    // Closing cURL
    return $data;
}
function scrape_between($data, $start, $end)
{
    // Return the substring of $data strictly between the first (case-
    // insensitive) occurrences of $start and $end, or "" when either marker
    // is missing. The explicit guards replace the original's reliance on
    // substr()/stripos() coercing a false haystack (deprecated in PHP 8).
    $from = stripos($data, $start);
    if ($from === false) {
        return "";
    }
    // Drop everything up to and including $start.
    $data = substr($data, $from + strlen($start));
    $stop = stripos($data, $end);
    if ($stop === false) {
        return "";
    }
    return substr($data, 0, $stop);
}
// Collect the image URL and title for xkcd comics 1-19, then render them.
// BUG FIX: the original's bare "$array;" statement was a no-op; initialise
// the accumulator explicitly before appending to it.
$array = array();
for ($i = 1; $i < 20; $i++) {
    $url = "http://xkcd.com/" . $i;
    // Download the comic page using our curl() function
    $results_page = curl($url);
    // Keep only the <div id="comic"> section that holds the image
    $results_page = scrape_between($results_page, "<div id=\"comic\">", "</div>");
    $array[] = array('url' => "http:" . scrape_between($results_page, "src=\"", "\" title="), 'title' => scrape_between($results_page, "title=\"", "\" alt="));
}
foreach ($array as $xkcd) {
    echo '<p>' . $xkcd['title'] . '</p>';
    echo '<img src="' . $xkcd['url'] . '">';
    echo '<br>';
}
Esempio n. 10
0
    curl_setopt($ch, CURLOPT_URL, $url);
    // Setting cURL's URL option with the $url variable passed into the function
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    // Setting cURL's option to return the webpage data
    $data = curl_exec($ch);
    // Executing the cURL request and assigning the returned data to the $data variable
    curl_close($ch);
    // Closing cURL
    return $data;
    // Returning the data from the function
}
// Defining the basic scraping function
function scrape_between($data, $start, $end)
{
    // Cut out the chunk of $data that runs from the first case-insensitive
    // occurrence of $start up to (but not including) $end.
    // NOTE: unlike the other variants in this project, this one deliberately
    // KEEPS $start in the returned snippet (the stripping step was disabled
    // by the original author).
    $fragment = stristr($data, $start);
    $cut = stripos($fragment, $end);
    return substr($fragment, 0, $cut);
}
// Search Goodreads for $query, isolate the results table, and print the
// href of the second anchor found (the first book link).
$html = scrape_between(curl("http://www.goodreads.com/search?utf8=%E2%9C%93&query=" . $query), "class=\"tableList", "/table");
$document = new DOMDocument();
$document->loadHTML($html);
$anchors = $document->getElementsByTagName("a");
$bookLink = $anchors->item(1)->getAttribute("href");
echo $bookLink;
Esempio n. 11
0
    $stop = stripos($data, $end);
    // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop);
    // Stripping all data from after and including the $end of the data to scrape
    return $data;
    // Returning the scraped data from the function
}
// Scrape two pages of Amazon reviews and print rating + text for each.
for ($i = 1; $i < 3; $i++) {
    $url = 'http://www.amazon.com/Motorola-Moto-3rd-generation-Unlocked/product-reviews/B00ZQVSKSM/ref=cm_cr_pr_btm_link_1?ie=UTF8&pageNumber=' . $i . '&sortBy=recent&reviewerType=all_reviews&formatType=all_formats&filterByStar=all_stars';
    // Download the results page using our curl() function
    $results_page = curl($url);
    // Keep only the review-list section of the page
    $results_page = scrape_between($results_page, "<div id=\"cm_cr-review_list\" class=\"a-section a-spacing-none reviews celwidget\">", "<div class=\"a-form-actions a-spacing-top-extra-large\">");
    // Exploding the results into separate parts into an array
    $separate_results = explode("<div class=\"a-row helpful-votes-count\">", $results_page);
    // BUG FIX: the original's bare "$array;" statement was a no-op, so $array
    // accumulated across pages and every page re-printed all earlier reviews.
    // Reset it for each page.
    $array = array();
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $array[] = array('rating' => scrape_between($separate_result, "<i class=\"a-icon a-icon-star a-star-", " review-rating\""), 'review' => scrape_between($separate_result, "<span class=\"a-size-base review-text\">", "</span>"));
        }
    }
    foreach ($array as $key) {
        echo '<p>Rating: ' . $key['rating'] . '</p>';
        echo '<p>' . $key['review'] . '</p>';
        echo "<hr>";
    }
}
Esempio n. 12
0
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // follow redirects if any
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // max. seconds to execute
    curl_setopt($ch, CURLOPT_FAILONERROR, 1);
    // stop when it encounters an error
    // curl_setopt($ch, CURLOPT_PROXY, $proxy);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    $res = curl_exec($ch);
    if ($res === false) {
        echo 'Curl error #' . curl_errno($ch) . ': ' . curl_error($ch);
    }
    return $res;
}
function scrape_between($data, $start, $end)
{
    // Return the substring of $data strictly between the first (case-
    // insensitive) occurrences of $start and $end, or "" when either marker
    // is missing. The explicit guards replace the original's reliance on
    // substr()/stripos() coercing a false haystack (deprecated in PHP 8).
    $from = stripos($data, $start);
    if ($from === false) {
        return "";
    }
    // Drop everything up to and including $start.
    $data = substr($data, $from + strlen($start));
    $stop = stripos($data, $end);
    if ($stop === false) {
        return "";
    }
    return substr($data, 0, $stop);
}
// Fetch the public Facebook page and pull the numeric profile id out of the
// "r.php?profile_id=...&" link embedded in the markup.
// NOTE(review): getHTML's second argument appears to be a timeout (0 = none)
// based on the partial helper definition above — confirm against the full
// helper before changing it.
$result = getHTML("https://www.facebook.com/raysoflovearpan", 0);
$id = scrape_between($result, "r.php?profile_id=", "&");
echo $id;
Esempio n. 13
0
File: curl.php Progetto: jassyr/test
// Assigning the URL we want to scrape to the variable $url
// While $continue is TRUE, i.e. there are more search results pages
// Walk IMDb result pages, collecting each title's URL, until no 'Next' link.
// NOTE(review): $continue and $url must be initialised by code above this view.
while ($continue == TRUE) {
    $results_page = curl($url);
    // Download the results page using our curl() function
    $results_page = scrape_between($results_page, "<div id=\"main\">", "<div id=\"sidebar\">");
    // Keep only the middle section of the page that holds the results
    $separate_results = explode("<td class=\"image\">", $results_page);
    // For each separate result, scrape the title URL
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $results_urls[] = "http://www.imdb.com" . scrape_between($separate_result, "href=\"", "\" title=");
        }
    }
    // BUG FIX: strpos() returns 0 (falsy) for a match at offset 0; compare
    // against false explicitly so such a match is not treated as "absent".
    if (strpos($results_page, "Next&nbsp;&raquo;") !== false) {
        $continue = TRUE;
        $url = scrape_between($results_page, "<span class=\"pagination\">", "</span>");
        if (strpos($url, "Prev</a>") !== false) {
            $url = scrape_between($url, "Prev</a>", ">Next");
        }
        $url = "http://www.imdb.com" . scrape_between($url, "href=\"", "\"");
    } else {
        $continue = FALSE;
        // No 'Next' link: stop paging
    }
    sleep(rand(3, 5));
    // Sleep for 3 to 5 seconds. Useful if not using proxies.
}
Esempio n. 14
0
// Split the page into per-listing chunks and collect each listing's link and
// date. (A large mass of commented-out pagination code copy-pasted from the
// IMDb example was removed here — it was dead and never executed.)
$separate_results = explode('</p>', $results_page);
// BUG FIX: initialise the accumulator before appending to it, so a page with
// no results yields an empty array instead of an undefined-variable notice.
$linkArray = array();
foreach ($separate_results as $result) {
    $link = scrape_between($result, 'href="', '"');
    $date = scrape_between($result, 'date', '</');
    $linkArray[] = array('link' => 'http://sfbay.craigslist.org' . $link, 'date' => $date);
}
var_dump($linkArray);
Esempio n. 15
0
{
    $scrape_from_start = stristr($html, $start);
    $scrape_end_to_start = stristr($scrape_from_start, $end, true);
    return $scrape_end_to_start;
    //$start is included in the scrape; $end is not
}
// strip links
function strip_links($html)
{
    // Remove anchor markup while keeping the link text: closing tags go
    // first, then any opening <a ... href ...> tag.
    return preg_replace('/<a[^>]+href[^>]+>/', '', str_replace('</a>', '', $html));
}
$url = "http://services.parliament.uk/calendar/#!/calendar/Commons/MainChamber/2015/10/30/events.html";
// Download the Commons calendar page and cut out the events container.
$raw_html = curl($url);
// NOTE(review): this call passes (start, end, data) — the local scrape_between
// variant above (whose header is cut off) evidently takes its haystack last,
// unlike the ($data, $start, $end) variants elsewhere in this project; confirm
// the parameter order before reusing this snippet.
$calendar = scrape_between("<div id=\"events-output\">", "</div>", $raw_html);
?>
</header>

<div class="content_wrapper">

<section id="home">
	<article>
		<div id="swapper_container">
			<h3><i>The Cabinet</i></h3>
			<div class="image_swapper">
				<button id="left_button"><--</button>
				<img id="selected_image" class="swap_image" src="the_parties/leaders/conservatives/david_cameron.jpg">
				<img class="swap_image" src="the_parties/leaders/conservatives/george_osborne.jpg">
				<img class="swap_image" src="the_parties/leaders/conservatives/theresa_may.jpg">
				<button id="right_button">--></button>
Esempio n. 16
0
{
    $scrape_from_start = stristr($html, $start);
    $scrape_end_to_start = stristr($scrape_from_start, $end, true);
    return $scrape_end_to_start;
    //$start is included in the scrape; $end is not
}
// strip links
function strip_links($html)
{
    // Strip anchors but keep their inner text.
    $withoutClosers = str_replace('</a>', '', $html);   // drop every </a>
    // Then erase any opening <a ... href ...> tag.
    return preg_replace('/<a[^>]+href[^>]+>/', '', $withoutClosers);
}
$url = "http://services.parliament.uk/bills/";
$raw_html = curl($url);
// NOTE(review): (start, end, data) argument order — matches the local
// scrape_between variant above, not the ($data, $start, $end) variants
// elsewhere in this project; confirm before reuse.
$bills_table = scrape_between("<tbody>", "</tbody>", $raw_html);
// to match all Commons Bills and date and store in $commons_matches
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bBill)\\s+[^[<][^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $commons_matches);
// to match all House of Lord Bills and date and store in $lords_matches
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bBill)\\s\\[HL\\][^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $lords_matches);
// to match all passed law "Acts" and date and store in $passed_matches
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bAct\\s+[\\d]+)[^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $passed_matches);
// count the size of commons and lords arrays to be used later in for loop. No need to count both groups in the array as they are the same size
$commons_count = count($commons_matches[1]);
$lords_count = count($lords_matches[1]);
$passed_count = count($passed_matches[1]);
?>
</header>

<div class="content_wrapper">
	<div class="sidebar">
Esempio n. 17
0
}
// Defining the basic scraping function
function scrape_between($data, $start, $end)
{
    // Return the substring of $data strictly between the first (case-
    // insensitive) occurrences of $start and $end, or "" when either marker
    // is missing. The explicit guards replace the original's reliance on
    // substr()/stripos() coercing a false haystack (deprecated in PHP 8).
    $from = stripos($data, $start);
    if ($from === false) {
        return "";
    }
    // Drop everything up to and including $start.
    $data = substr($data, $from + strlen($start));
    $stop = stripos($data, $end);
    if ($stop === false) {
        return "";
    }
    return substr($data, 0, $stop);
}
// Download the book's cover image from Goodreads into ../static/images/,
// falling back to a local placeholder when no cover can be found.
$where = "../static/images/" . $bkname . ".jpg";
$data = curl("http://www.goodreads.com" . $myurl);
// Keep only the cover block of the page
$data = scrape_between($data, "bookCoverPrimary\">", "</div>");
$src = "";
if ($data != "") {
    $doc = new DOMDocument();
    $doc->loadHTML($data);
    // Guard: getElementById() returns NULL when the id is absent; the
    // original called getAttribute() on it unconditionally (fatal error).
    $img = $doc->getElementById("coverImage");
    if ($img) {
        $src = $img->getAttribute("src");
    }
}
if ($src != "") {
    echo "<img height = \"342\" id=\"cover\" src=\"" . $src . "\">";
    file_put_contents($where, curl($src));
} else {
    echo "<img height = \"342\" id=\"cover\" src=\"../static/images/notavailable.jpg\">";
    // BUG FIX: the original wrote the literal path string into the .jpg file;
    // copy the placeholder image's bytes instead.
    copy("../static/images/notavailable.jpg", $where);
}
Esempio n. 18
0
    $data = substr($data, strlen($start));
    // Stripping $start
    $stop = stripos($data, $end);
    // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop);
    // Stripping all data from after and including the $end of the data to scrape
    return $data;
    // Returning the scraped data from the function
}
$url = "http://www.imdb.com/search/title?genres=action";
// Download the results page using our curl() function
$results_page = curl($url);
// Keep only the middle section of the page that holds the results
$results_page = scrape_between($results_page, "<div id=\"main\">", "<div id=\"sidebar\">");
// Exploding the results into separate parts into an array
$separate_results = explode("<td class=\"image\">", $results_page);
// BUG FIX: the original's bare "$array;" statement was a no-op; initialise
// the accumulator explicitly before appending to it.
$array = array();
// For each separate result, scrape the title URL and title text
foreach ($separate_results as $separate_result) {
    if ($separate_result != "") {
        $array[] = array('url' => "http://www.imdb.com" . scrape_between($separate_result, "href=\"", "\" title="), 'title' => scrape_between($separate_result, "\" title=\"", "\">"));
    }
}
foreach ($array as $key) {
    echo "<a href='" . $key['url'] . "' >" . $key['title'] . "</a>";
    echo "<br>";
}
Esempio n. 19
0
// Defining the basic scraping function
function scrape_between($data, $start, $end)
{
    // Return the substring of $data strictly between the first (case-
    // insensitive) occurrences of $start and $end, or "" when either marker
    // is missing. The explicit guards replace the original's reliance on
    // substr()/stripos() coercing a false haystack (deprecated in PHP 8).
    $from = stripos($data, $start);
    if ($from === false) {
        return "";
    }
    // Drop everything up to and including $start.
    $data = substr($data, $from + strlen($start));
    $stop = stripos($data, $end);
    if ($stop === false) {
        return "";
    }
    return substr($data, 0, $stop);
}
// Fetch the Goodreads book page and print its description text.
$data = curl("http://www.goodreads.com" . $myurl);
// Keep only the description block of the page
$data = scrape_between($data, "<div id=\"description\"", "</div>");
if ($data != "") {
    $doc = new DOMDocument();
    $doc->loadHTML($data);
    $div = $doc->getElementsByTagName("span");
    // Prefer the second <span> (the full description) and fall back to the
    // first. Guard both: the original read ->nodeValue on item(0) without a
    // check, which is a fatal error when no <span> exists at all.
    if ($div->item(1)) {
        $mytext = $div->item(1)->nodeValue;
    } elseif ($div->item(0)) {
        $mytext = $div->item(0)->nodeValue;
    } else {
        $mytext = "Not Available";
    }
    echo $mytext;
} else {
    echo "Not Available";
}
//header('Location: ../../a.php');