// Assigning the URL we want to scrape to the variable $url
$pages = 1;
$page = 1;

// While $continue is TRUE, i.e. there are more search results pages.
// NOTE(review): $url and $continue must be initialised before this loop — confirm upstream.
while ($continue == TRUE) {
    echo "page " . $page++ . " ";
    $results_page = curl($url); // Downloading the results page using our curl() function
    // Scraping out only the middle section of the results page that contains our results
    $results_page = scrape_between($results_page, "<div class=\"col-sm-9\">", "</main>");
    // Exploding the results into separate parts into an array
    $separate_results = explode("<a class=\"news-release\" title=\"", $results_page);

    // For each separate result, scrape the URL
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $results_urls = scrape_between($separate_result, "href=\"", ".html\">") . ".html";
            $results_urls = (string) $results_urls;
        }
        // NOTE(review): when $separate_result is empty the previous URL is re-inserted,
        // and a fresh PDO connection is opened per result. Hoisting the connection
        // above the loop would be cheaper, but the loop close is outside this view.
        $servername = "localhost";
        $username = "******";
        $password = "";
        $dbname = "prnewswire";
        try {
            $conn = new PDO("mysql:host={$servername};dbname={$dbname}", $username, $password);
            // set the PDO error mode to exception
            $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
            // Fix: use a prepared statement with a bound parameter instead of
            // interpolating the scraped URL into the SQL string (injectable).
            $stmt = $conn->prepare("INSERT INTO urls (url)\n VALUES (:url)");
            $stmt->bindParam(':url', $results_urls);
            $stmt->execute();
            echo "New record created successfully" . "\n";
        } catch (PDOException $e) {
// Look up a book on Goodreads and return the relative URL of the best match.
//
// $query – free-text search string; it is URL-encoded before being sent.
// Returns the href of the second anchor in the results table, or the literal
// string "booknotfound" when the results table is empty or missing.
function geturl($query)
{
    // Fix: urlencode() handles spaces AND other reserved characters; the
    // original preg_replace('/\ /', '+', …) only converted spaces, so queries
    // containing '&', '#', '+', etc. corrupted the request URL.
    $query = urlencode($query);
    $data = curl("http://www.goodreads.com/search?utf8=%E2%9C%93&query=" . $query);
    // Keep only the search-results table markup
    $data = scrape_between($data, "class=\"tableList", "/table");
    if ($data != "") {
        $doc = new DOMDocument();
        $doc->loadHTML($data);
        $anchors = $doc->getElementsByTagName("a");
        // item(1): presumably skips the cover-image anchor and takes the title
        // link — TODO confirm against the live Goodreads markup.
        return $anchors->item(1)->getAttribute("href");
    }
    return "booknotfound";
}
// Check the UW outage page for service problems.
// Returns the raw HTML of the status <div> when the page reports that
// services are "not operating normally", otherwise boolean false.
function check_e_outage()
{
    $url = "https://www.washington.edu/cac/outages";
    $options = array(
        CURLOPT_RETURNTRANSFER => TRUE,            // return the body instead of echoing it
        CURLOPT_FOLLOWLOCATION => TRUE,            // follow redirects
        CURLOPT_AUTOREFERER    => TRUE,            // set Referer on redirects
        CURLOPT_CONNECTTIMEOUT => 120,             // seconds to wait for connect
        CURLOPT_TIMEOUT        => 120,             // seconds for the whole request
        CURLOPT_MAXREDIRS      => 10,              // stop redirect loops
        CURLOPT_USERAGENT      => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8",
        CURLOPT_URL            => $url,
    );
    $ch = curl_init();                 // Initialising cURL
    curl_setopt_array($ch, $options);  // Setting cURL's options from the array above
    $data = curl_exec($ch);            // Executing the request
    curl_close($ch);                   // Closing cURL
    $status = scrape_between($data, "<div class=\"status\">", "</body>");
    // Fix: compare strictly — a bare strpos() in boolean context would treat a
    // match at offset 0 as "not found".
    if (strpos($data, "not operating normally") !== false) {
        return $status;
    }
    return false;
}
// Build the list of Wikipedia pages (Spanish traffic-sign annexes) to scrape.
// Each entry: the page URL, an XPath for the caption nodes, an XPath for the
// thumbnail image src, the subclass id to store, and — for the "indicación"
// page only — an is_table flag because its layout is a wikitable, not a gallery.
// (Refactor: the ten entries were hand-duplicated; only the gallery/table index
// and subclass vary, so they are generated here instead.)
$sources = array();

// Reglamentación page: six gallery <ul> blocks, gallery index => subclass id.
$regulation_url = "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_reglamentaci%C3%B3n_de_Espa%C3%B1a";
$regulation_subclasses = array(1 => 1, 2 => 2, 3 => 2, 4 => 2, 5 => 4, 6 => 6);
foreach ($regulation_subclasses as $gallery => $subclass) {
    $sources[] = array(
        "url" => $regulation_url,
        "text_query" => "//ul[@class='gallery mw-gallery-traditional'][{$gallery}]/li[@class='gallerybox']/div/div[@class='gallerytext']",
        "image_query" => "//ul[@class='gallery mw-gallery-traditional'][{$gallery}]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src",
        "subclass" => $subclass,
    );
}

// Peligro (danger) page: a single gallery.
$sources[] = array(
    "url" => "https://es.wikipedia.org/wiki/Anexo:Señales_de_tráfico_de_peligro_de_España",
    "text_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='gallerytext']",
    "image_query" => "//ul[@class='gallery mw-gallery-traditional'][1]/li[@class='gallerybox']/div/div[@class='thumb']/div/a/img/@src",
    "subclass" => 3,
);

// Indicación page: three wikitables (text in column 2, image in column 1).
$indication_url = "https://es.wikipedia.org/wiki/Anexo:Se%C3%B1ales_de_tr%C3%A1fico_de_indicaci%C3%B3n_de_Espa%C3%B1a";
for ($table = 1; $table <= 3; $table++) {
    $sources[] = array(
        "url" => $indication_url,
        "text_query" => "//table[@class='wikitable'][{$table}]/tr[position()>1]/td[2]",
        "image_query" => "//table[@class='wikitable'][{$table}]/tr/td[1]//a/img/@src",
        "subclass" => 5,
        "is_table" => true,
    );
}

foreach ($sources as $source) {
    $html = file_get_contents($source['url']);
    $dom = new DOMDocument();
    // '@' silences the libxml warnings that Wikipedia's HTML5 markup triggers
    @$dom->loadHtml($html);
    $xpath = new DOMXPath($dom);
    $classes = array();
    $texts = $xpath->query($source['text_query']);
    $images = $xpath->query($source['image_query']);
    for ($i = 0; $i < $images->length; $i++) {
        // The table layout puts the sign id after the <br>, the gallery layout before it.
        if (isset($source['is_table']) && $source['is_table']) {
            $sign_id = trim(strip_tags(html_entity_decode(scrape_between(getInnerHTML($texts->item($i)), '<br>', '</b>'))));
            $sign_text = trim(strip_tags(html_entity_decode(scrape_between(getInnerHTML($texts->item($i)), '<b>', '<br>'))));
        } else {
            $sign_id = trim(strip_tags(html_entity_decode(scrape_between(getInnerHTML($texts->item($i)), '<b>', '<br>'))));
            $sign_text = trim(strip_tags(html_entity_decode(scrape_between(getInnerHTML($texts->item($i)), '<br>', '</center>'))));
        }
        $sign_id = str_replace(' ', '', $sign_id); // normalise: ids are stored without spaces
        if (empty($sign_id)) {
            $sign_id = NULL; // store NULL rather than an empty string
        }
        $image = "https:" . $images->item($i)->nodeValue; // srcs are protocol-relative
        $classes[$i]['spain_id'] = $sign_id;
        $classes[$i]['text'] = $sign_text;
        $classes[$i]['image'] = $image;
    }
    saveClasses($classes, $source['subclass']);
}

// Persist the scraped sign classes (body continues past this view).
function saveClasses($classes, $subclass)
{
    foreach ($classes as $class) {
<?php
// Defining the basic cURL function: fetch $url and return the response body.
function curl($url)
{
    $ch = curl_init(); // Initialising cURL
    curl_setopt($ch, CURLOPT_URL, $url); // Setting cURL's URL option with the $url variable passed into the function
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // Return the webpage data instead of echoing it
    $data = curl_exec($ch); // Executing the cURL request and assigning the returned data to the $data variable
    curl_close($ch); // Closing cURL
    return $data; // Returning the data from the function
}

// Download the Snapdeal product page and pull out the price fragment.
$scraped_website = curl("http://www.snapdeal.com/product/intex-10000-mah-itpb11-power/1403089232");
// Fix: the original passed the undefined variable $scraped_page here, so
// scrape_between() always received NULL; use the page we just downloaded.
$scraped_data = scrape_between($scraped_website, "payBlkBig", "</span>"); // Content between the price marker and its closing tag
echo $scraped_data; // Echoing the scraped price fragment
$results_date = date("h:i:sa"); // 12-hour timestamp for this scrape

// Fall back through progressively looser markers until an e-mail is found.
// Fix: the original conditions read `$x == "" or "0" or NULL or FALSE or 0`,
// which PHP parses as `($x == "") or ("0") or (NULL) …` — only the first term
// ever mattered — and its innermost `else` clobbered a *successful* match with
// the "no Contact info found" placeholder. empty() covers the same falsy set
// ("", "0", NULL, FALSE, 0) that the original list intended.
if (empty($results_email)) {
    $results_email = scrape_between($separate_result, "Email: ", " ");
}
if (empty($results_email)) {
    $results_email = scrape_between($separate_result, "email: ", " ");
}
if (empty($results_email)) {
    $results_email = scrape_between($separate_result, "e: ", " ");
}
if (empty($results_email)) {
    $results_email = scrape_between($separate_result, "Contact ", "</p>");
}
if (empty($results_email)) {
    $results_email = "no Contact info found"; // placeholder only when every marker failed
}

$results_website = scrape_between($separate_result, "RELATED LINKS", "<!--startclickprintexclude-->");

$servername = "localhost";
$username = "******";
$password = "";
$dbname = "prnewswire";
try {
    $conn = new PDO("mysql:host={$servername};dbname={$dbname}", $username, $password);
    // set the PDO error mode to exception
    $conn->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
    // prepare sql and bind parameters
    $stmt = $conn->prepare("INSERT INTO article (title,url,person,email,website,date)\n VALUES (:results_title,:results_url,:results_name,:results_email,:results_website,:results_date)");
    $stmt->bindParam(':results_title', $results_title);
    $stmt->bindParam(':results_url', $results_url);
    $stmt->bindParam(':results_name', $results_name);
    $stmt->bindParam(':results_email', $results_email);
    $stmt->bindParam(':results_website', $results_website);
}
// Extract whois fields from the raw whois response held in $scraped_page.
// NOTE(review): every call below passes only TWO arguments; this requires a
// scrape_between() variant whose $end parameter has a default (e.g. to
// end-of-line) — the three-argument copies elsewhere in this file would fatal
// here. Confirm which definition is actually in scope.
$state = scrape_between($scraped_page, "state:");
$creation = scrape_between($scraped_page, "created:");
if ($creation == '') {
    $creation = scrape_between($scraped_page, "Creation Date:"); // fallback marker for gTLD-style responses
}
$reg = scrape_between($scraped_page, "registrar:");
$deactivation = scrape_between($scraped_page, "deactivationdate:");
$delete = scrape_between($scraped_page, "date_to_delete:");
$release = scrape_between($scraped_page, "date_to_release:");
$modified = scrape_between($scraped_page, "modified:");
$expires = scrape_between($scraped_page, "expires:");
if ($expires == '') {
    $expires = scrape_between($scraped_page, "Registrar Registration Expiration Date:"); // fallback marker
}
$notfound = scrape_between($scraped_page, '"' . $d . '" not found.');
if ($notfound != '') {
    $state = "free"; // registry reports the domain does not exist
}
// Check if empty and format each date as Y-m-d
$creation = $creation != "" ? date_format(date_create($creation), 'Y-m-d') : "";
$deactivation = $deactivation != "" ? date_format(date_create($deactivation), 'Y-m-d') : "";
$delete = $delete != "" ? date_format(date_create($delete), 'Y-m-d') : "";
$release = $release != "" ? date_format(date_create($release), 'Y-m-d') : "";
$modified = $modified != "" ? date_format(date_create($modified), 'Y-m-d') : "";
$expires = $expires != "" ? date_format(date_create($expires), 'Y-m-d') : "";
// NOTE(review): $release is scraped and formatted but never added to $export below — intentional?
array_push($export, array("domain" => $d, "ns1" => $ns1, "ns2" => $ns2, "rname" => $rname, "state" => $state, "creation" => $creation, "reg" => $reg, "deactivation" => $deactivation, "delete" => $delete, "modified" => $modified, "expires" => $expires));
// Emit one HTML table row per domain
echo '<tr>';
echo '<td>' . $d . '</td>';
echo '<td>' . $ns1 . '</td>';
echo '<td>' . $ns2 . '</td>';
// Extract the offer ("selling") price fragment from a product page.
//
// $result – raw page HTML.
// Returns the text between the price span's opening tag and its close, or ""
// when the price marker is absent from the page.
function getOfferPrice($result)
{
    preg_match_all('/"selling-price-id" itemprop="price">(.*)</i', $result, $offer);
    // Fix: the original indexed $offer[0][0] unconditionally, raising a notice
    // (and passing NULL on to scrape_between) whenever the pattern did not match.
    if (empty($offer[0])) {
        return "";
    }
    return scrape_between($offer[0][0], '"selling-price-id" itemprop="price">', '</span><');
}
// Closing cURL
return $data;
}

// Defining the basic scraping function: the case-insensitive substring of
// $data that lies between $start (excluded) and $end (excluded).
// (The curl() function this chunk opens with is truncated above this view.)
function scrape_between($data, $start, $end) {
    $data = stristr($data, $start); // Stripping all data from before $start
    $data = substr($data, strlen($start)); // Stripping $start
    $stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
    return $data; // Returning the scraped data from the function
}

// Scrape the first 19 xkcd comics: collect image URL and title for each.
for ($i = 1; $i < 20; $i++) {
    $url = "http://xkcd.com/" . $i; // Assigning the URL we want to scrape to the variable $url
    $results_page = curl($url); // Downloading the results page using our curl() function
    $results_page = scrape_between($results_page, "<div id=\"comic\">", "</div>"); // Keep only the comic container
    $array; // NOTE(review): no-op statement — the [] append below creates $array; consider `$array = array();` before the loop
    $array[] = array('url' => "http:" . scrape_between($results_page, "src=\"", "\" title="), 'title' => scrape_between($results_page, "title=\"", "\" alt="));
}
// Print each comic as title + image
foreach ($array as $xkcd) {
    echo '<p>' . $xkcd['title'] . '</p>';
    echo '<img src="' . $xkcd['url'] . '">';
    echo '<br>';
}
curl_setopt($ch, CURLOPT_URL, $url); // Setting cURL's URL option with the $url variable passed into the function
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // Setting cURL's option to return the webpage data
$data = curl_exec($ch); // Executing the cURL request and assigning the returned data to the $data variable
curl_close($ch); // Closing cURL
return $data; // Returning the data from the function
}

// Defining the basic scraping function.
// NOTE(review): unlike the other scrape_between copies in this file, the line
// that strips $start from the result is commented out here, so the returned
// slice STARTS WITH $start itself — confirm this difference is intentional.
function scrape_between($data, $start, $end) {
    $data = stristr($data, $start); // Stripping all data from before $start
    //$data = substr($data, strlen($start)); // Stripping $start
    $stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
    return $data; // Returning the scraped data from the function
}

// Search Goodreads for $query (assumed already URL-safe — TODO confirm) and
// print the href of the second anchor found in the results table.
$data = curl("http://www.goodreads.com/search?utf8=%E2%9C%93&query=" . $query);
$data = scrape_between($data, "class=\"tableList", "/table");
$doc = new DOMDocument();
$doc->loadHTML($data); // NOTE(review): warns/fails when the table marker was not found
$div = $doc->getElementsByTagName("a");
$mytext = $div->item(1)->getAttribute("href"); // item(1) may be null if fewer than two links — not guarded
echo $mytext;
$stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
$data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
return $data; // Returning the scraped data from the function
}

// Scrape two pages of Amazon reviews for the Moto G and print rating + text.
$array = array(); // Fix: was a bare `$array;` no-op — $array stayed undefined if no results matched
for ($i = 1; $i < 3; $i++) {
    $url = 'http://www.amazon.com/Motorola-Moto-3rd-generation-Unlocked/product-reviews/B00ZQVSKSM/ref=cm_cr_pr_btm_link_1?ie=UTF8&pageNumber=' . $i . '&sortBy=recent&reviewerType=all_reviews&formatType=all_formats&filterByStar=all_stars'; // Assigning the URL we want to scrape to the variable $url
    $results_page = curl($url); // Downloading the results page using our curl() function
    // Keep only the review-list section of the page
    $results_page = scrape_between($results_page, "<div id=\"cm_cr-review_list\" class=\"a-section a-spacing-none reviews celwidget\">", "<div class=\"a-form-actions a-spacing-top-extra-large\">");
    // Split into one chunk per review
    $separate_results = explode("<div class=\"a-row helpful-votes-count\">", $results_page);
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $array[] = array('rating' => scrape_between($separate_result, "<i class=\"a-icon a-icon-star a-star-", " review-rating\""), 'review' => scrape_between($separate_result, "<span class=\"a-size-base review-text\">", "</span>"));
        }
    }
}
// Fix: this output loop sat INSIDE the page loop above, so page 1's reviews
// were printed a second time (together with page 2's) on the next iteration.
foreach ($array as $key) {
    echo '<p>Rating: ' . $key['rating'] . '</p>';
    echo '<p>' . $key['review'] . '</p>';
    echo "<hr>";
}
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow redirects if any
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout); // max. seconds to execute
curl_setopt($ch, CURLOPT_FAILONERROR, 1); // stop when it encounters an error
// curl_setopt($ch, CURLOPT_PROXY, $proxy);
// SECURITY NOTE(review): disabling SSL verification permits man-in-the-middle
// interception — acceptable only for throwaway scraping, never for real data.
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
$res = curl_exec($ch);
if ($res === false) {
    echo 'Curl error #' . curl_errno($ch) . ': ' . curl_error($ch);
}
return $res;
}

// Defining the basic scraping function: the case-insensitive substring of
// $data between $start (excluded) and $end (excluded).
function scrape_between($data, $start, $end) {
    $data = stristr($data, $start); // Stripping all data from before $start
    $data = substr($data, strlen($start)); // Stripping $start
    $stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
    return $data; // Returning the scraped data from the function
}

// Pull the numeric profile id out of a Facebook page, using the report-link
// ("r.php?profile_id=…") marker. getHTML() is the function truncated above.
$result = getHTML("https://www.facebook.com/raysoflovearpan", 0);
$id = scrape_between($result, "r.php?profile_id=", "&");
echo $id;
// Assigning the URL we want to scrape to the variable $url
// While $continue is TRUE, i.e. there are more search results pages.
// NOTE(review): $url and $continue must be initialised before this loop — confirm upstream.
while ($continue == TRUE) {
    $results_page = curl($url); // Downloading the results page using our curl() function
    $results_page = scrape_between($results_page, "<div id=\"main\">", "<div id=\"sidebar\">"); // Keep only the results section
    $separate_results = explode("<td class=\"image\">", $results_page); // One chunk per title row

    // For each separate result, scrape the URL
    foreach ($separate_results as $separate_result) {
        if ($separate_result != "") {
            $results_urls[] = "http://www.imdb.com" . scrape_between($separate_result, "href=\"", "\" title="); // Page ID appended to the IMDb base URL
        }
    }

    // Searching for a 'Next' link. If it exists scrape the url and set it as $url for the next loop.
    // Fix: bare strpos() in boolean context treats a match at offset 0 as "not found" —
    // compare against false explicitly.
    if (strpos($results_page, "Next »") !== false) {
        $continue = TRUE;
        $url = scrape_between($results_page, "<span class=\"pagination\">", "</span>");
        if (strpos($url, "Prev</a>") !== false) {
            $url = scrape_between($url, "Prev</a>", ">Next"); // skip past the 'Prev' link when present
        }
        $url = "http://www.imdb.com" . scrape_between($url, "href=\"", "\"");
    } else {
        $continue = FALSE; // No 'Next' link — stop after this pass
    }
    sleep(rand(3, 5)); // Sleep for 3 to 5 seconds. Useful if not using proxies. We don't want to get into trouble.
}
// Scraping out only the middle section of the results page that contains our results
$separate_results = explode('</p>', $results_page); // One chunk per listing paragraph
// (A large block of dead, commented-out IMDb-tutorial pagination code was removed here.)
// Build an array of absolute listing links with their dates.
foreach ($separate_results as $result) {
    $link = scrape_between($result, 'href="', '"'); // relative listing URL
    $date = scrape_between($result, 'date', '</'); // NOTE(review): crude marker — verify against the live craigslist markup
    $linkArray[] = array('link' => 'http://sfbay.craigslist.org' . $link, 'date' => $date);
}
var_dump($linkArray);
{
    // (The function header is above this view. From the body: takes a haystack
    // $html and returns the case-insensitive span from $start up to — but not
    // including — $end.)
    $scrape_from_start = stristr($html, $start);
    $scrape_end_to_start = stristr($scrape_from_start, $end, true);
    return $scrape_end_to_start; //$start is included in the scrape; $end is not
}

// strip links: remove anchor open/close tags while keeping their inner text
function strip_links($html) {
    $html = str_replace('</a>', '', $html);
    $html = preg_replace('/<a[^>]+href[^>]+>/', '', $html);
    return $html;
}

// Fetch the Commons calendar page and cut out the events container.
$url = "http://services.parliament.uk/calendar/#!/calendar/Commons/MainChamber/2015/10/30/events.html";
$raw_html = curl($url);
// NOTE(review): arguments here are (start, end, html); the visible body above
// reads ($html, $start, $end) — confirm the unseen parameter order matches this
// call before changing either side.
$calendar = scrape_between("<div id=\"events-output\">", "</div>", $raw_html);
?>
</header>
<div class="content_wrapper">
<section id="home">
<article>
<div id="swapper_container">
<h3><i>The Cabinet</i></h3>
<div class="image_swapper">
<button id="left_button"><--</button>
<img id="selected_image" class="swap_image" src="the_parties/leaders/conservatives/david_cameron.jpg">
<img class="swap_image" src="the_parties/leaders/conservatives/george_osborne.jpg">
<img class="swap_image" src="the_parties/leaders/conservatives/theresa_may.jpg">
<button id="right_button">--></button>
{
    // (The function header is above this view; see the parameter-order note on
    // the scrape_between call further down.)
    $scrape_from_start = stristr($html, $start);
    $scrape_end_to_start = stristr($scrape_from_start, $end, true);
    return $scrape_end_to_start; //$start is included in the scrape; $end is not
}

// strip links: remove anchor open/close tags while keeping their inner text
function strip_links($html) {
    $html = str_replace('</a>', '', $html);
    $html = preg_replace('/<a[^>]+href[^>]+>/', '', $html);
    return $html;
}

// Fetch the current Parliament bills page and cut out the bills table body.
$url = "http://services.parliament.uk/bills/";
$raw_html = curl($url);
// NOTE(review): arguments are (start, end, html) — confirm against the unseen
// scrape_between() header, whose visible body reads $html/$start/$end.
$bills_table = scrape_between("<tbody>", "</tbody>", $raw_html);
// Match all Commons Bills and dates; captures: [1] href, [2] "... Bill" title, [3] dd.mm.2yyy date
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bBill)\\s+[^[<][^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $commons_matches);
// Match all House of Lords Bills (marked "[HL]") and dates; same capture layout
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bBill)\\s\\[HL\\][^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $lords_matches);
// Match all passed law "Acts" and dates; same capture layout
preg_match_all("/href=\"([^\"]*?)\">\\s*?\\b([^\n]*?\\bAct\\s+[\\d]+)[^\\d]*([0-3]\\d\\.[0-1]\\d\\.2\\d{3})/", $bills_table, $passed_matches);
// Count each group's hrefs for the display loops below; capture groups within a
// match set are always the same size, so counting group [1] suffices.
$commons_count = count($commons_matches[1]);
$lords_count = count($lords_matches[1]);
$passed_count = count($passed_matches[1]);
?>
</header>
<div class="content_wrapper">
<div class="sidebar">
}

// Defining the basic scraping function: the case-insensitive substring of
// $data between $start (excluded) and $end (excluded).
function scrape_between($data, $start, $end)
{
    $data = stristr($data, $start); // Stripping all data from before $start
    $data = substr($data, strlen($start)); // Stripping $start
    $stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
    return $data; // Returning the scraped data from the function
}

// Download the Goodreads book page, then display and locally cache its cover image.
$where = "../static/images/" . $bkname . ".jpg"; // local cache path for this book's cover
$data = curl("http://www.goodreads.com" . $myurl);
$data = scrape_between($data, "bookCoverPrimary\">", "</div>"); // keep only the cover block

$img = null;
if ($data != "") {
    $doc = new DOMDocument();
    $doc->loadHTML($data);
    // Fix: guard the lookup — getElementById() returns null when the markup
    // changes, and the original called ->getAttribute() on it unconditionally.
    $img = $doc->getElementById("coverImage");
}
if ($img !== null) {
    $src = $img->getAttribute("src");
    echo "<img height = \"342\" id=\"cover\" src=\"" . $src . "\">";
    file_put_contents($where, curl($src)); // cache the downloaded cover bytes
} else {
    echo "<img height = \"342\" id=\"cover\" src=\"../static/images/notavailable.jpg\">";
    // Fix: the original wrote the placeholder's *path string* into the .jpg
    // file; copy the placeholder image's bytes so $where stays a valid image.
    copy("../static/images/notavailable.jpg", $where);
}
$data = substr($data, strlen($start)); // Stripping $start
$stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
$data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
return $data; // Returning the scraped data from the function
}

// Scrape the IMDb action-genre search results and print one link per title.
$url = "http://www.imdb.com/search/title?genres=action"; // Assigning the URL we want to scrape to the variable $url
$results_page = curl($url); // Downloading the results page using our curl() function
$results_page = scrape_between($results_page, "<div id=\"main\">", "<div id=\"sidebar\">"); // Keep only the results section
$separate_results = explode("<td class=\"image\">", $results_page); // One chunk per title row

// For each separate result, scrape the URL and the title
$array = array(); // Fix: was a bare `$array;` no-op — $array stayed undefined if no results matched
foreach ($separate_results as $separate_result) {
    if ($separate_result != "") {
        $array[] = array('url' => "http://www.imdb.com" . scrape_between($separate_result, "href=\"", "\" title="), 'title' => scrape_between($separate_result, "\" title=\"", "\">"));
    }
}
// Print each scraped title as a link
foreach ($array as $key) {
    echo "<a href='" . $key['url'] . "' >" . $key['title'] . "</a>";
    echo "<br>";
}
// Defining the basic scraping function: the case-insensitive substring of
// $data between $start (excluded) and $end (excluded).
function scrape_between($data, $start, $end)
{
    $data = stristr($data, $start); // Stripping all data from before $start
    $data = substr($data, strlen($start)); // Stripping $start
    $stop = stripos($data, $end); // Getting the position of the $end of the data to scrape
    $data = substr($data, 0, $stop); // Stripping all data from after and including the $end of the data to scrape
    return $data; // Returning the scraped data from the function
}

// Fetch the Goodreads book page and print its description text.
$data = curl("http://www.goodreads.com" . $myurl);
$data = scrape_between($data, "<div id=\"description\"", "</div>"); // keep only the description block
if ($data != "") {
    $doc = new DOMDocument();
    $doc->loadHTML($data);
    $spans = $doc->getElementsByTagName("span");
    // Prefer the second span when present — presumably the full (untruncated)
    // description; TODO confirm against the live Goodreads markup.
    $node = $spans->item(1) !== null ? $spans->item(1) : $spans->item(0);
    // Fix: guard against *no* span at all — the original called ->nodeValue on
    // null in that case and fataled.
    echo $node !== null ? $node->nodeValue : "Not Available";
} else {
    echo "Not Available";
}
//header('Location: ../../a.php');