/**
 * Scrapes one page of a brand's model listing, saves every model it finds to
 * the "cell_model" table (keyed on "id"), and then follows the "Next page"
 * link, if the page has one.
 *
 * Side effects: overwrites $this->html with the parsed page and increments
 * $this->models once per saved model.
 *
 * @param mixed  $brandId   Stored verbatim in each record's "brand_id" field.
 * @param string $brandName Prepended (with a space) to each model name.
 * @param string $page      URL of the listing page to scrape.
 */
function parseModelsPage($brandId, $brandName, $page) {
    $html_content = scraperwiki::scrape($page);
    $this->html = str_get_html($html_content);
    if (!$this->html) {
        // The page could not be parsed into a DOM object; nothing to extract.
        return;
    }

    foreach ($this->html->find("div.makers a") as $el) {
        $img = $el->find('img', 0);
        // The numeric id is the second '-'-separated piece of the href minus
        // its last four characters (presumably a file extension — TODO confirm).
        $temp = explode('-', $el->href);
        // Build a fresh record per model so no field can carry over from the
        // previous iteration.
        $m = array(
            'name' => $brandName . ' ' . $el->find('strong', 0)->innertext,
            'img' => $img->src,
            'link' => 'http://www.gsmarena.com/' . $el->href,
            'desc' => $img->title,
            'id' => isset($temp[1]) ? (int) substr($temp[1], 0, -4) : 0,
            'brand_id' => $brandId,
        );
        scraperwiki::save_sqlite(array("id" => $m['id']), $m, "cell_model");
        $this->models++;
    }

    // Work out the next page BEFORE releasing the DOM: the recursive call
    // replaces $this->html, so clearing afterwards would hit the wrong object
    // and leave this page's DOM allocated.
    $nextPageUrl = null;
    $pagination = $this->html->find("div.nav-pages", 0);
    if ($pagination) {
        $nextPageLink = $pagination->lastChild();
        if ($nextPageLink && $nextPageLink->title == "Next page") {
            $nextPageUrl = 'http://www.gsmarena.com/' . $nextPageLink->href;
        }
    }

    $this->html->clear();

    if ($nextPageUrl !== null) {
        $this->parseModelsPage($brandId, $brandName, $nextPageUrl);
    }
}
/**
 * Scrapes one club page and saves one record per season row.
 *
 * The club name is read from the third row of the first table; season data is
 * read from the rows of the third table whose first cell is numeric. Each
 * record is saved with the unique key (club, year). The club name is also
 * appended to the global $clubs list and echoed as progress output.
 *
 * @param string $url URL of the club page.
 */
function clubURL($url) {
    $html = scraperwiki::scrape($url);
    $dom = new simple_html_dom();
    $dom->load($html);

    // NOTE(review): the ' ' search strings below may originally have been a
    // non-breaking space or '&nbsp;' — verify against the live markup.
    $clubName = trim(str_replace(' ', '', $dom->find('table', 0)->find('tr', 2)->plaintext));
    $formatClubName = trim(preg_replace('/\\s+/', ' ', $clubName));

    // $GLOBALS is the real superglobal; the previous "$_GLOBAL" was just a
    // local array, so the list of clubs was silently discarded.
    $GLOBALS['clubs'][] = $formatClubName;
    echo 'running ' . $formatClubName . "\n";

    foreach ($dom->find('table', 2)->find('tr') as $row) {
        $yearCell = $row->find('td', 0);
        // Skip rows without data cells and rows whose first cell is not a year.
        if (!$yearCell || !is_numeric($yearCell->plaintext)) {
            continue;
        }

        $year = trim($yearCell->plaintext);
        $position = trim(str_replace(' ', '', $row->find('td', 1)->plaintext));
        if (trim($position) == 'Champion') {
            $position = 1;
        }
        $leagueLevel = trim($row->find('td', 2)->plaintext);
        $overallPosition = trim($row->find('td', 3)->plaintext);
        // Dots are stripped from the attendance figures before saving.
        $avgAttendance = trim(str_replace('.', '', $row->find('td', 4)->plaintext));
        $totalAttendance = trim(str_replace('.', '', $row->find('td', 12)->plaintext));

        $dataset = array(
            'club' => $formatClubName,
            'year' => $year,
            'finishedPosition' => $position,
            'league' => $leagueLevel,
            'overallPosition' => $overallPosition,
            'avgAttendance' => $avgAttendance,
            'totalAttendance' => $totalAttendance,
        );
        scraperwiki::save(array('club', 'year'), $dataset);
    }

    /*
     * The next two lines stop a memory leak in Simple HTML DOM as per
     * http://simplehtmldom.sourceforge.net/manual_faq.htm#memory_leak
     */
    $dom->clear();
    unset($dom);
}
/**
 * Recursively walks a category tree starting at $u.
 *
 * For every refinement entry found on the page, a (Name, Path, URL) record is
 * either appended to the CSV handle $f (when the global $local is truthy) or
 * saved to the datastore keyed on "URL"; the function then recurses into that
 * entry's URL. Pages without the facet container end the recursion.
 *
 * Globals read: $baseurl (URL prefix), $f (CSV file handle), $local (output switch).
 *
 * @param string $u URL of the category page to load.
 */
function getCategories($u) {
    // $local must be imported too: without it the CSV branch below could never
    // run, because the function-local $local was always undefined.
    global $baseurl, $f, $local;

    $path = "";
    $d = new simple_html_dom();
    $d->load(scraperwiki::scrape($u));
    echo "Loaded URL: " . $u . "\n";

    $container = $d->find('div[id=ctl00_cphContent_gsaCatFacetContainer]', 0);
    if (!$container) {
        return;
    }

    // Build the path from the breadcrumb: each child element's text followed
    // by "/", then whatever follows the last ">" in the breadcrumb markup.
    $breadcrumb = $d->find('div[id=breadcrumb]', 0);
    if (!is_null($breadcrumb)) {
        foreach ($breadcrumb->children() as $crumb) {
            $path .= trim($crumb->innertext) . "/";
        }
        $path .= trim(strrchr($breadcrumb->innertext, ">"), "> ");
    }

    $refinements = $container->find('div[class=S2refinementsContainer]', 0);
    if (!$refinements) {
        return;
    }

    foreach ($refinements->children() as $div) {
        // The name is the link text up to (not including) the first "(".
        $name = trim(strstr($div->children(0)->innertext, "(", true));
        $url = $baseurl . $div->children(0)->href;
        $data = array("Name" => $name, "Path" => $path, "URL" => $url);
        echo $path . "/" . $name . "\n";
        if ($local) {
            fputcsv($f, array($name, $path, $url));
        } else {
            scraperwiki::save_sqlite(array("URL"), $data);
        }
        getCategories($url);
    }
}
/**
 * Follows the second "callout" link of a search-result page to a recipe page
 * and saves every ingredient listed there, keyed on "ing".
 *
 * @param string $html Markup of the search-result page.
 */
function getIngredients($html) {
    $dom = new simple_html_dom();
    $dom->load($html);

    $callout = $dom->find('a[class=callout]', 1);
    if (!$callout) {
        // No second callout link on this page, so there is no recipe to follow.
        return;
    }

    // Drop the "reviews/" segment from the link before fetching it.
    $res = str_replace("reviews/", "", $callout->href);
    echo "http://www.foodnetwork.com" . $res;
    $html1 = scraperwiki::scrape("http://www.foodnetwork.com" . $res);
    $h = str_get_html($html1);
    echo "\n\n";
    if (!$h) {
        return;
    }

    // Defined up front so the save below never reads an undefined variable.
    // NOTE(review): the check is made on the document object $h, not on the
    // ingredient node — confirm which element's href was intended.
    $href = null;
    foreach ($h->find('li[class=ingredient]') as $data) {
        $ingredient = $data->plaintext;
        if (isset($h->href)) {
            $href = $h->href;
        }
        scraperwiki::save(array('ing'), array('ing' => $ingredient, 'href' => $href));
    }
}
/**
 * Writes one CSV row per product on a grid listing page and follows the
 * "next" link recursively.
 *
 * Each row is (name, type, category, URL), where type is 1 when the product
 * has a "minimal-price" paragraph and 0 otherwise. Rows go to the global CSV
 * handle $o; each product name is also echoed.
 *
 * @param string $u   URL of the listing page.
 * @param mixed  $cat Category value written into every row.
 */
function getProducts($u, $cat) {
    global $o;

    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($u));

    $gridItems = $page->find('li.grid-item');
    if (count($gridItems) <= 0) {
        return;
    }

    foreach ($gridItems as $item) {
        $link = $item->find('p.product-name > a', 0);
        $title = trim($link->innertext);
        $address = $link->href;
        $kind = is_null($item->find('p.minimal-price', 0)) ? 0 : 1;
        fputcsv($o, array($title, $kind, $cat, $address));
        echo $title . "\n";
    }

    // Pagination is only checked when the current page had products.
    $next = $page->find('p.next', 0);
    if (!is_null($next)) {
        getProducts($next->href, $cat);
    }
}
/**
 * Fetches the detail page for one record id and saves the fields extracted
 * from its labelled <span> elements.
 *
 * Every field is captured with a regular expression built from the span's id
 * suffix; a field that does not match is stored as an empty string.
 *
 * @param mixed $id Record id appended to the detail URL and stored as "id".
 */
function ripById($id) {
    $detailsUrl = 'http://beheshtezahra.tehran.ir/Default.aspx?tabid=92&ctl=SearchDetails&mid=653&srid=' . $id;
    $page = scraperwiki::scrape($detailsUrl);

    // Output column => id suffix of the <span> that holds the value.
    $spanSuffixes = array(
        'firstname' => 'lblNameBound_0',
        'surname' => 'lblLastNameBound_0',
        'fathername' => 'lblFatherNameBound_0',
        'birthdate' => 'lblBirthDateBound_0',
        'deathdate' => 'lblDafnDateBound_0',
        'deathplace' => 'lblDeastTownshipTitle_0',
        'graveplace' => 'lblDafnPlace_0',
    );

    $record = array('id' => $id);
    foreach ($spanSuffixes as $column => $suffix) {
        $pattern = '/<span id="dnn_ctr653_SearchDetails_dtlDetail_' . $suffix . '"><b>(.*)<\\//smiU';
        preg_match($pattern, $page, $hit);
        $record[$column] = isset($hit[1]) ? $hit[1] : '';
    }

    // NOTE(review): the unique key 'data' is not a column of the record —
    // confirm this is intended.
    scraperwiki::save_sqlite(array('data'), $record);
}
/**
 * Debug helper: loads the page at $rec['url'], collects the text nodes under
 * the parent of the <a name="discs"> anchor, and prints them.
 *
 * Output: the node at index 10, the node count, then for every node whose
 * string length is not 3 a "<index> <length>" line followed by the node.
 *
 * @param array $rec Record holding the page address under the 'url' key.
 */
function do_day($rec) {
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($rec['url']));

    $anchors = $page->find('a[name=discs]');
    $textNodes = $anchors[0]->parent->find('text');

    print $textNodes[10] . "\n";
    print count($textNodes) . "\n";

    // Walk by index rather than foreach, because null entries stop a foreach.
    $printed = 0;
    $index = 0;
    while ($index < count($textNodes)) {
        $node = $textNodes[$index];
        $length = strlen($node);
        // Nodes of length 3 crash the DOM object, so they are skipped.
        if ($length != 3) {
            print $index . " " . $length . "\n";
            $printed = $printed + 1;
            print $node . "\n";
        }
        $index++;
    }
}
/**
 * Scrapes one product page and saves its name, description, price, image and
 * category to the "products_support" table.
 *
 * The category is derived from the product URL: the fixed site prefix is
 * removed, the directory part is taken, and known category slugs are replaced
 * by their spaced-out display names. The global counter $total is used as the
 * record id and incremented after each save. Does nothing for an empty link.
 *
 * @param string $product_link Absolute URL of the product page.
 */
function handle_products($product_link) {
    global $base_url_host, $base_url_scheme, $total;

    if (empty($product_link)) {
        return;
    }

    // URL slug => display name; replacements are applied in this order.
    $categoryNames = array(
        "Base-Racks/Feet" => "Base Racks/Feet",
        "Base-Racks/LoadAccessories" => "Base Racks/Load Accessories",
        "Base-Racks/LoadBars" => "Base Racks/Load Bars",
        "Bike-Carriers/Accessories" => "Bike Carriers/Accessories",
        "Bike-Carriers/Hitch" => "Bike Carriers/Hitch",
        "Bike-Carriers/RearDoor" => "Bike Carriers/Rear Door",
        "Bike-Carriers/RoofCarriers" => "Bike Carriers/Roof Carriers",
        "Bike-Carriers/SpareTire" => "Bike Carriers/Spare Tire",
        "Bike-Carriers/TruckBed" => "Bike Carriers/Truck Bed",
        "Cargo-Carriers/Bags" => "Cargo Carriers/Bags",
        "Cargo-Carriers/Baskets" => "Cargo Carriers/Baskets",
        "Cargo-Carriers/Boxes" => "Cargo Carriers/Boxes",
        "Cargo-Carriers/HitchCargo" => "Cargo Carriers/Hitch Cargo",
        "Luggage/DaypacksAndMessengers" => "Luggage/Daypacks And Messengers",
        "Luggage/LaptopAndTablet" => "Luggage/Laptop And Tablet",
        "Luggage/LuggageAndDuffels" => "Luggage/Luggage And Duffels",
        "Snow-Chains/SnowChains" => "Snow Chains/Snow Chains",
        "Snowsports/Accessories" => "Snowsports/Accessories",
        "Snowsports/HitchSki" => "Snowsports/Hitch Ski",
        "Snowsports/SkiBoxes" => "Snowsports/Ski Boxes",
        "Snowsports/SkiCarriers" => "Snowsports/Ski Carriers",
        "Watersports/Accessories" => "Watersports/Accessories",
        "Watersports/WatersportCarriers" => "Watersports/Watersport Carriers",
    );

    $relativePath = str_replace("http://www.thule.com/en-US/US/Products/", "", $product_link);
    $cat = str_replace(array_keys($categoryNames), array_values($categoryNames), dirname($relativePath));

    $html = str_get_html(scraperwiki::scrape($product_link));

    // Nodes are cast to their markup by trim(); tags are stripped afterwards.
    $name_raw = trim($html->find("div[@class='column details_overview'] h2 span", 0));
    $name = "";
    if (!empty($name_raw)) {
        $name = strip_tags($name_raw);
    }

    $desc_raw = trim($html->find("div[@class='column details_overview'] h3 span", 0));
    $desc = "";
    if (!empty($desc_raw)) {
        $desc = strip_tags($desc_raw);
    }

    // Reduce the price text to the bare amount.
    $price_raw = trim($html->find("div[@class='pricing'] span[@id='phcontent_0_ctl00_lblPriceText']", 0));
    $price = str_replace("MSRP \$", "", strip_tags($price_raw));
    $price = trim(str_replace(" (USD)", "", $price));

    $image = $html->find("img[@id='imgProductBomImage_0']", 0)->src;
    echo "{$name}: {$image}\n";

    $record = array(
        'id' => $total,
        'product_name' => trim($name),
        'desciption' => trim($desc),
        'price' => $price,
        'img' => $image,
        'category' => $cat,
    );
    scraperwiki::save_sqlite(array('id'), array($record), "products_support", 2);

    // Advance the id counter for the next product.
    $total++;
}
/**
 * Enriches an NGO record with details scraped from its page.
 *
 * Every paragraph is checked for a "Website" link, an "Email" link, and each
 * label in the wanted-info list (via extractInfo()). Once a label has produced
 * a value it is not searched for again. The enriched record is printed and
 * returned.
 *
 * @param array $ngo Record with at least a "url" key.
 * @return array The same record with the discovered fields added.
 */
function scrapeDetails($ngo) {
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($ngo["url"]));

    // Labels still to be found; entries are removed as they are resolved.
    $pendingLabels = array(
        'Telefon',
        'Rechtsform',
        'Steuerstatus',
        'Weltanschauliche Ausrichtung',
        'Anzahl Mitarbeiter',
        'Gesamteinnahmen:',
        'Davon Sammlungseinnahmen',
        'Bezugsjahr:',
    );

    foreach ($page->find('p') as $paragraph) {
        if (strstr($paragraph->plaintext, "Website")) {
            $ngo["website"] = $paragraph->find('a', 0)->href;
        }
        if (strstr($paragraph->plaintext, "Email")) {
            $ngo["email"] = $paragraph->find('a', 0)->plaintext;
        }

        foreach ($pendingLabels as $index => $label) {
            $value = extractInfo($paragraph, $label);
            if (!$value) {
                continue;
            }
            $ngo[$label] = $value;
            // Found: stop looking for this label in later paragraphs.
            unset($pendingLabels[$index]);
        }
    }

    print_r($ngo);
    return $ngo;
}
/**
 * Fetches the form page for one item id and saves the values extracted from
 * its <input> elements.
 *
 * Each field is captured with its own regular expression; a field that does
 * not match is stored as an empty string.
 *
 * @param mixed $id Item id appended to the form URL and stored as "id".
 */
function ripById($id) {
    $pathToDetails = 'http://www.shborujen.ir/DesktopModules/eFormViewer/eFormViewerEdit.aspx?TabID=4753&Site=DouranPortal&MId=14286&Lang=fa-IR&ItemID=1&fID=1228&keyID=itemid%7C' . $id;
    $output = scraperwiki::scrape($pathToDetails);

    // Output column => pattern whose first capture group is the field value.
    $patterns = array(
        'firstname' => '/<input name="eFormEditData1228\\$field1421\\$controlToValidate_Field72\\$Field72_Value".*" value="(.*)".*>/smiU',
        'surname' => '/<input name="eFormEditData1228\\$field1415\\$controlToValidate_Field73\\$Field73_Value.*" value="(.*)".*>/smiU',
        'fathername' => '/<input name="eFormEditData1228\\$field1416\\$controlToValidate_Field74\\$Field74_Value.*value="(.*)".*>/smiU',
        'deathdate' => '/<input name="eFormEditData1228\\$field1418\\$ctl00\\$txt.*" value="(.*)".*>/smiU',
        'place' => '/<input name="eFormEditData1228\\$field1413\\$controlToValidate_Field77\\$Field77_Value.*" value="(.*)".*>/smiU',
        'row' => '/<input name="eFormEditData1228\\$field1434\\$controlToValidate_Field1434\\$Field1434_Value.*" value="(.*)".*>/smiU',
        'block' => '/<input name="eFormEditData1228\\$field1414\\$controlToValidate_Field78\\$Field78_Value.*" value="(.*)".*>/smiU',
        'grave' => '/<input name="eFormEditData1228\\$field1439\\$controlToValidate_Field1439\\$Field1439_Value.*" value="(.*)".*>/smiU',
    );

    $found = array();
    foreach ($patterns as $column => $pattern) {
        preg_match($pattern, $output, $temp);
        $found[$column] = isset($temp[1]) ? $temp[1] : '';
    }

    // No pattern extracts a birth date here. The column is still written, so
    // give it the same "not found" default as the other fields instead of
    // reading an undefined variable.
    $birthdate = '';

    // NOTE(review): the unique key 'data' is not a column of the record —
    // confirm this is intended.
    scraperwiki::save_sqlite(array('data'), array(
        'id' => $id,
        'firstname' => $found['firstname'],
        'surname' => $found['surname'],
        'fathername' => $found['fathername'],
        'birthdate' => $birthdate,
        'deathdate' => $found['deathdate'],
        'place' => $found['place'],
        'block' => $found['block'],
        'row' => $found['row'],
        'grave' => $found['grave'],
    ));
}
/**
 * Scrapes one page of job search results and inserts every job that is not
 * already present in the `job` table.
 *
 * For each result table the job URL and id are taken from the title link's
 * onclick handler, and the description and source from the labelled header
 * cells. Progress is echoed as HTML. The "Next page" link is detected and
 * printed; following it is currently disabled (see the end of the function).
 *
 * Relies on an open mysql_* connection and on insert_name() defined elsewhere.
 *
 * @param string $url_search URL of the result page.
 * @param mixed  $country_id Stored with every inserted job.
 */
function scraper($url_search, $country_id) {
    $has_next = false;
    $base_url = "http://ec.europa.eu/eures/eures-searchengine/servlet";

    $html = scraperwiki::scrape($url_search);
    $dom = new simple_html_dom();
    $dom->load($html);

    foreach ($dom->find('table[class=JResult]') as $result) {
        // Start every result from a clean slate so a missing field cannot
        // silently inherit the value of the previous result.
        $url_job = '';
        $url_id = '';
        $description = '';
        $source_id = '';

        foreach ($result->find('td[class=JRTitle] a') as $job_page) {
            // The target path is the first single-quoted argument of onclick.
            $chars = explode("'", $job_page->onclick);
            if (!isset($chars[1])) {
                continue;
            }
            $url_job = $base_url . substr($chars[1], 1);
            $url_id = strstr($url_job, 'uniqueJvId=');
            $url_id = str_replace('uniqueJvId=', "", $url_id);
            echo "JOB: " . $url_job . "<br />";
        }

        foreach ($result->find('th') as $data) {
            $text = trim($data->plaintext);
            if ($text == 'Description:') {
                $description = trim($data->next_sibling()->plaintext);
                echo "DESCRIPTION: " . $description . "<br />";
            }
            if ($text == 'Source:') {
                $source = trim($data->next_sibling()->plaintext);
                // insert_name() is defined elsewhere and has always been given
                // the quote-escaped form, so that contract is kept.
                $source = str_replace("'", "\\'", $source);
                if ($source != '' && $source != ' ') {
                    $source_id = insert_name('source', $source);
                    echo "SOURCE: " . $source . "<br /><br />";
                }
            }
        }

        if ($url_job === '') {
            // No job link in this result table; there is nothing to key a row on.
            continue;
        }

        $description = str_replace("</BR>", "", $description);

        // All values come from a scraped page, so escape them for SQL.
        $safe_url_job = mysql_real_escape_string($url_job);
        $safe_url_id = mysql_real_escape_string((string) $url_id);
        $safe_description = mysql_real_escape_string($description);
        $safe_source_id = mysql_real_escape_string((string) $source_id);
        $safe_url_search = mysql_real_escape_string((string) $url_search);
        $safe_country_id = mysql_real_escape_string((string) $country_id);

        $sql = mysql_query("SELECT * FROM job WHERE url = '{$safe_url_job}'");
        if ($sql === false) {
            echo "Query failed: " . mysql_error() . "<br /><br />";
            continue;
        }

        if (mysql_num_rows($sql) == 0) {
            mysql_query(
                "INSERT INTO job SET
                    url = '{$safe_url_job}',
                    url_id = '{$safe_url_id}',
                    description = '{$safe_description}',
                    source_id = '{$safe_source_id}',
                    url_search = '{$safe_url_search}',
                    country_id = '{$safe_country_id}',
                    url_scraper_date = SYSDATE(),
                    url_scraper_hour = SYSDATE()"
            );
        } else {
            echo "Job URL already extracted: " . $url_job . "<br /><br />";
        }
    }

    foreach ($dom->find('div[class=prevNext] a') as $next_page) {
        if ($next_page->plaintext == "Next page") {
            $url_next = $base_url . substr($next_page->href, 1);
            $has_next = true;
            print "<br /><br />NEXT: " . $url_next . "<br /><br />";
        }
    }

    // Release the parsed page before returning.
    $dom->clear();
    unset($html, $dom);

    // Comment this for tests, uncomment this to get all data
    // if ($has_next == true){
    //     sleep(1);
    //     scraper($url_next, $country_id);
    // }
}
/**
 * Fetches one page of the JSON search endpoint and saves every entry of its
 * "result" list.
 *
 * "id" is stored as received; all other fields are stored as strings, with an
 * empty string when the source property is absent.
 *
 * @param mixed $page Page number appended to the search URL.
 */
function ripByPage($page) {
    $pathToDetails = 'http://aramestan.e-sanandaj.ir/BurialRequest/DeadSearch?keyword=&firstName=&lastName=&fatherName=&partNo=0&rowNo=&graveNo=&deathDateFrom=&deathDateTo=&bornDateFrom=&bornDateTo=&page=' . $page;
    $output = scraperwiki::scrape($pathToDetails);

    $resultingJsonObject = json_decode($output);
    if (!is_object($resultingJsonObject)
        || !isset($resultingJsonObject->result)
        || !is_array($resultingJsonObject->result)) {
        // Not the expected JSON document: nothing to save.
        return;
    }

    // Output column => property name in the JSON entry.
    $stringFields = array(
        'fullname' => 'DeadFullName',
        'fathername' => 'DeadFatherName',
        'birthdate' => 'BornDate',
        'deathdate' => 'DeathDate',
        'partno' => 'PartNo',
        'rowno' => 'RowNo',
        'graveno' => 'GraveNo',
        'gender' => 'Gender',
        'identitycode' => 'IdentityCode',
    );

    // Iterate over what the page actually returned rather than assuming ten
    // entries; a short last page no longer produces empty rows.
    foreach ($resultingJsonObject->result as $row) {
        if (!is_object($row)) {
            continue;
        }
        $entry = array('id' => isset($row->Id) ? $row->Id : null);
        foreach ($stringFields as $column => $property) {
            $entry[$column] = isset($row->$property) ? strval($row->$property) : '';
        }
        // NOTE(review): the unique key 'data' is not a column of the entry —
        // confirm this is intended.
        scraperwiki::save_sqlite(array('data'), $entry);
    }
}
/**
 * Fetches a page, appends the UTM tracking parameters to the href of every
 * link, and prints the rewritten markup.
 *
 * The parameters are appended after a "#" and are read from the globals
 * $utmSource, $utmMedium, $utmTerm, $utmContent and $utmCampaign.
 *
 * @param string $source URL of the page to fetch.
 */
function scrape($source) {
    // $source is deliberately NOT imported here: declaring it global would
    // rebind the name to the global variable and discard the argument.
    global $utmSource, $utmMedium, $utmTerm, $utmContent, $utmCampaign;

    $link = scraperwiki::scrape($source);
    $html = str_get_html($link);
    if (!$html) {
        // The page could not be parsed; nothing to rewrite or print.
        return;
    }

    // Identical for every link, so build it once.
    $suffix = '#utm_source=' . $utmSource
        . '&utm_medium=' . $utmMedium
        . '&utm_term=' . $utmTerm
        . '&utm_content=' . $utmContent
        . '&utm_campaign=' . $utmCampaign;

    foreach ($html->find('a[href]') as $a) {
        $a->href = $a->href . $suffix;
    }

    print $html;
}
/**
 * Returns the language codes listed on the mapping statistics index page.
 *
 * Each code is the href of a link found under /html/body/p, trimmed and with
 * every "/" removed.
 *
 * @return array List of language code strings.
 */
function getLangs() {
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape("http://mappings.dbpedia.org/server/statistics/"));

    $codes = array();
    foreach ($page->find('/html/body/p/a') as $anchor) {
        $codes[] = str_replace("/", "", trim($anchor->href));
    }

    return $codes;
}
/**
 * Collects the NGOs listed on an index page.
 *
 * Every <h2> contributes one entry: its decoded text (with "–" replaced by
 * "-") as the name and the href of its first link as the URL. Each entry is
 * saved to the "ngos" table keyed on "name"; the full list is printed and
 * returned.
 *
 * @param string $url URL of the index page.
 * @return array List of array("name" => ..., "url" => ...) entries.
 */
function scrapeIndex($url) {
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($url));

    $ngos = array();
    foreach ($page->find('h2') as $heading) {
        $entry = array(
            "name" => str_replace("–", "-", html_entity_decode($heading->plaintext)),
            "url" => $heading->find('a', 0)->href,
        );
        $ngos[] = $entry;
        scraperwiki::save_sqlite(array("name"), $entry, "ngos");
    }

    print_r($ngos);
    return $ngos;
}
/**
 * Scrapes the user-agent table of a page and saves one record per data row to
 * the "UserAgents" table, keyed on "agent".
 *
 * Rows whose first cell contains " String " (the header row) are skipped.
 *
 * @param string          $pageUrl URL of the page to scrape.
 * @param simple_html_dom $domObj  Parser instance the page is loaded into.
 */
function crawlAgents($pageUrl, $domObj) {
    $html = scraperwiki::scrape($pageUrl);
    $domObj->load($html);
    $html = null;

    $table = $domObj->find('/html/body/table[5]');
    if (empty($table)) {
        // The expected table is not on this page.
        return;
    }

    foreach ($table[0]->find('tr') as $trs) {
        $firstCell = $trs->firstChild();
        if (!$firstCell) {
            continue;
        }
        // strpos() returns 0 for a match at the very start, so the comparison
        // must be strict; a loose check let such header rows through.
        if (strpos($firstCell->plaintext, " String ") !== false) {
            continue;
        }

        $tds = $trs->find('td');
        if (count($tds) < 3) {
            continue;
        }

        // NOTE(review): the ' ' search string may originally have been a
        // non-breaking space or '&nbsp;' — verify against the live markup.
        $record = array(
            'agent' => str_replace(' ', '', $tds[0]->plaintext),
            'description' => str_replace(' ', '', $tds[1]->plaintext),
            'agent_type' => str_replace(' ', '', $tds[2]->plaintext),
        );
        scraperwiki::save_sqlite(array('agent'), $record, "UserAgents");
    }
}
/**
 * Reads the mapping statistics for one language.
 *
 * The paragraphs under /html/body are taken in order and assigned to the
 * field names "templates", "template_occurrences" and "property_occurrences";
 * each value is the second line of the paragraph's text cast to a float.
 *
 * @param string $language Language code inserted into the statistics URL.
 * @return array Map of field name to percentage value.
 */
function getStats($language) {
    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape("http://mappings.dbpedia.org/server/statistics/{$language}/"));

    $fieldNames = array("templates", "template_occurrences", "property_occurrences");
    $stats = array();
    $position = 0;

    foreach ($page->find('/html/body/p') as $paragraph) {
        $textLines = explode("\n", $paragraph->plaintext);
        $value = (double) trim($textLines[1]);
        // Paragraph order determines which field the value belongs to.
        $field = $fieldNames[$position];
        $position++;
        $stats[$field] = $value;
    }

    return $stats;
}
/**
 * Collects the gallery image URLs of one property page.
 *
 * Pauses for one second after scraping, then returns the collected data
 * JSON-encoded under the "property" key.
 *
 * @param string $host      Scheme and host part of the URL.
 * @param string $searchURL Path appended to $host.
 * @return array array('property' => JSON string).
 */
function listProperty($host, $searchURL) {
    $details = array();

    $page = str_get_html(scraperwiki::scrape($host . $searchURL));
    foreach ($page->find("ul.galerie_photo a") as $photoLink) {
        $details['images'][] = $photoLink->href;
    }

    // One-second pause between requests.
    usleep(1000000);

    return array('property' => json_encode($details));
}
/**
 * Writes one CSV row (SKU, name, thumbnail, URL) per product on a listing
 * page to the global handle $o and follows the pager recursively.
 *
 * The next page is requested when the last child of the items-per-page block
 * has the style "display: inline"; its href is prefixed with $baseurl.
 *
 * @param string $u URL of the listing page.
 */
function getProducts($u) {
    global $baseurl, $o, $local;

    $page = new simple_html_dom();
    $page->load(scraperwiki::scrape($u));

    $products = $page->find('span[class=S2Product]');
    if (count($products) <= 0) {
        return;
    }

    foreach ($products as $product) {
        // The SKU text is stripped of surrounding "#" and spaces.
        $sku = trim($product->find('div[class=S2ProductSku]', 0)->innertext, "# ");
        $title = trim($product->find('div[class=S2ProductName]', 0)->first_child()->innertext);
        $thumbnail = $product->find('img[class=S2ProductImg]', 0)->src;
        $address = $product->find('div[class=S2ProductName]', 0)->first_child()->href;
        fputcsv($o, array($sku, $title, $thumbnail, $address));
        echo $title . "\n";
    }

    $pagerLink = $page->find('div[class=S2itemsPPText]', 0)->last_child();
    if ($pagerLink->style == "display: inline") {
        getProducts($baseurl . $pagerLink->href);
    }
}
/**
 * Scrapes one page of station results and follows the "next" link.
 *
 * For every result row whose play link contains a 4-6 digit number, that
 * number becomes the station id; the station is saved to "stations" (keyed on
 * stationID) and each of its genres is saved to "genres".
 *
 * @param string $url URL of the result page.
 */
function scrape_stations($url) {
    $page = str_get_html(scraperwiki::scrape($url));

    foreach ($page->find('tr.result') as $row) {
        $logo = $row->find('img.logo', 0)->src;
        if (!preg_match('/\\d{4,6}/', $row->find('a.play', 0)->href, $idMatch)) {
            continue;
        }

        $stationID = $idMatch[0];
        $stationName = $row->find('td.show a', 0)->plaintext;
        $tagline = $row->find('td.show span', 0)->plaintext;
        $location = $row->find('.location', 0)->plaintext;

        $stationRecord = array(
            'stationID' => $stationID,
            'station' => $stationName,
            // Surrounding spaces and dashes are removed from the tagline.
            'tagline' => trim($tagline, " -"),
            'location' => $location,
            'logo' => $logo,
        );
        scraperwiki::save_sqlite(array('stationID'), $stationRecord, 'stations');

        foreach ($row->find('.genres a') as $genreLink) {
            $genreRecord = array('stationID' => $stationID, 'genre' => $genreLink->plaintext);
            scraperwiki::save_sqlite(array(), $genreRecord, 'genres');
        }
    }

    // The next page's path is embedded in the link markup as location.href='...'.
    $next = $page->find('a.next', 0);
    if ($next && preg_match("/(?<=location\\.href=')[^']*/", $next->outertext, $pathMatch)) {
        scrape_stations('http://tunein.com' . $pathMatch[0]);
    }
}
/**
 * Scrapes one business profile page and saves it keyed on "business_id".
 *
 * Stored fields: name, description, skills (each skill followed by ":/:"),
 * sector (only when a sector link is present; taken from the text after
 * "sector=" in its href), website URL and location.
 *
 * @param mixed $business_id Id appended to the profile URL.
 */
function parseBusinessPage($business_id) {
    $profile = new simple_html_dom();
    $profile->load(scraperwiki::scrape("http://digitalcircle.org/businesses/" . $business_id));

    $record = array();
    $record['business_id'] = $business_id;
    $record['name'] = $profile->find("article.business header h1.org", 0)->plaintext;
    $record['desc'] = $profile->find("article.business section.description", 0)->plaintext;

    $record['skills'] = "";
    foreach ($profile->find("#skills ul li a") as $skillLink) {
        $record['skills'] .= $skillLink->plaintext . ":/:";
    }

    $sectorHref = $profile->find("dd a[href*=sector]", 0)->href;
    if ($sectorHref != "") {
        $pieces = explode("sector=", $sectorHref);
        $record['sector'] = array_pop($pieces);
    }

    $record['url'] = $profile->find("ul#contact a.url", 0)->href;
    $record['location'] = $profile->find("aside.sidebar span.locality", 0)->plaintext;

    scraperwiki::save_sqlite(array('business_id'), $record);
}
/**
 * Scrapes the tag-along-rights company table and saves all rows in a single
 * call, keyed on "unique_id".
 *
 * The unique id is the company name and event date joined by "-", with all
 * whitespace removed. Peak memory usage is echoed after the save.
 */
function scrape() {
    echo "Loading data ...\n";

    $document = new DOMDocument();
    @$document->loadHTML(scraperwiki::scrape('http://www.bmfbovespa.com.br/en-us/markets/equities/companies/companies-with-tag-along-rights.aspx?idioma=en-us'));
    $finder = new DOMXPath($document);
    unset($document);

    $tableRows = $finder->query('//div[@class="tabela"]/table/tbody/tr');
    $rowCount = $tableRows->length;
    unset($finder);

    $records = array();
    $index = 0;
    while ($index < $rowCount) {
        $cells = $tableRows->item($index)->getElementsByTagName('td');

        // Missing cells are tolerated: the lookup is silenced and trimmed to ''.
        $companyName = trim(@$cells->item(0)->nodeValue);
        $eventDate = trim(@$cells->item(2)->nodeValue);

        $records[] = array(
            'unique_id' => preg_replace('/\\s+/', '', $companyName . '-' . $eventDate),
            'name' => $companyName,
            'corporate_resolution' => trim(@$cells->item(1)->nodeValue),
            'event_date' => $eventDate,
            'tag_voting_pct' => trim(@$cells->item(3)->nodeValue),
            'tag_non_voting_pct' => trim(@$cells->item(4)->nodeValue),
            'listing_segment' => trim(@$cells->item(5)->nodeValue),
        );
        $index++;
    }

    scraperwiki::save_sqlite(array('unique_id'), $records);
    echo "Peak memory usage: " . memory_get_peak_usage() . "\n";

    unset($records);
}
/**
 * Writes one CSV row (name, path, URL) per product tile on a listing page to
 * the global handle $p and follows the pagination button recursively.
 *
 * Tiles whose class contains "product2014cattab" and tiles without any link
 * are ignored. The page is loaded into the shared global parser $c.
 *
 * @param string $url  Path appended to the global $baseurl.
 * @param string $path Category path written into every row and echoed.
 */
function getProducts($url, $path) {
    global $p, $c, $baseurl;

    $c->load(scraperwiki::scrape($baseurl . $url));
    echo "Looking for products in " . $path . "\n";

    $tiles = $c->find('div.product2014item');
    if (count($tiles) == 0) {
        echo "No products found at " . $url . "\n";
        return;
    }

    foreach ($tiles as $tile) {
        if (strpos($tile->class, "product2014cattab") !== FALSE) {
            continue;
        }
        $anchor = $tile->find('a', 0);
        if (is_null($anchor)) {
            continue;
        }
        $title = $tile->find('a.product_link > div', 0)->innertext;
        fputcsv($p, array($title, $path, $anchor->href));
        echo "Saved product: " . $title . "\n";
    }

    // The href is read before recursing because the call reloads the shared $c.
    if (!is_null($c->find('div.pagnbtn', 0))) {
        getProducts($c->find('div.pagnbtn > a', 0)->href, $path);
    }
}
scraperwiki::save(array('Link'), array('Link' => $name)); } } require 'scraperwiki/simple_html_dom.php'; //MUSEUM /* //museum for($i=764; $i<=49; $i++){ print $i."\n"; $html = scraperwiki::scrape("http://www.mamilade.de/kinder/2006700-4---1317074400-$i-1324941496.html"); # Use the PHP Simple HTML DOM Parser to extract <td> tags $dom = new simple_html_dom(); $dom->load($html); */ //gastro for ($i = 1; $i <= 765; $i++) { print $i . "\n"; $html = scraperwiki::scrape("http://www.mamilade.de/gastronomie/2024700-4---1317074400-{$i}-1324976513.html"); # Use the PHP Simple HTML DOM Parser to extract <td> tags $dom = new simple_html_dom(); $dom->load($html); //LINK foreach ($dom->find('a.headline400') as $name) { # Store data in the datastore $name = $name->href; //print $name. "\n"; scraperwiki::save(array('Link'), array('Link' => $name)); } }
/**
 * Parses the main results table of a school listing page and saves one record
 * per school, enriched with contact details from the school's detail page.
 *
 * The first child of the table is skipped (header). For every other non-empty
 * row the text is split into fields, the second field (the MPI code) is used
 * to fetch the detail page, and telephone, fax, e-mail, web address and study
 * tracks are read from that page's "label: value" rows. Records whose name is
 * longer than one character are saved keyed on (denominazione, codiceMPI).
 *
 * @param string $html Markup of the listing page.
 */
function create_dataset2($html) {
    $i = 0;
    $dom = new simple_html_dom();
    $dom->load($html);

    // Make sure the table really exists before walking it.
    $table = $dom->find('table', 2);
    if (isset($table)) {
        foreach ($table->children() as $data) {
            echo "parsing info tabella principale";
            $res = trim($data->plaintext);

            if ($i > 0 && strlen($res) > 0) {
                // The literal here was an entity-decoded "'''" (a syntax
                // error); restore the apostrophe entity it stood for.
                $res = str_replace('&#039;', "'", $res);

                // Split the row text into its fields. explode() replaces the
                // removed split(); padding avoids undefined offsets below.
                // NOTE(review): the ' ' delimiter may originally have been a
                // non-breaking space or '&nbsp;' — verify against the markup.
                $array_result = array_pad(explode(' ', $res), 12, '');

                // The second field is the MPI code used for the detail lookup.
                $codMPI = trim($array_result[1]);
                $url_MPI = "http://www.trampi.istruzione.it/ricScu/dettaglio.do?cod=" . $codMPI;
                $detailHtml = scraperwiki::scrape($url_MPI);
                $dom_mpi = new simple_html_dom();
                $dom_mpi->load($detailHtml);

                $tel = "";
                $fax = "";
                $email = "";
                $web = "";
                $indS = "";

                foreach ($dom_mpi->find('table[cellspacing=1] tr') as $data_mpi) {
                    $values = explode(':', $data_mpi->plaintext . "\n");
                    $label = $values[0];
                    if (strlen($label) == 0) {
                        continue;
                    }
                    $firstValue = isset($values[1]) ? $values[1] : '';

                    if (stripos($label, 'tel') !== false) {
                        $tel = trim($firstValue);
                    } elseif (stripos($label, 'fax') !== false) {
                        $fax = trim($firstValue);
                    } elseif (stripos($label, 'e-mail') !== false) {
                        $email = trim($firstValue);
                    } elseif (stripos($label, 'web') !== false) {
                        // A URL contains ':' itself, so rejoin everything
                        // after the label. The old loop used "$key = 2" (an
                        // assignment), which glued the first piece to
                        // whichever piece happened to come last.
                        $web = trim(implode(':', array_slice($values, 1)));
                    } elseif (stripos($label, 'studio') !== false) {
                        $indS = str_replace('</td>', '', $firstValue);
                        $indS = str_replace('</tr>', '', $indS);
                        $indS = str_replace(array("\r", "\t", "\n"), '', $indS);
                        $indS = trim($indS);
                    }
                }

                $dom_mpi->clear();
                unset($dom_mpi);

                $dataset = array(
                    'denominazione' => trim(html_entity_decode($array_result[0])),
                    'codiceMPI' => trim($array_result[1]),
                    'tipologia' => trim(html_entity_decode($array_result[2])),
                    'tipologiaIIgrado' => trim(html_entity_decode($array_result[3])),
                    'descrizione' => trim(html_entity_decode($array_result[4])),
                    'indirizzo' => trim(html_entity_decode($array_result[5])),
                    'località' => trim(html_entity_decode($array_result[6])),
                    'cap' => trim($array_result[7]),
                    'comune' => trim(html_entity_decode($array_result[8])),
                    'provincia' => trim(html_entity_decode($array_result[9])),
                    'regione' => trim(html_entity_decode($array_result[10])),
                    'codIstitutoComprensivo' => trim(html_entity_decode($array_result[11])),
                    'telefono' => $tel,
                    'fax' => $fax,
                    'email' => $email,
                    'web' => $web,
                    'IndirizziStudio' => trim(html_entity_decode($indS)),
                );

                if (strlen($dataset['denominazione']) > 1) {
                    scraperwiki::save(array('denominazione', 'codiceMPI'), $dataset);
                }
                unset($dataset);
            }
            $i = $i + 1;
        }
    }

    // Release the DOM on every path, otherwise the scraper runs out of memory.
    $dom->clear();
    unset($dom);
}
<?php
// Fetches the archive page and prints two marker lines; the whole sequence is
// performed twice. The fetched markup is kept in $html but not printed.
$html = scraperwiki::scrape("http://www.bbc.co.uk/radio4/factual/desertislanddiscs_archive.shtml");
echo "hello\n";
echo "goodbye\n";

$html = scraperwiki::scrape("http://www.bbc.co.uk/radio4/factual/desertislanddiscs_archive.shtml");
echo "hello\n";
echo "goodbye\n";
// Load the Simple HTML DOM helpers. require_once keeps a repeated inclusion
// from loading the same definitions a second time.
require_once 'scraperwiki/simple_html_dom.php';

// The page address was previously passed to fopen() without ever being
// defined; define it once and use it for both fetches.
$url = "http://eventful.com/events?geo=region_id:984";

$html_content = scraperwiki::scrape($url);
$html = str_get_html($html_content);

// Fetch page
$data = '';
$file = fopen($url, "r");
if ($file !== false) {
    while (!feof($file)) {
        // Extract the data from the file / url
        $data .= fgets($file, 1024);
    }
    fclose($file);
} else {
    // Fall back to the copy that was already scraped above.
    $data = (string) $html_content;
}

$doc = new DOMDocument();
if ($data !== '') {
    // Collect parse warnings from real-world markup instead of emitting them.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $doc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
}

// XPath lets you search DOM documents easily. Attributes are addressed with
// "@" and string values must be quoted; the previous "[class=mytable]"
// compared child elements instead of the class attribute.
$xpath = new DOMXPath($doc);
$nodelist = $xpath->query('//table[@class="mytable"]');
// Tail of a loop that opens above this span; kept verbatim.
$i++; } print "data: " . $alldata . "\n----------------\n"; $entry['Term'] = $datah2X; $entry['Definition'] = $alldata; scraperwiki::save(array('Definition'), $entry); }

######################################
# Basic PHP scraper
######################################
# Fetches a fixed list of dictionary result pages from euskara.euskadi.net and,
# for each page, picks the entry heading out of the parsed HTML.

// require_once: this library is pulled in by several scripts in this file, and
// loading it a second time would re-declare its classes and functions.
require_once 'scraperwiki/simple_html_dom.php';
//print $html;

// Common prefix of every result page; each list entry below is the query string
// appended to it.
$base_url = "http://www.euskara.euskadi.net/r59-15172x/eu/hizt_el/emaitza.asp?";

// Query strings to visit. The list contains repeated entries; it is left exactly
// as written and de-duplicated at iteration time.
$pages_to_scrape = array(
    "azpisar=giltzurrun+gaineko+guruin+guruin+suprarrenal&sarrera=guruin&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutz&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzada&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzadura&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzagune&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzaketa&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzaldi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzaldiabiadura+gurutzaldierregimen&sarrera=gurutzaldi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzaldiabiadura+gurutzaldierregimen&sarrera=gurutzaldi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzaldiestropada&sarrera=gurutzaldi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzaldimisil&sarrera=gurutzaldi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzatu++1&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzatu++2&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutze++1&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutze++2&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutze+santuaren+seinalea+egin&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutze+santuaren+seinalea+egin&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzearen+seinale&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzearen+seinalea+egin&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzebide&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzedun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzefika&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzefikapen&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzefikatu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzefikatzaile&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzeganga&sarrera=ganga++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzegrama&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzeontzi&sarrera=gurutze++2&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzepuntu&sarrera=puntu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzeria&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzeta&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gurutzetaezpata&sarrera=gurutzeta&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutzetako&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutziltzaile&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutziltzaketa&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gurutziltzatu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guruzpide&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guruztoki&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustagarri&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustatu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustavo&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustudun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustugabe&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustugabetasun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gustuko&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gustura&sarrera=gustu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gusu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutapertxa&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutar++1&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutar++2&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutaratu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutartean&sarrera=gu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutarteko&sarrera=gu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guti&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutixko&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutiz+gehienak&sarrera=guti&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutizia&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutiziagarri&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutiziamendu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutiziatsu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutiziatu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutiziatzaile&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutizioso&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutizioso&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutun+pastoral&sarrera=pastoral&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutunazal&sarrera=gutun&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutunirekitzeko&sarrera=gutun&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutunliburu&sarrera=gutun&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutunontzi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxi+balitz+bezala&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxi+bat&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxi+gorabehera&sarrera=gorabehera&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiago&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxiago+izan&sarrera=gutxiago&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiagotasun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiagotu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxiasko&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiegi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiegitasun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxien&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxien+uste+denean+erbia+azaldu&sarrera=erbi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxien+uste+dugun+lekuan+erbia+lo&sarrera=erbi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxienean&sarrera=gutxien&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxieneko&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxieneko+zerbitzu&sarrera=zerbitzu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxienekoa+izan&sarrera=gutxieneko&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxienez&sarrera=gutxien&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiengo&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxienik&sarrera=gutxien&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiesgarri&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxiespen&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxietsi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxigarri&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxigatik&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxik+egin+du++ez+enean%2Fbait%2Ftzea%2Fnola&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxika&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxika%2Fxeheka+saldu&sarrera=saldu&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxitan&sarrera=gutxi&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxitu++1&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxitu++2&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxitxo&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=gutxixeago&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=gutxiz+gehiena&sarrera=gehien&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guyana&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guyana+frantsesa&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guyana+nederlandarra&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guyanar&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guzi&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guzti&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztiahaldun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztiahalduntasun&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztiahalmen&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztiahaltsu&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=guztiarekin+ere&sarrera=guzti&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "azpisar=guztiaz+ere&sarrera=guzti&mota=azpisarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztira&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztitara&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztiz&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza=",
    "sarrera=guztizko&mota=sarrera&term_hizkuntza=E&aplik_hizkuntza="
);

// array_unique keeps the first occurrence of each query string, so every distinct
// page is still visited in list order but none is downloaded more than once.
foreach (array_unique($pages_to_scrape) as $page) {
    $html = scraperwiki::scrape($base_url . $page);
    $sections_dom = new simple_html_dom();
    $sections_dom->load($html);

    // Heading of the entry: prefer the sub-entry span inside an <h2>; when several
    // match, the last one wins because each match overwrites the previous.
    $datah2X = '';
    foreach ($sections_dom->find('h2 span.azpisarrera') as $datah2) {
        $datah2X = utf8_encode($datah2->plaintext);
        print "h2: " . $datah2X . "\n";
    }

    // No sub-entry heading found: fall back to the span inside the <h1>.
    if ($datah2X === '') {
        foreach ($sections_dom->find('h1 span') as $datah2) {
            $datah2X = utf8_encode($datah2->plaintext);
            print "h1: " . $datah2X . "\n";
        }
    }

    // Accumulator and counter for the remainder of the loop body, which continues
    // beyond this span.
    $alldata = '';
    $i = 0;
// Tail of a loop that opens above this span; kept verbatim.
$lea = "lea"; print $lea; $councillors["{$name}"] = array("LEA" => $lea, "Party" => $moredetails["party"]); }

// Drop the references to the parsed page before writing to the datastore.
unset($dom, $html, $uri);

// Rebuild the councillors table from scratch on every run.
// "if exists" matters on a first run: a bare "drop table" fails when the table
// has not been created yet.
scraperwiki::sqliteexecute("drop table if exists councillors");
scraperwiki::sqliteexecute("create table if not exists councillors (`auth` string, `lea` string, `name` string, `party` string)");
# Further columns that were drafted but left commented out in the statement above:
#, `email` string, `address` string, `phone` string, `mobile` string, `image` string)");
scraperwiki::sqlitecommit();

// One row per councillor; $councillors is keyed by name and each value carries
// the "LEA" and "Party" fields filled in by the loop above.
foreach ($councillors as $name => $values) {
    scraperwiki::sqliteexecute(
        "insert or replace into councillors values (:auth, :lea, :name, :party)",
        array(
            "auth" => "Carlow County Council",
            "lea" => $values["LEA"],
            "name" => $name,
            "party" => $values["Party"]
        )
    );
}
scraperwiki::sqlitecommit();

// Fetch the councillor listing page and parse it.
$council = "Carlow County Council";
$uri = "http://www.carlow.ie/councillors/Pages/carlow-county-councillors.aspx";
$html = scraperwiki::scrape($uri);
$councillors = array();
$moredetails = array();

// require_once: this library is pulled in by several scripts in this file, and
// loading it a second time would re-declare its classes and functions.
require_once 'scraperwiki/simple_html_dom.php';
$dom = new simple_html_dom();
$dom->load($html);

# Earlier selector attempts, left disabled:
#$rows=$dom->find('div[class="item hline"]');
#print $row;
# as $cell) {
$content = $dom->find("div[class=content]");
//print_r($content);
//$content = $content[0];
print_r($content);
# item hline details
//foreach($dom->find('div.item.hline') as $cell) {
//$content->find('h1') as $cell) {
# [class=councillor]
<?php
######################################
# Basic PHP scraper
######################################
# Downloads the ScraperWiki hello-world page, echoes it, and stores the text of
# every <td> cell in the datastore.

// require_once rather than require: the same scraper appears a second time below,
// and including the library twice would re-declare its classes and functions,
// which PHP rejects with a fatal error.
require_once 'scraperwiki/simple_html_dom.php';

$html = scraperwiki::scrape("http://scraperwiki.com/hello_world.html");
print $html;

# Use the PHP Simple HTML DOM Parser to extract <td> tags
$dom = new simple_html_dom();
$dom->load($html);
foreach ($dom->find('td') as $data) {
    # Store data in the datastore
    print $data->plaintext . "\n";
    scraperwiki::save(array('data'), array('data' => $data->plaintext));
}

######################################
# Basic PHP scraper
######################################
# NOTE(review): verbatim repeat of the scraper above (looks like an accidental
# paste); retained as found. It fetches and saves the same page a second time.
require_once 'scraperwiki/simple_html_dom.php';

$html = scraperwiki::scrape("http://scraperwiki.com/hello_world.html");
print $html;

# Use the PHP Simple HTML DOM Parser to extract <td> tags
$dom = new simple_html_dom();
$dom->load($html);
foreach ($dom->find('td') as $data) {
    # Store data in the datastore
    print $data->plaintext . "\n";
    scraperwiki::save(array('data'), array('data' => $data->plaintext));
}