/**
 * Recursively renders a nested category tree as editable HTML lists.
 *
 * Every key of $arr is used as a category id: the matching row is selected
 * from the `categories` table and echoed as a <ul><li> holding a name input,
 * a hidden id field, a URL input and a delete checkbox. When the value stored
 * under a key is itself an array, it is rendered the same way inside the
 * parent's <ul>, after the closing </li>.
 *
 * The visible labels are Russian runtime strings ("Название" = name,
 * "Удалить?" = delete?) and are emitted unchanged.
 *
 * @param array $arr category id => child array (or any non-array for a leaf)
 * @return void the markup is echoed directly
 */
function parse_array($arr)
{
    foreach ($arr as $categoryId => $children) {
        $rows = DB::select()->from('categories')->where('id', '=', $categoryId)->execute();

        // Build the form controls first, in the same order they are printed.
        $nameField = form::input(
            'name[]',
            $rows[0]['name'],
            array('onkeyup' => 'translit(\'name[]\', \'url[]\');')
        );
        $idField = form::hidden('cid[]', $rows[0]['id']);
        $urlField = form::input('url[]', $rows[0]['url']);
        $deleteBox = form::checkbox('delete[]', $rows[0]['id']);

        echo '<ul><li>',
            '<b>Название:</b> ', $nameField, $idField, ' ',
            '<b>URL:</b> ', $urlField, ' ',
            '<b>Удалить?</b> ', $deleteBox,
            '</li>';

        // Children are nested inside this category's <ul>.
        if (is_array($children)) {
            parse_array($children);
        }

        echo '</ul>';
    }
}
/**
 * Fills $this->data from one of three input forms.
 *
 *  - array first argument: stored as the data as-is;
 *  - first argument loosely equal to null (null, "", 0, ...): data is empty;
 *  - anything else: both arguments go through parse_array() and the two
 *    resulting lists are paired by position, keys[i] => values[i].
 *
 * @param mixed $parse_text_k ready-made data array, or encoded key list
 * @param mixed $parse_text_v encoded value list (only read in the third form)
 */
function __construct($parse_text_k=null, $parse_text_v=null)
{
    if (is_array($parse_text_k)) {
        $this->data = $parse_text_k;
        return;
    }

    // Loose comparison on purpose: it matches the original behaviour.
    if ($parse_text_k == null) {
        $this->data = array();
        return;
    }

    $keys = parse_array($parse_text_k);
    $values = parse_array($parse_text_v);

    $this->data = array();
    $total = sizeof($keys);
    for ($idx = 0; $idx < $total; $idx++) {
        $this->data[$keys[$idx]] = $values[$idx];
    }
}
/**
 * Downloads an RSS feed and extracts its title, copyright notice and items.
 *
 * The result always contains the keys TITLE, COPYRIGHT, ITITLE, ILINK,
 * IDESCRIPTION and IPUBDATE. The four I* entries are parallel lists with one
 * element per <item>; they are empty lists when the feed has no items, so
 * callers can safely count() or iterate them.
 *
 * @param string $target URL of the RSS feed
 * @return array parsed feed as described above
 */
function download_parse_rss($target)
{
    # Download the RSS page
    $news = http_get($target, "");
    $feed = $news['FILE'];

    # Parse title & copyright notice; pre-create the per-item lists so the
    # returned shape does not depend on the feed having at least one item
    $rss_array = array(
        'TITLE'        => return_between($feed, "<title>", "</title>", EXCL),
        'COPYRIGHT'    => return_between($feed, "<copyright>", "</copyright>", EXCL),
        'ITITLE'       => array(),
        'ILINK'        => array(),
        'IDESCRIPTION' => array(),
        'IPUBDATE'     => array(),
    );

    # Parse the items
    $item_array = parse_array($feed, "<item>", "</item>");
    $item_count = count($item_array);
    for ($xx = 0; $xx < $item_count; $xx++) {
        $item = $item_array[$xx];
        $rss_array['ITITLE'][$xx]       = return_between($item, "<title>", "</title>", EXCL);
        $rss_array['ILINK'][$xx]        = return_between($item, "<link>", "</link>", EXCL);
        $rss_array['IDESCRIPTION'][$xx] = return_between($item, "<description>", "</description>", EXCL);
        $rss_array['IPUBDATE'][$xx]     = return_between($item, "<pubDate>", "</pubDate>", EXCL);
    }

    return $rss_array;
}
/**
 * Detects the character encoding(s) declared for a downloaded page.
 *
 * Two sources are checked:
 *  - when $header is true, the text before the first "<" is treated as the
 *    HTTP header block and searched for "charset=";
 *  - the first <meta> tag whose http-equiv is "content-type" is searched for
 *    its charset value.
 *
 * @param string $strHTML page text, optionally prefixed with HTTP headers
 * @param bool   $header  true when $strHTML starts with the HTTP headers
 * @return string "header,meta" when both are found, the single value when
 *                only one is found, "" when none is found
 */
function get_encoding($strHTML, $header = false)
{
    $enc = "";

    if ($header) {
        # Header block = everything before the first "<" (empty if there is none)
        $strHeader = substr($strHTML, 0, (int) strpos($strHTML, "<"));

        # Looking for a line such as: Content-Type: text/html; charset=utf-8
        $pos = strpos($strHeader, "charset=");
        if ($pos !== false) {
            $valueStart = $pos + 8; // 8 == strlen("charset=")
            $lineEnd = strpos($strHeader, "\n", $pos);

            # The charset may sit on the last header line with no trailing
            # newline; in that case read to the end instead of passing a
            # negative length to substr()
            $enc = ($lineEnd === false)
                ? substr($strHeader, $valueStart)
                : substr($strHeader, $valueStart, $lineEnd - $valueStart);

            # Drop the "\r" of CRLF line endings and stray blanks
            $enc = trim((string) $enc);
        }
    }

    # Create an array of all the meta tags
    $meta_tag_array = parse_array($strHTML, "<meta", ">");
    $new_page = "";

    # Examine each meta tag for a content-type declaration
    $tag_count = count($meta_tag_array);
    for ($xx = 0; $xx < $tag_count; $xx++) {
        $meta_attribute = get_attribute($meta_tag_array[$xx], "http-equiv");

        if (strtolower($meta_attribute) == "content-type") {
            $new_page = return_between($meta_tag_array[$xx], "charset", ">", EXCL);

            # Strip the punctuation surrounding the charset value
            $new_page = str_replace(array("/", "=", "\"", "'", " "), "", trim($new_page));
            break;
        }
    }

    if ($enc !== "" && $new_page !== "") {
        return $enc . "," . $new_page;
    }

    return ($new_page !== "") ? $new_page : $enc;
}
/**
 * Fetches the raw listing page and converts its table rows into arrays.
 *
 * Each <tr> of the response is split into its <td> cells. Cells 7 and 8 are
 * expected to contain a link: each is replaced by a two-element array
 * [cell markup with the <a ...> and </a...> tags removed, text following
 * "proj_idDes=" in the link's href]. As a side effect the last <tr> found is
 * stored in $this->page.
 *
 * NOTE(review): every row is processed, including rows that may have fewer
 * than nine cells - confirm against the real markup.
 *
 * @return array list of rows, each a list of cells as described above
 */
public function getInfo()
{
    $rawInfo = self::getRaw($this->action, $this->ref, $this->data);
    $rows = parse_array($rawInfo['FILE'], '<tr>', '</tr>');

    $this->page = end($rows);

    $tr = [];
    foreach ($rows as $rowMarkup) {
        $cells = parse_array($rowMarkup, '<td', '</td>');

        // Same treatment for both link columns, in ascending order.
        foreach ([7, 8] as $column) {
            $projectId = split_string(
                get_attribute($cells[$column], 'href'),
                'proj_idDes=',
                AFTER,
                EXCL
            );

            $label = remove($cells[$column], '<a', '>');
            $label = remove($label, '</a', '>');

            $cells[$column] = [$label, $projectId];
        }

        $tr[] = $cells;
    }

    return $tr;
}
/**
 * Downloads a page and returns the resolved address of every anchor on it.
 *
 * Waits the number of seconds held in the global $DELAY before downloading,
 * and echoes a "Harvested: ..." line for each link found.
 *
 * @param string $url address of the page to harvest
 * @return array resolved link addresses, in document order
 */
function harvest_links($url)
{
    global $DELAY;

    # Base address used to resolve relative links
    $page_base = get_base_page_address($url);

    # Throttle, then download the page
    sleep($DELAY);
    $page = http_get($url, "");
    $tags = parse_array($page['FILE'], "<a", "</a>", EXCL);

    # Resolve the href of each anchor tag
    $links = array();
    $tag_total = count($tags);
    for ($i = 0; $i < $tag_total; $i++) {
        $resolved = resolve_address(get_attribute($tags[$i], "href"), $page_base);
        $links[] = $resolved;
        echo "Harvested: " . $resolved . " \n";
    }

    return $links;
}
/**
 * Looks up a zip code through the demo form and returns what it reports.
 *
 * The form page is downloaded first to obtain its hidden "session" value,
 * then the form is posted with that session and the zip code. Every table of
 * the result that contains "Information about <zipcode>" is parsed for the
 * CITY, STATE, COUNTY, LATITUDE and LONGITUDE rows.
 *
 * @param string $zipcode zip code to look up
 * @return array keys CITY, STATE, COUNTY, LATITUDE, LONGITUDE; an empty
 *               array when the result page has no matching table
 */
function describe_zipcode($zipcode)
{
    # Get required libraries; *_once so a second call does not redeclare them
    include_once "../util/LIB_http.php";
    include_once "../util/LIB_parse.php";

    $target = "http://www.schrenk.com/nostarch/webbots/zip_code_form.php";

    # Download the target
    $page = http_get($target, "");

    # Parse the session hidden tag from the downloaded page
    # <input type="hidden" name="session" value="xxxxxxxxxx">
    $session_tag = return_between($page['FILE'], "<input type=\"hidden\" name=\"session\"", ">", EXCL);

    # Remove the "'s and "value=" text to reveal the session value
    $session_value = str_replace(array("\"", "value="), "", $session_tag);

    # Submit the form
    $data_array = array(
        'session' => $session_value,
        'zipcode' => $zipcode,
        'Submit'  => "Submit",
    );
    $form_result = http_post_form($target, $target, $data_array);

    $landmark = "Information about " . $zipcode;
    $fields = array("CITY", "STATE", "COUNTY", "LATITUDE", "LONGITUDE");
    $ret = array();

    $table_array = parse_array($form_result['FILE'], "<table", "</table>");
    $table_count = count($table_array);
    for ($xx = 0; $xx < $table_count; $xx++) {
        # Only parse tables containing the parsing landmark
        if (stripos($table_array[$xx], $landmark) === false) {
            continue;
        }

        # Each value is the rest of the table row that follows its label
        foreach ($fields as $field) {
            $ret[$field] = strip_tags(return_between($table_array[$xx], $field, "</tr>", EXCL));
        }
    }

    # Return the parsed data
    return $ret;
}
/**
 * Crawls novel pages 1..1699 and stores name, update date and size of each.
 *
 * For every id the pack page is downloaded, the novel name is taken from the
 * title attribute of the first <card ...> tag (text before "TXT"), and the
 * last date (YYYY-M-D) and last "<digits>K" token found in the page are used
 * as update time and size. Each result is inserted through insertmodel and
 * the update time is echoed as progress output.
 *
 * @return void
 */
public function index()
{
    include "application/libraries/LIB_http.php";
    include "application/libraries/LIB_parse.php";

    $ref = "http://www.wenku8.cn";
    $this->load->model("insertmodel");
    $success = "Catch OK";

    for ($xx = 1; $xx < 1700; $xx++) {
        $target = 'http://www.wenku8.cn/wap/article/packshow.php?id=' . $xx . '&type=txtfull';
        $web_page = http_get($target, $ref);
        $body = isset($web_page['FILE']) ? $web_page['FILE'] : '';

        // novel_name: strip blanks and dashes from the <card> tags, then cut
        // the title attribute of the first one at "TXT"
        $meta_tag_array = parse_array($body, '<card', ">");
        $meta_tag_array = str_replace(array(" ", "-"), "", $meta_tag_array);
        $first_card = isset($meta_tag_array[0]) ? $meta_tag_array[0] : '';
        $meta_tag = split_string($first_card, "title=\"", AFTER, EXCL);
        $novel_name = strip_tags(split_string($meta_tag, "TXT", BEFORE, EXCL));

        // update_time: last date on the page; reset on every page so a page
        // without a date does not inherit the previous novel's value
        preg_match_all("/\\d{4}-\\d{1,2}-\\d{1,2}/", $body, $matches_array);
        $update_time = empty($matches_array[0]) ? '' : end($matches_array[0]);
        echo $update_time;

        // size: last "<digits>K" token on the page, reset per page as well
        preg_match_all("/\\d+K/", $body, $get_array);
        $size = empty($get_array[0]) ? '' : end($get_array[0]);

        $data = array(
            'novel_id'    => $xx,
            'novel_name'  => empty($novel_name) ? '没有这本小说' : $novel_name,
            'update_time' => $update_time,
            // The original appended the bare constant K, which PHP < 8
            // silently read as the string 'K' and PHP 8 rejects with a fatal
            // error; the quoted form keeps the stored value unchanged.
            // NOTE(review): $size already ends in "K" - confirm the doubled
            // suffix is wanted.
            'size'        => $size . 'K',
        );

        echo "<br>";
        $this->insertmodel->insert_Novel($data);
    }

    $this->load->view('curl_result', $success);
}
curl_setopt($ch, CURLOPT_REFERER, $file); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE); curl_setopt($ch, CURLOPT_MAXREDIRS, 4); $result = curl_exec($ch); curl_close($ch); return $result; } $result = array(); $url = filter_input(INPUT_POST, "input-link"); $last_position = strripos($url, '/'); $last_position += 1; $url_dir = substr($url, 0, $last_position); $web_page = http_get($url, $ref = ""); $web_page = $web_page["FILE"]; $a_tag = parse_array($web_page, "<a ", ">"); $count = 0; foreach ($a_tag as $val) { if (stristr($val, "playAudioBarPlay")) { $href_str = trim(get_attribute($val, "href")); $href_str = str_replace("'", "", $href_str); $href_str = str_replace("(", "", $href_str); $href_str = str_replace(")", "", $href_str); $href_arr = explode(",", $href_str); $href_str = str_replace("javascript:playAudioBarPlay", "", $href_arr[0]); if (!file_exists("audio/" . $href_str)) { $audio_file = download_audio($url_dir . 'aumpo/' . $href_str); file_put_contents("audio/" . $href_str, $audio_file); $file_size = filesize("audio/" . $href_str, $audio_file); if ($file_size === false || $file_size <= 0) { $result["result"] = "error download audio form link";
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION. The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders. Copyright 2007, Michael Schrenk THIS SCRIPT IS FOR DEMONSTRATION PURPOSES ONLY! It is not suitable for any use other than demonstrating the concepts presented in Webbots, Spiders and Screen Scrapers. ######################################################################## */ ?> <?php
# Demo script: downloads one page and prints each of its <meta ...> tags.

# Directory that holds the shared libraries included below
$PHP_LIBRARY_PATH = "../phplibs";
include "{$PHP_LIBRARY_PATH}/LIB_parse.php"; # Include parse library
include "{$PHP_LIBRARY_PATH}/LIB_http.php"; # Include cURL library

# Download the page; note that $target and $referer are assigned here as a
# side effect of the call and stay set afterwards
$web_page = http_get($target = "http://www.fbi.gov", $referer = "");

# Collect the pieces of the page delimited by "<meta" and ">"
# (presumably one entry per meta tag - verify against LIB_parse)
$meta_tag_array = parse_array($web_page['FILE'], "<meta", ">");

# Print one entry per line
for ($xx = 0; $xx < count($meta_tag_array); $xx++) {
    echo $meta_tag_array[$xx] . "\n";
}
/**
 * Recursively echoes a folder/clip tree as collapsible HTML.
 *
 * An entry whose key differs (loosely) from its own 'name' value is treated
 * as a folder: a toggle link labelled with the key is printed, followed by a
 * hidden <div> holding the rendered children. Any other entry is a clip: a
 * link that copies the clip's uuid and name into the parent document's
 * 'clip_uuid' / 'clip_name' fields, plus two hidden <textarea>s carrying the
 * clip's 'before' and 'after' text. Non-array input prints nothing.
 *
 * @param mixed $arr folder name => children, or clip name => clip record
 * @return void the markup is echoed directly
 */
function parse_array($arr)
{
    if (!is_array($arr)) {
        return;
    }

    foreach ($arr as $name => $sub_arr) {
        if ($name != $sub_arr['name']) {
            // folder: toggle link, then the children inside a hidden block
            echo "<a onclick='Toggle(this);' style='display: block; cursor: pointer; text-decoration: none;'><img src='resources/images/icon_folder.png' border='none' align='absmiddle' style='margin: 1px 2px 3px 0px;'>", $name, "</a>";
            echo "<div style='display: none; padding-left: 16px;'>\n";
            parse_array($sub_arr);
            echo "</div>\n";
            continue;
        }

        // clip
        $uuid = $sub_arr['uuid'];
        $clip_name = $sub_arr['name'];

        echo "<div style='white-space: nowrap;'>\n",
            "<a href='javascript:void(0);' onclick=\"parent.document.getElementById('clip_uuid').value='", $uuid,
            "'; parent.document.getElementById('clip_name').value='", $clip_name, "';\">",
            "<img src='resources/images/icon_file.png' border='0' align='absmiddle' style='margin: 1px 2px 3px -1px;'>",
            $clip_name,
            "</a>\n",
            "<textarea style='display: none' id='before_", $uuid, "'>", $sub_arr['before'], "</textarea>\n",
            "<textarea style='display: none' id='after_", $uuid, "'>", $sub_arr['after'], "</textarea>\n",
            "</div>\n";
    }
}
/**
 * Downloads a page and saves every same-domain image it references.
 *
 * Images are written below "saved_images_<page base without http://>",
 * mirroring the path given in each tag's src attribute (":" replaced by
 * "-"). Only addresses containing ".jpg", ".gif" or ".png" are written to
 * disk; off-domain images are reported and skipped. Progress is echoed.
 *
 * NOTE: the script terminates (exit) when the page has no <img> tags.
 *
 * @param string $target address of the page to process
 * @return void
 */
function download_images_for_page($target)
{
    echo "target = {$target}\n";

    # Download the web page
    $response = http_get($target, "");

    # Update the target in case there was a redirection
    $target = $response['STATUS']['url'];

    # Strip file name off target for use as page base
    $page_base = get_base_page_address($target);

    # Directory where images are to be saved
    $save_root = "saved_images_" . str_replace("http://", "", $page_base);

    # Parse the image tags
    $img_tags = parse_array($response['FILE'], "<img", ">");
    $tag_total = count($img_tags);
    if ($tag_total == 0) {
        echo "No images found at {$target}\n";
        exit;
    }

    for ($i = 0; $i < $tag_total; $i++) {
        $src = get_attribute($img_tags[$i], "src");
        echo " image: " . $src;
        $image_url = resolve_address($src, $page_base);

        if (get_base_domain_address($page_base) != get_base_domain_address($image_url)) {
            echo "\nSkipping off-domain image.\n";
            continue;
        }

        # Local directory and file path derived from the src attribute
        $sub_dir = str_replace(":", "-", substr($src, 0, strrpos($src, "/")));
        $local_path = str_replace(":", "-", $src);

        # Make the storage directory if it doesn't exist yet
        clearstatcache(); // clear cache to get accurate directory status
        $target_dir = $save_root . "/" . $sub_dir;
        if (!is_dir($target_dir)) {
            mkpath($target_dir);
        }

        # Download the image, report image size
        $bytes = download_binary_file($image_url, "");
        echo " size: " . strlen($bytes);

        # Save the image when the address looks like a supported format
        $looks_like_image = stristr($image_url, ".jpg")
            || stristr($image_url, ".gif")
            || stristr($image_url, ".png");
        if ($looks_like_image) {
            $fp = fopen($save_root . "/" . $local_path, "w");
            fputs($fp, $bytes);
            fclose($fp);
            echo "\n";
        }
    }
}
/**
 * Builds the object and, when the member list arrives in its encoded string
 * form, decodes it and loads the member objects.
 *
 * If $this->data['member_ids'] is a string after the parent constructor ran,
 * both 'member_ids' and 'member_roles' are decoded with parse_array(), the
 * objects are pre-loaded in one load_objects() call, and $this->members is
 * filled with [object, role] pairs matched by position.
 *
 * @param mixed $data passed unchanged to the parent constructor
 */
function __construct($data)
{
    parent::__construct($data);

    // Array keys must be quoted strings: the bare words used previously are
    // undefined constants, which is a fatal error on PHP 8.
    if (is_string($this->data['member_ids'])) {
        $this->data['member_ids'] = parse_array($this->data['member_ids']);
        $this->data['member_roles'] = parse_array($this->data['member_roles']);

        load_objects($this->data['member_ids']);

        foreach ($this->data['member_ids'] as $i => $mem) {
            $obj = load_object($mem);
            $this->members[] = array($obj, $this->data['member_roles'][$i]);
        }
    }
}
$indirectTotalIndirectRouteDistanceArray = array(); $indirectTotalRouteDistanceArray = array(); $numRouteCount = 0; //parse the routes foreach ($routes as $route) { $numRouteCount++; $indirectRouteErrorCodeArray = parse_array($indirectRoute, "<ErrorCode>", "</ErrorCode>"); $indirectStartStopsArray = parse_array($indirectRoute, "<StartStop>", "</StartStop>"); $indirectStartBusesArray = parse_array($indirectRoute, "<StartBuses>", "</StartBuses>"); $indirectFirstJunctionsArray = parse_array($indirectRoute, "<FirstJunction>", "</FirstJunction>"); $indirectDistanceBetweenJunctionArray = parse_array($indirectRoute, "<DistanceBetweenJunction>", "</DistanceBetweenJunction>"); $indirectSecondJunctionsArray = parse_array($indirectRoute, "<SecondJunction>", "</SecondJunction>"); $indirectEndBusesArray = parse_array($indirectRoute, "<EndBuses>", "</EndBuses>"); $indirectEndStopsArray = parse_array($indirectRoute, "<EndStop>", "</EndStop>"); $indirectTotalIndirectRouteDistanceArray = parse_array($indirectRoute, "<TotalIndirectRouteDistance>", "</TotalIndirectRouteDistance>"); $indirectTotalRouteDistanceArray = parse_array($indirectRoute, "<TotalRouteDistance>", "</TotalRouteDistance>"); } for ($i = 0; $i < $numRouteCount; $i++) { if ($i == 0) { echo "StartStop:" . $startStop . "," . "EndStop:" . $endStop . ",depotErrorCode:" . intval($depotErrorCode) . "n"; } echo "<br/>"; //echo "Route#".$i; echo "ErrorCode:" . intval(return_between($indirectRouteErrorCodeArray[$i], "<ErrorCode>", "</ErrorCode>", EXCL)) . "\t"; //echo "StartStop:".$indirectStartStopsArray[$i]."\t"; //echo "StartBuses:".$indirectStartBusesArray[$i]."\t"; //echo "FirstJunction:".$indirectFirstJunctionsArray[$i]."\t"; echo "DistanceBetweenJucntion:" . $indirectDistanceBetweenJunctionArray[$i] . "\t"; //echo "SecondJunction:".$indirectSecondJunctionsArray[$i]."\t"; //echo "EndBuses:".$indirectEndBusesArray[$i]."\t"; //echo "EndStop:".$indirectEndStopsArray[$i]."\t";
<?php # Inlcude http and parse libraries include "../util/LIB_http_browser.php"; include "../util/LIB_parse.php"; include "../util/LIB_resolve_addresses.php"; include "../util/LIB_http_codes.php"; # Identify the target web page and the page base $target = "http://www.schrenk.com/nostarch/webbots/page_with_broken_links.php"; $page_base = "http://www.schrenk.com/nostarch/webbots/"; # Download the web page $downloaded_page = http_get($target, $ref = ""); # Parse the links $link_array = parse_array($downloaded_page['FILE'], $beg_tag = "<a", $close_tag = ">"); # Verify the links ?> <table border="1" cellpadding="1" cellspacing="0"> <tr bgcolor="#e0e0e0"> <th>URL</th> <th>HTTP CODE</th> <th>DOWNLOAD TIME (seconds)</th> </tr> <?php for ($xx = 0; $xx < count($link_array); $xx++) { // Parse the http attribute from link $link = get_attribute($tag = $link_array[$xx], $attribute = "href"); // Create a fully resolved address $resloved_link_address = resolve_address($link, $page_base); $downloaded_link = http_get($resloved_link_address, $target);
$indirectDepotArray = array(); $indirectDistanceBetweenDepotAndStartStopArray = array(); $indirectDistanceBetweenDepotAndEndStopArray = array(); $indirectEndBusesArray = array(); $indirectEndStopsArray = array(); $indirectTotalRouteDistanceArray = array(); $routes = parse_array($str, "<Route>", "</Route>"); $numRouteCount = 0; foreach ($routes as $route) { $numRouteCount++; $indirectRouteErrorCodeArray = parse_array($str, "<ErrorCode>", "</ErrorCode>"); $indirectStartStopsArray = parse_array($str, "<StartStop>", "</StartStop>"); $indirectStartBusesArray = parse_array($str, "<StartBuses>", "</StartBuses>"); $indirectDepotArray = parse_array($str, "<FirstJunction>", "</FirstJunction>"); $indirectEndBusesArray = parse_array($str, "<EndBuses>", "</EndBuses>"); $indirectEndStopsArray = parse_array($str, "<EndStop>", "</EndStop>"); } for ($i = 0; $i < $numRouteCount; $i++) { $depotNameString = return_between($indirectDepotArray[$i], "<FirstJunction>", "</FirstJunction>", EXCL); list($depotName, $lat, $lon) = explode(":", $depotNameString); $distanceBetweenDepotAndStartStop = distanceBetweenStops($startStop, $depotName); $distanceBetweenDepotAndEndStop = distanceBetweenStops($endStop, $depotName); $totalRouteDistance = floatval($distanceBetweenDepotAndStartStop) + floatval($distanceBetweenDepotAndEndStop); $sql = "INSERT INTO directdepotbusroutes (DepotErrorCode,\r\n\t\t\t\t\t\t\t\t\t\t\t\t StartStop,\r\n\t\t\t\t\t\t\t\t\t\t\t\t EndStop,\r\n\t\t\t\t\t\t\t\t\t\t\t\t DepotNameString,\r\n\t\t\t\t\t\t\t\t\t\t\t\t BusesBetweenStartStopAndDepot,\r\n\t\t\t\t\t\t\t\t\t\t\t\t BusesBetweenEndStopAndDepot,\r\n\t\t\t\t\t\t\t\t\t\t\t\t DistanceBetweenDepotAndStartStop,\r\n\t\t\t\t\t\t\t\t\t\t\t\t DistanceBetweenDepotAndEndStop,\r\n\t\t\t\t\t\t\t\t\t\t\t\t TotalRouteDistance) Values('" . intval(return_between($indirectRouteErrorCodeArray[$i], "<ErrorCode>", "</ErrorCode>", EXCL)) . "','" . $startStop . "','" . $endStop . "','" . $depotNameString . "','" . 
return_between($indirectStartBusesArray[$i], "<StartBuses>", "</StartBuses>", EXCL) . "','" . return_between($indirectEndBusesArray[$i], "<EndBuses>", "</EndBuses>", EXCL) . "','" . floatval($distanceBetweenDepotAndStartStop) . "','" . floatval($distanceBetweenDepotAndEndStop) . "','" . floatval($totalRouteDistance) . "')"; echo $sql; /*$result=mysql_query($sql); if($result) echo "insert is successful"; else echo "<b>INSERT ERROR: ".mysql_error()."</b>". $sql."<br/>"; */
/**
 * Returns every stored version of a category, keyed by version.
 *
 * When $version is empty it is taken from the category's row in
 * category_current (null when there is no such row). The versions are then
 * read from `category`, selected by $id when given, otherwise by the
 * category that owns $version. In each row 'parent_versions' is decoded with
 * parse_array() and 'version_tags' with parse_hstore().
 *
 * NOTE(review): $db_central is neither a parameter nor declared global here,
 * so it is undefined inside this function - confirm what sql_query() does
 * with an empty connection argument.
 *
 * @param mixed $id      category id, or an empty value to look up by version
 * @param mixed $param   not used by this function
 * @param mixed $version version to resolve, or an empty value for the current one
 * @return array [versions keyed by version, resolved version]
 */
function category_history_all($id, $param, $version)
{
    $pg_id = postgre_escape($id);
    $list = array();

    if (!$version) {
        $res = sql_query("select * from category_current where category_id=$pg_id", $db_central);
        $elem = pg_fetch_assoc($res);
        // pg_fetch_assoc() yields false when there is no current row
        $version = $elem ? $elem['version'] : null;
    }

    if ($id) {
        $res = sql_query("select * from category where category_id=$pg_id", $db_central);
    } else {
        $pg_version = postgre_escape($version);
        $res = sql_query("select * from category where category_id=(select category_id from category first where first.version=$pg_version)", $db_central);
    }

    while ($elem = pg_fetch_assoc($res)) {
        $elem['parent_versions'] = parse_array($elem['parent_versions']);
        $elem['version_tags'] = parse_hstore($elem['version_tags']);
        $list[$elem['version']] = $elem;
    }

    return array($list, $version);
}
continue; } } else { $strHTML = $seed['strHTML']; } /*Get headers just this time for fun, please*/ /* if ($seed["strHeader"]==NULL) { $downloaded_page = http_get_withheader($seed["strURL"], ""); $strHeader = substr($downloaded_page['FILE'],0,strpos($downloaded_page['FILE'],"<")); $strSQL = "UPDATE tblPages SET " . "strHeader='" .mysql_real_escape_string($strHeader) . "' WHERE iPageID=" . $seed["iPageID"]; db_run_query($strSQL); }*/ /*End insert*/ echo "Parsing....\n"; $anchor_tags = parse_array($strHTML, "<a ", "</a>", EXCL); # Put http attributes for each tag into an array $sqlQuery = "INSERT INTO tblLinks(fkParentID,fkChildID,fkQueryID,iNumberTimes) VALUES "; //print "1 sqlQuery is $sqlQuery\n"; $outputExists = false; for ($xx = 0; $xx < count($anchor_tags); $xx++) { //print "tags : ". $anchor_tags[$xx]. "\n"; $href = get_attribute($anchor_tags[$xx], "href"); //print "href = $href , page_base = $page_base \n"; if ($href === false) { continue; } $resolved_address = resolve_address($href, $page_base); //echo "have address: $resolved_address\n"; if (!exclude_link($resolved_address)) { try {
$table_array = parse_array($web_page['FILE'], "<table>", "</table>"); for($l=0; $l<count($table_array); $l++) echo"<br /><b>page_var_table".$k."=:". strip_tags($table_array[$l])."\n"; echo"<br />"; $img_tag_array = parse_array($web_page['FILE'], "<img", ">"); for($h=0; $h<count($img_tag_array); $h++) echo"<br /><b>page_var_image".$k."=:". htmlentities($img_tag_array[$h])."\n"; echo"<br />"; } $link_tag_array = parse_array($web_page['FILE'], "<a", ">"); for($v=0; $v<count($link_tag_array); $v++) { echo"<br /><b>page_var_image".$k."=:". htmlentities($link_tag_array[$v])."\n"; echo"<br />"; } } else { $status = "$site is not valid"; } ?>
else { $OptionResults = array(); $OptionResults['value'] = $optionvalue; $OptionResults['label'] = $optionlabel; } array_push($SelectResults['options'], $OptionResults); } array_push($FormResults['selects'], $SelectResults); } $beg_tag = "<input"; $close_tag = ">"; $InputArray = parse_array($form, $beg_tag, $close_tag); $HowManyInput = count($InputArray); //echo "How Many Inputs? " . $HowManyInput . "<br />"; foreach ($InputArray as $Input) { $Begin_Tag = 'id="'; $End_Tag = '"'; $inputid = return_between($Input, $Begin_Tag, $End_Tag, EXCL); $inputid = str_replace(chr(34),"",$inputid); //$inputid = get_attribute($Input,'id'); //echo "input id: " . $inputid . "<br />"; $pos = strpos($inputid, '='); if ($pos !== false) {
$this_image_file = download_binary_file($books[$bookCount]['imageUrl'], $ref = ""); if (stristr($books[$bookCount]['imageUrl'], ".jpg") || stristr($books[$bookCount]['imageUrl'], ".gif") || stristr($books[$bookCount]['imageUrl'], ".png")) { file_put_contents($save_image_directory . basename($books[$bookCount]['imageUrl']), $this_image_file); } } $divClass = 'views-field-title'; if (stristr($div, $divClass)) { $books[$bookCount]['title'] = trim(strip_tags($div)); $aTag = parse_array($div, '<a', '</a>'); if ($cloudflare == 1) { $books[$bookCount]['bookUrl'] = resolve_address(str_replace("www", "ftp", get_attribute($aTag[0], $attribute = "href")), $page_base); } else { $books[$bookCount]['bookUrl'] = resolve_address(get_attribute($aTag[0], $attribute = "href"), $page_base); } $bookPage[$bookCount] = http_get($books[$bookCount]['bookUrl'], $target); $bookDivs[$bookCount] = parse_array($bookPage[$bookCount]['FILE'], "<div class=\"product-body\"", "</div>"); $books[$bookCount]['summary'] = $bookDivs[$bookCount][0]; } $divClass = 'views-field-field-author-value'; if (stristr($div, $divClass)) { $books[$bookCount]['author'] = trim(strip_tags($div)); } $divClass = 'views-field-field-isbn13-value'; if (stristr($div, $divClass)) { $books[$bookCount]['ISBN13'] = trim(strip_tags($div)); } $divClass = 'views-field-field-released-value'; if (stristr($div, $divClass)) { $books[$bookCount]['relesedDate'] = trim(strip_tags($div)); } $divClass = 'views-field-sell-price';
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

// Scrapes the project names listed on the JBoss projects page and runs
// getsploits.py once per name, writing the result to "<name>.txt".

//Include Libraries
include "../library/LIB_parse.php";
include "../library/LIB_http.php";
include "../library/LIB_simple_spider.php";
include "../library/LIB_resolve_addresses.php";
include "../library/LIB_download_images.php";

$base_url = "http://www.jboss.org/projects";

//Arrays
$projectNames = array();

$web_page = http_get($target = $base_url, $referrer = "J.A.R.V.I.S. Web Bot");

// Each entry is the text between a closing </span> and the next </a>
$name_excl = parse_array($web_page['FILE'], "</span>", "</a>");
for ($i = 0; $i < count($name_excl); $i++) {
    // Keep what precedes "</a>", drop blanks, then drop the leading
    // "</span>" (7 characters)
    $aRemove = strstr($name_excl[$i], "</a>", true);
    $replace = str_replace(" ", "", $aRemove);
    $spanSubRemove = substr($replace, 7);
    array_push($projectNames, $spanSubRemove);
}

// The first match is discarded (presumably not a project entry - verify
// against the page)
array_shift($projectNames);

//var_dump($projectNames);
for ($x = 0; $x < count($projectNames); $x++) {
    // The names come from a remote page, so every argument is shell-escaped
    // before it reaches system()
    $command = "python getsploits.py -o " . escapeshellarg($projectNames[$x] . ".txt")
        . " --type any " . escapeshellarg($projectNames[$x]);
    system($command);
}
#------------------------------------------------- # Start: Look for the $desired_site in each result case for ($page_rank = 0; $page_rank < count($desired_content_array); $page_rank++) { // Look for the $desired_site to appear in one of the listings if (stristr($desired_content_array[$page_rank], trim($desired_site))) { $url_found_rank_on_page = $page_rank; // add one to compensate for listing 0 $url_found = true; } } # End: Parsing content #------------------------------------------------- #------------------------------------------------- # Start: Get location of the next page // Create an array of links on this page $search_links = parse_array($result['FILE'], "<a", "</a>", EXCL); // Look for the link with the word "Next" in it, as we know this // link contains the address of the next page. for ($xx = 0; $xx < count($search_links); $xx++) { if (strstr($search_links[$xx], "Next")) { $previous_target = $target; $target = get_attribute($search_links[$xx], "href"); // Remember that this path is relative to the target page, // so add protocol and domain $target = "http://www.schrenk.com/nostarch/webbots/search/" . $target; } } # End: Get location of the next page #------------------------------------------------- # Don't seatch forever, stop after 10 pages if ($page_index == 10) {
// Sends the converted contents of input.txt to the sunlight.iis.sinica.edu.tw
// text service, follows the address given in its response, and prints the
// <pre> block of every page linked from there.

// Convert the input file, re-encode it from UTF-8 to Big5 and trim it
$handle = opecc_convert("input.txt");
$input_str = iconv("utf-8", "big5", $handle);
$input_str = trim($input_str);

// Post the text as the "query" form field
$action = "http://sunlight.iis.sinica.edu.tw/cgi-bin/text.cgi";
$data_arr = array();
$data_arr["query"] = $input_str;
$result = http($action, $ref = "", $method = "POST", $data_arr, EXCL_HEAD);
$result_str = $result["FILE"];

// The result address is read from the CONTENT attribute of the second
// "<META " tag: the part after the first "=", with surrounding ' removed
$result_arr = parse_array($result_str, "<META ", ">");
$res_url = get_attribute($result_arr[1], "CONTENT");
$res_url = explode("=", $res_url);
$res_url = trim($res_url[1], "'");
echo "\n" . "response_url: http://sunlight.iis.sinica.edu.tw/" . $res_url . "\n";

//parse response url
$web_page = http_get("http://sunlight.iis.sinica.edu.tw/" . $res_url, $ref = "");
// NOTE(review): trim()'s second argument is a character list, so this strips
// any leading/trailing '<', 'b', 'r' and '>' characters, not the literal
// string "<br>" - confirm that is intended
$web_page = trim($web_page["FILE"], "<br>");

// Two parallel parses of the same anchors: whole elements and opening tags
$response_link = parse_array($web_page, "<a ", "</a>");
$response_link2 = parse_array($web_page, "<a ", ">");
$search_arr = array("'", '"');
$counter = count($response_link);
for ($count = 0; $count < $counter; $count++) {
    // NOTE(review): $temp_str is computed but not used afterwards
    $temp_str = return_between($response_link[$count], "<a>", "</a>", EXCL);
    $temp_str = explode(">", $temp_str);

    // Normalise both quote styles to double quotes before reading HREF
    $link = "http://sunlight.iis.sinica.edu.tw/" . get_attribute(str_replace($search_arr, '"', $response_link2[$count]), "HREF");

    // Download the linked page (this overwrites $web_page) and print the
    // trimmed contents of its <pre> block with every "-" removed
    $web_page = http_get($link, $ref = "");
    $web_page = $web_page["FILE"];
    $plain_text = return_between($web_page, "<pre>", "</pre>", EXCL);
    $plain_text = str_replace("-", "", trim($plain_text));
    // NOTE(review): the text is printed twice - confirm the duplicate is wanted
    echo "\n" . $plain_text . "\n";
    echo "\n" . $plain_text . "\n";
}
<?php
// Crawls novel pages 1..1799 and stores the name of each through insertmodel.
include "application/libraries/LIB_http.php";
include "application/libraries/LIB_parse.php";
include "application/libraries/LIB_thumbnail.php";

// No referer is sent. The original assignment of $ref was commented out,
// which left the variable undefined at the http_get() call below; it is now
// set explicitly to the empty value it effectively had.
$ref = "";

$this->load->model("insertmodel");

for ($xx = 1; $xx < 1800; $xx++) {
    $target = 'http://www.wenku8.cn/wap/article/packshow.php?id=' . $xx . '&type=txtfull';
    $web_page = http_get($target, $ref);

    // Strip blanks and dashes from the <card ...> tags; the first one then
    // looks like <cardid="packshow.php"title="{novel name}TXT...">
    $label = '<card';
    $meta_tag_array = parse_array($web_page['FILE'], $label, ">");
    $meta_tag_array = str_replace(" ", "", $meta_tag_array);
    $meta_tag_array = str_replace("-", "", $meta_tag_array);
    $meta_tag = $meta_tag_array[0];

    // The novel name is the part of the title attribute that precedes "TXT"
    $meta_tag = split_string($meta_tag, "title=\"", AFTER, EXCL);
    $novel_name = split_string($meta_tag, "TXT", BEFORE, EXCL);

    // One insert per page. The original wrapped this in a loop bounded by
    // count($meta_tag); $meta_tag is a string, so the loop ran exactly once
    // on PHP 7 and count() throws a TypeError on PHP 8.
    $data = array('novel_id' => $xx, 'novel_name' => $novel_name);
    $this->insertmodel->insert_Novel($data);
}
?> </div> <p class="footer">Page rendered in <strong>{elapsed_time}</strong> seconds</p> </div>
/**
 * Removes every section delimited by $open_tag ... $close_tag from a string.
 *
 * The sections are located with parse_array() and each occurrence of each of
 * them is deleted from $string.
 *
 * @param string $string    text to clean
 * @param string $open_tag  text that starts a section to remove
 * @param string $close_tag text that ends a section to remove
 * @return string $string without the delimited sections
 */
function remove($string, $open_tag, $close_tag)
{
    # Get array of things that should be removed from the input string
    $remove_array = parse_array($string, $open_tag, $close_tag);
    if (empty($remove_array)) {
        return $string;
    }

    # str_replace() accepts the whole list and removes every element in one
    # pass; the previous loop repeated this identical call once per element
    return str_replace($remove_array, "", $string);
}
} echo "FOUND: id_column={$id_column}\n"; echo "FOUND: price_column={$price_column}\n"; echo "FOUND: name_column={$name_column}\n"; # Save the heading row for later use $heading_row = $table_row; } # Detect the end of the desired data $ending_landmark = "Calculate"; if (stristr($product_row_array[$table_row], $ending_landmark)) { echo "PARSING COMPLETE!\n"; break; } # Parse product & price data if (isset($heading_row) && $heading_row < $table_row) { $table_cell_array = parse_array($product_row_array[$table_row], "<td", "</td>"); $product_array[$product_count]['ID'] = strip_tags(trim($table_cell_array[$id_column])); $product_array[$product_count]['NAME'] = strip_tags(trim($table_cell_array[$name_column])); $product_array[$product_count]['PRICE'] = strip_tags(trim($table_cell_array[$price_column])); $product_count++; echo "PROCESSED: Item #{$product_count}\n"; } } } } # Display the collected data for ($xx = 0; $xx < count($product_array); $xx++) { echo "{$xx}. "; echo "ID: " . $product_array[$xx]['ID'] . ", "; echo "NAME: " . $product_array[$xx]['NAME'] . ", "; echo "PRICE: " . $product_array[$xx]['PRICE'] . "\n";
// Walks the pages queued in the database, records which external domains
// each page links to, marks the page processed, and mails the operator when
// the queue is empty.
include_once "LIB_simple_spider.php"; // spider routines used by this app.
include_once "LIB_db_functions.php";
include_once "LIB_encoding.php";
set_time_limit(0); // Don't let PHP timeout
db_connect();
//Before starting, check the domains fields of the database and fill in any missing entries
//Also fill in missing
//$strSQL="SELECT strURL,strDomain,strHTML FROM tblPages WHERE bolProcessed=0 AND bolCentral=1";
//$result = mysql_query($strSQL,$GLOBALS["db"]) or die('Query failed: ' . mysql_error());

// Process queued pages until db_get_next_to_process() returns a value
// loosely equal to null
$seed = db_get_next_to_process();
while ($seed != null) {
    $domain = $seed['strDomain'];
    $html = $seed['strHTML'];

    // Look at the href of every anchor in the stored HTML
    $atags = parse_array($html, "<a", "</a>");
    foreach ($atags as $tag) {
        $destURL = get_attribute($tag, "href");
        //echo "destURL: $destURL\n";
        // Only hrefs that contain "http://" or "https://" (anywhere in the
        // string) are recorded
        if (strpos($destURL, "http://") !== false || strpos($destURL, "https://") !== false) {
            $destDomain = get_domain($destURL);
            //echo "Saving To-From: $domain - $destDomain\n";
            db_update_domain_links($domain, $destDomain);
        }
    }
    db_marked_processed($seed);
    $seed = db_get_next_to_process();
}
db_close();
echo "Done.\n";
// $operator_email is not set in this part of the script
// (presumably defined by one of the included libraries - verify)
mail($operator_email, "Parse Success", "Finished parsing external linked domains: " . date('Y-m-d H:i:s') . "\n", "FROM: " . $operator_email);
/**
 * Downloads the XML of one status and crawls every <status> element in it.
 *
 * @param mixed $rts id of the status to fetch, inserted into the request URL
 * @return void
 */
function processRTS($rts)
{
    // No referer is sent: the original interpolated a local $tw that was
    // never defined in this function, which always produced an empty string.
    $replyXML = http_get("http://twitter.com/statuses/show/{$rts}.xml", "");
    $replyArray = parse_array($replyXML['FILE'], "<status>", "</status>");

    foreach ($replyArray as $reply) {
        crawlStatus($reply);
    }
}