/**
 * Download the FishBase archive and unpack it into a fresh temporary directory.
 *
 * Reads the archive location from $this->fishbase_data, stores the working
 * directory in $this->TEMP_FILE_PATH and, on success, fills $this->text_path
 * with the locations of the seven extracted text files. Nothing is returned;
 * when the download, the local write or the extraction fails,
 * $this->text_path is left untouched.
 */
function load_zip_contents()
{
    $this->TEMP_FILE_PATH = create_temp_dir() . "/";

    $file_contents = Functions::get_remote_file($this->fishbase_data, array('timeout' => 172800));
    if (!$file_contents) {
        echo "\n\n Connector terminated. Remote files are not ready.\n\n";
        return;
    }

    $temp_file_path = $this->TEMP_FILE_PATH . "/fishbase.zip";
    // fwrite()/fclose() on a failed handle only raise errors, so stop here.
    if (!($TMP = fopen($temp_file_path, "w"))) {
        echo "\n\n Connector terminated. Couldn't open file: {$temp_file_path}\n\n";
        return;
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);

    // Quote both paths so a temp directory containing spaces or shell
    // metacharacters cannot break (or alter) the command.
    shell_exec("tar -xzf " . escapeshellarg($temp_file_path) . " -C " . escapeshellarg($this->TEMP_FILE_PATH));

    if (!file_exists($this->TEMP_FILE_PATH . "/taxon.txt")) {
        // Fall back to a sub-directory named after the archive (".zip" stripped).
        $this->TEMP_FILE_PATH = str_ireplace(".zip", "", $temp_file_path);
        if (!file_exists($this->TEMP_FILE_PATH . "/taxon.txt")) {
            return;
        }
    }

    $extracted_files = array(
        'TAXON_PATH' => "taxon.txt",
        'TAXON_COMNAMES_PATH' => "taxon_comnames.txt",
        'TAXON_DATAOBJECT_PATH' => "taxon_dataobject.txt",
        'TAXON_DATAOBJECT_AGENT_PATH' => "taxon_dataobject_agent.txt",
        'TAXON_DATAOBJECT_REFERENCE_PATH' => "taxon_dataobject_reference.txt",
        'TAXON_REFERENCES_PATH' => "taxon_references.txt",
        'TAXON_SYNONYMS_PATH' => "taxon_synonyms.txt",
    );
    foreach ($extracted_files as $key => $filename) {
        $this->text_path[$key] = $this->TEMP_FILE_PATH . "/" . $filename;
    }
}
/**
 * Download a gzip-compressed resource document and return its uncompressed text.
 *
 * Only paths ending in ".gz" or ".gzip" are handled; for any other
 * $this->xml_path the function returns an empty string without downloading.
 * As a side effect $this->xml_path is re-pointed at the uncompressed file
 * inside the temporary directory, which is removed again before returning.
 *
 * @return string|false the document text (whatever get_remote_file() yields for
 *                      the unpacked file), "" for non-gzip paths, or false when
 *                      the archive cannot be downloaded or stored locally
 */
function load_xml_string()
{
    $file_contents = "";
    debug("Please wait, downloading resource document...");

    if (!preg_match("/^(.*)\\.(gz|gzip)\$/", $this->xml_path, $arr)) {
        return $file_contents;
    }

    $temp_dir = create_temp_dir() . "/";
    debug("temp file path: " . $temp_dir);

    $compressed = Functions::get_remote_file($this->xml_path, array('timeout' => 172800));
    if (!$compressed) {
        recursive_rmdir($temp_dir); // do not leave the empty temp dir behind
        debug("Connector terminated. Remote files are not ready.");
        return false;
    }

    // $arr[1] is the path without its compression suffix. The local copy is
    // always given a ".gz" suffix because gunzip skips suffixes it does not
    // recognise (such as ".gzip"), and stripping ".gz" from "x.xml.gzip"
    // would leave the bogus name "x.xmlip".
    $xml_filename = basename($arr[1]);
    $temp_file_path = $temp_dir . $xml_filename . ".gz";

    if (!($TMP = fopen($temp_file_path, "w"))) {
        recursive_rmdir($temp_dir);
        debug("Couldn't open file: " . $temp_file_path);
        return false;
    }
    fwrite($TMP, $compressed);
    fclose($TMP);

    shell_exec("gunzip -f " . escapeshellarg($temp_file_path));
    $this->xml_path = $temp_dir . $xml_filename;
    debug("xml path: " . $this->xml_path);

    echo "\n {$temp_dir} \n";
    $file_contents = Functions::get_remote_file($this->xml_path, array('timeout' => 172800));
    recursive_rmdir($temp_dir); // remove temp dir
    echo "\n temporary directory removed: [{$temp_dir}]\n";

    return $file_contents;
}
/**
 * Remove from a resource XML every <dataObject> whose dc:identifier is listed
 * in the remote "BHL images already in the EOL group" JSON file, then drop
 * taxa left without data objects and write the resource file back in place.
 *
 * @param mixed $resource_id resource whose CONTENT_RESOURCE_LOCAL_PATH . "{id}.xml" file is rewritten
 * @return void
 */
function remove_bhl_images_already_existing_in_eol_group($resource_id)
{
    $file = "http://dl.dropbox.com/u/7597512/BHL_images/BHL_images_in_EOLGroup.txt";
    $contents = Functions::get_remote_file($file, array('timeout' => 600, 'download_attempts' => 5));
    $do_ids = json_decode($contents, true);
    if (!is_array($do_ids)) {
        // Unreadable or invalid list: treat as "nothing to remove".
        $do_ids = array();
    }
    print "\n\n from text file: " . count($do_ids);

    // Index the identifiers once so each membership test is O(1).
    $ids_to_remove = array();
    foreach ($do_ids as $listed_id) {
        if (is_scalar($listed_id)) {
            $ids_to_remove[(string) $listed_id] = true;
        }
    }

    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    $xml_string = Functions::get_remote_file($resource_path, array('timeout' => 240, 'download_attempts' => 5));
    $xml = $xml_string ? simplexml_load_string($xml_string) : false;
    if ($xml === false) {
        print "\n\n cannot load resource XML: {$resource_path}";
        return;
    }

    $i = 0;
    $deleted_ids = array();
    $deleted = 0;
    foreach ($xml->taxon as $taxon) {
        $i++;
        $dwc = $taxon->children("http://rs.tdwg.org/dwc/dwcore/");
        echo "\n[" . $dwc->ScientificName . "]";

        $j = 0;
        $deleted_do_keys = array();
        foreach ($taxon->dataObject as $do) {
            $j++;
            $dc2 = $do->children("http://purl.org/dc/elements/1.1/");
            $do_id = trim($dc2->identifier);
            if (isset($ids_to_remove[$do_id])) {
                $deleted++;
                $deleted_ids[$do_id] = 1;
                print "\n --- deleting {$do_id}";
                $deleted_do_keys[] = $j - 1;
            }
        }

        // Remove from the highest index down: SimpleXML re-numbers the
        // remaining siblings after every unset(), so removing in ascending
        // order would shift the later indexes onto the wrong elements.
        foreach (array_reverse($deleted_do_keys) as $key) {
            unset($xml->taxon[$i - 1]->dataObject[$key]);
        }
    }

    print "\n\n occurrence do_ids: {$i}";
    print "\n\n deleted <dataObject>s: {$deleted}";
    print "\n\n deleted unique do_ids: " . count($deleted_ids);

    $xml_string = $xml->asXML();
    require_library('ResourceDataObjectElementsSetting');
    $xml_string = ResourceDataObjectElementsSetting::delete_taxon_if_no_dataObject($xml_string);

    if (!($WRITE = Functions::file_open($resource_path, "w"))) {
        return;
    }
    fwrite($WRITE, $xml_string);
    fclose($WRITE);
}
/**
 * Compile the per-species XML documents into a single resource file.
 *
 * Every URL returned by self::get_species_urls() is downloaded (with a two
 * second pause between requests); the text from its first "<taxon>" up to
 * "</response>" is appended to CONTENT_RESOURCE_LOCAL_PATH . "{resource_id}.xml"
 * inside a freshly written <response> envelope. Documents that do not parse, or
 * that lack those markers, are reported and skipped.
 *
 * @param mixed $resource_id resource whose XML file is (re)written
 * @return void
 */
function combine_all_xmls($resource_id)
{
    if (!($species_urls = self::get_species_urls())) {
        return;
    }
    debug("\n\n Start compiling all XML...");

    $old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    if (!($OUT = fopen($old_resource_path, "w+"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $old_resource_path);
        return;
    }

    $str = "<?xml version='1.0' encoding='utf-8' ?>\n";
    $str .= "<response\n";
    $str .= " xmlns='http://www.eol.org/transfer/content/0.3'\n";
    $str .= " xmlns:xsd='http://www.w3.org/2001/XMLSchema'\n";
    $str .= " xmlns:dc='http://purl.org/dc/elements/1.1/'\n";
    $str .= " xmlns:dcterms='http://purl.org/dc/terms/'\n";
    $str .= " xmlns:geo='http://www.w3.org/2003/01/geo/wgs84_pos#'\n";
    $str .= " xmlns:dwc='http://rs.tdwg.org/dwc/dwcore/'\n";
    $str .= " xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'\n";
    $str .= " xsi:schemaLocation='http://www.eol.org/transfer/content/0.3 http://services.eol.org/schema/content_0_3.xsd'>\n";
    fwrite($OUT, $str);

    $i = 0;
    $total = sizeof($species_urls);
    foreach ($species_urls as $filename) {
        $i++;
        print "\n {$i} of {$total}";
        sleep(2);

        $contents = Functions::get_remote_file($filename);
        if (!$contents || !simplexml_load_string($contents)) {
            print "\n {$filename} - invalid XML";
            continue;
        }

        $contents = str_ireplace("http://creativecommons.org/licenses/by-nc-sa/2.5/mx/", "http://creativecommons.org/licenses/by-nc-sa/2.5/", $contents);

        $pos1 = stripos($contents, "<taxon>");
        $pos2 = stripos($contents, "</response>");
        // stripos() returns false when a marker is missing; using that as an
        // offset would copy the document's own prolog and <response> header
        // into the compiled file and corrupt it.
        if ($pos1 === false || $pos2 === false || $pos2 < $pos1) {
            print "\n {$filename} - no <taxon> content found";
            continue;
        }
        fwrite($OUT, substr($contents, $pos1, $pos2 - $pos1));
    }

    fwrite($OUT, "</response>");
    fclose($OUT);
    print "\n All XML compiled\n -end-of-process- \n";
}
/**
 * Repair doubled quotes in the media.txt of a Darwin Core archive and publish
 * the cleaned archive under CONTENT_RESOURCE_LOCAL_PATH.
 *
 * The archive is extracted through INBioAPI::extract_archive_file(); media.txt
 * is rewritten, the extracted directory is re-packed as "{resource_id}.tar.gz",
 * its files are copied to the "{resource_id}/" directory, and the temporary
 * extraction directory is removed.
 *
 * @param mixed  $resource_id resource identifier used for the output names
 * @param string $dwca_file   location of the archive to clean
 * @return void
 */
function clean_media_extension($resource_id, $dwca_file)
{
    require_library('connectors/INBioAPI');
    $func = new INBioAPI();
    $paths = $func->extract_archive_file($dwca_file, "meta.xml");
    if (!$paths) {
        return;
    }
    print_r($paths);

    $archive_path = $paths['archive_path'];
    $media_file = $archive_path . "media.txt";

    if ($contents = Functions::get_remote_file($media_file, array('timeout' => 172800))) {
        $contents = str_ireplace('<a title=""', '<a title="', $contents);
        $contents = str_ireplace('"" href=""', '" href="', $contents);
        $contents = str_ireplace('"">', '">', $contents);

        // saving new media.txt
        if (!($WRITE = fopen($media_file, "w"))) {
            debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $media_file);
            recursive_rmdir($archive_path); // do not leak the extraction directory
            return;
        }
        fwrite($WRITE, $contents);
        fclose($WRITE);

        // remove the archive file e.g. plazi.zip
        $info = pathinfo($dwca_file);
        $archive_copy = $archive_path . $info["basename"];
        if (is_file($archive_copy)) {
            unlink($archive_copy);
        }

        // creating the archive file; arguments are quoted so paths with spaces
        // or shell metacharacters cannot break the command
        $tar_file = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".tar.gz";
        $command_line = "tar -czf " . escapeshellarg($tar_file) . " --directory=" . escapeshellarg($archive_path) . " .";
        $output = shell_exec($command_line);

        // moving files to /resources/
        $dst = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . "/";
        recursive_rmdir(CONTENT_RESOURCE_LOCAL_PATH . $resource_id);
        if (!file_exists(CONTENT_RESOURCE_LOCAL_PATH . $resource_id)) {
            mkdir(CONTENT_RESOURCE_LOCAL_PATH . $resource_id);
        }
        $files = glob($archive_path . "*.*");
        if ($files) {
            foreach ($files as $file) {
                copy($file, $dst . basename($file));
            }
        }
    }

    // remove temp dir
    recursive_rmdir($archive_path);
    echo "\n temporary directory removed: " . $archive_path;
}
$url_list_of_group_ids = "http://www.morphbank.net/eolids.xml"; */ include_once dirname(__FILE__) . "/../../config/environment.php"; $timestart = time_elapsed(); $mysqli =& $GLOBALS['mysqli_connection']; $resource_id = 83; $details_method_prefix = "http://services.morphbank.net/mb3/request?method=id&format=svc&limit=2&id="; $image_ids = array(); $schema_taxa = array(); $used_taxa = array(); $url_list_of_image_ids = "http://services.morphbank.net/mb3/request?method=eol&format=id&limit=-1"; /* Excludes MorphBank IDs as suggested by BioImages Vanderbuilt */ $excluded_MorphBank_IDs = prepare_excluded_ids(); if ($url_list_of_image_ids) { print "\n [url_list_of_image_ids: {$url_list_of_image_ids}] \n"; $response = Functions::get_remote_file($url_list_of_image_ids, array('download_wait_time' => 1000000, 'timeout' => 600, 'download_attempts' => 5)); $image_id_xml = simplexml_load_string($response); if ($image_id_xml) { foreach ($image_id_xml->id as $id) { $image_ids[] = $id; } } } $total_image_ids = count($image_ids); print "\n count of image ID's = {$total_image_ids}"; if ($total_image_ids == 0) { exit("\n Program will terminate. MorphBank service not ready."); } /* loop through image ids */ $k = 0; foreach ($image_ids as $image_id) {
/**
 * Fetch a URL through a simple on-disk cache keyed by md5($url).
 *
 * Cache files live at {cache_path}[{resource_id}/]{aa}/{bb}/{md5}.cache, where
 * aa and bb are the first two character pairs of the hash. A cached copy is
 * returned when it is non-empty (or the string "0"), matches the optional
 * validation regex, and is younger than expire_seconds (or expire_seconds is
 * exactly false, meaning "never expires"). Otherwise the stale file is deleted
 * and the URL is downloaded again with Functions::get_remote_file().
 *
 * Recognised $options keys (all optional; the whole array is also forwarded to
 * get_remote_file()):
 *  - expire_seconds   int|false, default 2592000 (30 days)
 *  - timeout          default 120
 *  - cache_path       base directory, default "/Volumes/Eli black/eol_cache/"
 *  - resource_id      adds a per-resource sub-directory
 *  - validation_regex pattern body (no delimiters) the cached text must match
 *
 * @param string $url
 * @param array  $options
 * @return mixed the cached or freshly downloaded contents
 */
public static function lookup_with_cache($url, $options = array())
{
    // default expire time is 30 days
    if (!isset($options['expire_seconds'])) {
        $options['expire_seconds'] = 2592000;
    }
    if (!isset($options['timeout'])) {
        $options['timeout'] = 120;
    }
    // if(!isset($options['cache_path'])) $options['cache_path'] = DOC_ROOT . "tmp/cache/";
    if (!isset($options['cache_path'])) {
        $options['cache_path'] = "/Volumes/Eli black/eol_cache/";
    }

    $md5 = md5($url);
    $cache1 = substr($md5, 0, 2);
    $cache2 = substr($md5, 2, 2);

    if ($resource_id = @$options['resource_id']) {
        $options['cache_path'] .= "{$resource_id}/";
        if (!file_exists($options['cache_path'])) {
            mkdir($options['cache_path']);
        }
    }
    // Directories are created one level at a time on purpose: the base
    // cache_path itself is never created here.
    if (!file_exists($options['cache_path'] . $cache1)) {
        mkdir($options['cache_path'] . $cache1);
    }
    if (!file_exists($options['cache_path'] . "{$cache1}/{$cache2}")) {
        mkdir($options['cache_path'] . "{$cache1}/{$cache2}");
    }
    $cache_path = $options['cache_path'] . "{$cache1}/{$cache2}/{$md5}.cache";

    if (file_exists($cache_path)) {
        $file_contents = file_get_contents($cache_path);
        $cache_is_valid = true;
        if (@$options['validation_regex'] && !preg_match("/" . $options['validation_regex'] . "/ims", $file_contents)) {
            $cache_is_valid = false;
        }
        if ($cache_is_valid && ($file_contents || strval($file_contents) == "0")) {
            $file_age_in_seconds = time() - filemtime($cache_path);
            if ($file_age_in_seconds < $options['expire_seconds']) {
                return $file_contents;
            }
            if ($options['expire_seconds'] === false) {
                return $file_contents;
            }
        }
        @unlink($cache_path);
    }

    $file_contents = Functions::get_remote_file($url, $options);
    if ($FILE = Functions::file_open($cache_path, 'w+')) {
        fwrite($FILE, $file_contents);
        fclose($FILE);
    } elseif ($h = Functions::file_open(DOC_ROOT . "/public/tmp/cant_delete.txt", 'a')) {
        // Record cache files that could not be (re)written.
        fwrite($h, $cache_path . "\n");
        fclose($h);
    }
    // The download is returned even when neither the cache file nor the log
    // could be opened; losing the cache must not lose the caller's data.
    return $file_contents;
}
/**
 * Request one page of photos from Flickr and return the decoded JSON response.
 *
 * The flickr.groups.pools.getPhotos method is used by default. When $user_id
 * is FLICKR_BHL_ID or FLICKR_SMITHSONIAN_ID, flickr.photos.search is used
 * instead (no group_id), restricted by license, privacy filter and the
 * optional taken-date range, so photos come from the user's whole photostream.
 *
 * @return mixed result of json_decode() on the response body
 */
public static function pools_get_photos($group_id, $machine_tag, $per_page, $page, $auth_token = "", $user_id = NULL, $start_date = NULL, $end_date = NULL)
{
    $extras = "last_update,media,url_o";

    $pool_parameters = array(
        "group_id" => $group_id,
        "machine_tags" => $machine_tag,
        "extras" => $extras,
        "per_page" => $per_page,
        "page" => $page,
        "auth_token" => $auth_token,
        "user_id" => $user_id,
        "format" => "json",
        "nojsoncallback" => 1,
    );
    $url = self::generate_rest_url("flickr.groups.pools.getPhotos", $pool_parameters, 1);

    $photostream_users = array(FLICKR_BHL_ID, FLICKR_SMITHSONIAN_ID);
    if (in_array($user_id, $photostream_users)) {
        /* remove group_id param to get images from photostream, and not only those in the EOL Flickr group */
        $search_parameters = array(
            "machine_tags" => $machine_tag,
            "extras" => $extras,
            "per_page" => $per_page,
            "page" => $page,
            "auth_token" => $auth_token,
            "user_id" => $user_id,
            "license" => "1,2,4,5,7",
            "privacy_filter" => "1",
            "sort" => "date-taken-asc",
            "min_taken_date" => $start_date,
            "max_taken_date" => $end_date,
            "format" => "json",
            "nojsoncallback" => 1,
        );
        $url = self::generate_rest_url("flickr.photos.search", $search_parameters, 1);
    }

    $response = Functions::get_remote_file($url, array('timeout' => 30));
    return json_decode($response);
}
/**
 * Download SPIRE service documents 1..259 (SPIRE_SERVICE . $i) and assemble the
 * food-web data they contain.
 *
 * For every document: "rdf:resource" is renamed to "rdf_resource", the text is
 * passed through utf8_encode() and parsed with SimpleXML.
 *  - Each <ConfirmedFoodWebLink> yields a predator name, a prey name and a
 *    study reference, all taken from the URL fragment of the element's
 *    attributes (underscores in the names become spaces). They populate
 *    $arr_taxa[name]['desc'], the predator => prey and prey => predator lists,
 *    and $arr_ref[ref]['predator' | 'prey'].
 *  - Each <Study> yields habitat, locality (via self::parse_locality()), title
 *    and year, stored in $reference[$ref_num].
 * Afterwards the ancestry spreadsheet (SPIRE_PATH_ANCESTRY) is read through
 * XLSParser and an 'ancestry' entry is added for every taxon found in it.
 *
 * NOTE(review): inside the <Study> loop $ref_num is not re-read from the study;
 * it still holds the value assigned by the last <ConfirmedFoodWebLink> of the
 * same document - confirm this is intended.
 * NOTE(review): $reference is never initialised, so it is undefined at the
 * final return when no <Study> element was seen; the result of
 * simplexml_load_string() is also used without a failure check.
 *
 * @return array|false array($arr_taxa, $arr_ref, $reference), or false as soon
 *                     as one service document cannot be downloaded
 */
function assemble_xml_files() { $arr_taxa = array(); $arr_predator = array(); $arr_prey = array(); $arr_ref = array(); for ($i = 1; $i <= 259; $i++) { print "\n {$i} ---" . SPIRE_SERVICE . $i; if (!($str = Functions::get_remote_file(SPIRE_SERVICE . $i))) { echo "\n\nSPIRE service not available at the moment.\n\n"; return false; } $str = str_replace('rdf:resource', 'rdf_resource', $str); $str = utf8_encode($str); $xml = simplexml_load_string($str); foreach ($xml->ConfirmedFoodWebLink as $rec) { foreach ($rec->predator[0]->attributes() as $attribute => $value) { $arr = parse_url($value); $predator = trim(@$arr['fragment']); $predator = str_replace("_", " ", $predator); } $pred_desc = trim($rec->predator_description); foreach ($rec->prey[0]->attributes() as $attribute => $value) { $arr = parse_url($value); $prey = trim(@$arr['fragment']); $prey = str_replace("_", " ", $prey); } $prey_desc = trim($rec->prey_description); foreach ($rec->observedInStudy[0]->attributes() as $attribute => $value) { $arr = parse_url($value); $ref_num = trim($arr['fragment']); } $arr_taxa[$predator]['desc'] = $pred_desc; $arr_taxa[$prey]['desc'] = $prey_desc; if (!@$arr_predator[$predator]) { $arr_predator[$predator][] = $prey; } if (!@$arr_prey[$prey]) { $arr_prey[$prey][] = $predator; } if (!in_array($prey, $arr_predator[$predator])) { $arr_predator[$predator][] = $prey; } if (!in_array($predator, $arr_prey[$prey])) { $arr_prey[$prey][] = $predator; } if (!@$arr_ref[$ref_num]['predator']) { $arr_ref[$ref_num]['predator'][] = $predator; } if (!@$arr_ref[$ref_num]['prey']) { $arr_ref[$ref_num]['prey'][] = $prey; } if (!in_array($predator, $arr_ref[$ref_num]['predator'])) { $arr_ref[$ref_num]['predator'][] = $predator; } if (!in_array($prey, $arr_ref[$ref_num]['prey'])) { $arr_ref[$ref_num]['prey'][] = $prey; } } foreach ($xml->Study as $rec) { $habitats = array(); foreach ($rec->ofHabitat as $habitat) { foreach ($habitat->attributes() as $attribute => $value) { $arr = parse_url($value); 
$habitat = trim($arr['fragment']); $habitats[] = str_replace("_", " ", $habitat); } } $habitats = implode(", ", $habitats); if ($habitats == "unknown") { $habitats = ""; } $place = self::parse_locality(trim($rec->locality)); $country = @$place["country"]; $state = @$place["state"]; $locality = @$place["locality"]; //debug /* if ( is_numeric(stripos(trim($rec->titleAndAuthors),"Animal Diversity Web")) || is_numeric(stripos(trim($rec->titleAndAuthors),"Rockefeller")) || is_numeric(stripos(trim($rec->titleAndAuthors),"data base of food webs")) || is_numeric(stripos(trim($rec->titleAndAuthors),"foodwebs")) || is_numeric(stripos(trim($rec->titleAndAuthors),"Webs on the Web")) || is_numeric(stripos(trim($rec->titleAndAuthors),"NCEAS")) || is_numeric(stripos(trim($rec->titleAndAuthors),"Interaction Web Database")) || is_numeric(stripos(trim($rec->titleAndAuthors),"Co-Operative Web Bank")) ) {print"\n problem here: [$i] [trim($rec->titleAndAuthors)]";} */ $titleAndAuthors = trim($rec->titleAndAuthors); if ($titleAndAuthors == "Animal Diversity Web") { $titleAndAuthors = "Myers, P., R. Espinosa, C. S. Parr, T. Jones, G. S. Hammond, and T. A. Dewey. 2006. The Animal Diversity Web (online). Accessed February 16, 2011 at http://animaldiversity.org. 
http://www.animaldiversity.org"; } $reference[$ref_num] = array("titleAndAuthors" => $titleAndAuthors, "publicationYear" => trim($rec->publicationYear), "place" => trim($rec->locality), "country" => $country, "state" => $state, "locality" => $locality, "habitat" => $habitats); } } //main loop 1-259 //for ancestry require_library('XLSParser'); $parser = new XLSParser(); $names = $parser->convert_sheet_to_array(SPIRE_PATH_ANCESTRY); $ancestry = array(); foreach ($arr_taxa as $taxon => $temp) { $arr_taxa[$taxon]['objects'] = array("predator" => @$arr_predator[$taxon], "prey" => @$arr_prey[$taxon]); //start ancestry $key = array_search(trim($taxon), $names['tname']); if (strval($key) != "") { $parent_id = $names['parent_id'][$key]; $ancestry = self::get_ancestry($key, $names); $arr_taxa[$taxon]['ancestry'] = $ancestry; } } /* print"<pre>"; print_r($arr_taxa); print_r($arr_ref); print_r($reference); print"</pre>"; */ return array($arr_taxa, $arr_ref, $reference); }
/**
 * Collect the "database" values from every vm_search.php link on the main page.
 *
 * Downloads $this->domain, looks at each href="vm_search.php..." attribute and
 * keeps the text between "database=" and the following "&". The resulting list
 * is printed with print_r() and returned; it is empty when the page cannot be
 * downloaded (a warning line is echoed in that case) or has no such links.
 *
 * @return array list of group names
 */
private function get_main_groups()
{
    $groups = array();

    $download_options = array('timeout' => 9600, 'download_attempts' => 2, 'delay_in_minutes' => 5);
    $html = Functions::get_remote_file($this->domain, $download_options);

    if (!$html) {
        echo "\n investigate: main site is down\n";
    } elseif (preg_match_all("/href=\"vm_search\\.php(.*?)\"/ims", $html, $links)) {
        foreach ($links[1] as $query_string) {
            if (preg_match("/database\\=(.*?)\\&/ims", $query_string, $found)) {
                $groups[] = $found[1];
            }
        }
    }

    print_r($groups);
    return $groups;
}
/**
 * Download the archive at $this->zip_path and unzip it into a fresh temporary
 * directory (stored in $this->TEMP_FILE_PATH).
 *
 * @return bool TRUE when "all.xml" exists in the temporary directory after
 *              extraction; FALSE when the download, the local write or the
 *              extraction did not produce it
 */
function load_zip_contents()
{
    $this->TEMP_FILE_PATH = create_temp_dir() . "/";

    $file_contents = Functions::get_remote_file($this->zip_path, array('timeout' => 999999, 'download_attempts' => 5));
    if (!$file_contents) {
        debug("\n\n Connector terminated. Remote files are not ready.\n\n");
        return FALSE;
    }

    $parts = pathinfo($this->zip_path);
    $temp_file_path = $this->TEMP_FILE_PATH . "/" . $parts["basename"];
    if (!($TMP = fopen($temp_file_path, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $temp_file_path);
        // Every other failure path reports FALSE; do the same here instead of
        // falling out with NULL.
        return FALSE;
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);

    // The archive name comes from $this->zip_path, so quote both arguments
    // before handing them to the shell.
    shell_exec("unzip " . escapeshellarg($temp_file_path) . " -d " . escapeshellarg($this->TEMP_FILE_PATH));

    return file_exists($this->TEMP_FILE_PATH . "/all.xml") ? TRUE : FALSE;
}
/**
 * Split a downloaded document into records and hand each one to
 * prepare_agent_rights().
 *
 * The text is cleaned with clean_str(), every "<br><br>" is turned into the
 * query-string separator "&arr[]=", and the result is decoded with parse_str()
 * so that each record becomes one element of an array. For every record the
 * scientific name (text following the first "<b>") and the description (text
 * following "</i></b>") are extracted with parse_html().
 *
 * @param string $file   location of the document to download
 * @param mixed  $doc_id identifier passed through to prepare_agent_rights()
 * @return void
 */
function process_file1($file, $doc_id)
{
    global $wrap;
    global $used_taxa;

    print "{$wrap}";
    $str = Functions::get_remote_file($file);
    $str = clean_str($str);
    $str = str_ireplace('<br><br>', "&arr[]=", $str);
    $str = trim($str);
    $str = substr($str, 0, strlen($str) - 7); //to remove last part of string "&arr[]="

    // parse_str() without a result array was removed in PHP 8 and, before
    // that, created a local variable for every key found in the downloaded
    // text. Decode into an explicit array and keep only the "arr" entries.
    $parsed = array();
    parse_str($str, $parsed);
    $arr = (isset($parsed['arr']) && is_array($parsed['arr'])) ? $parsed['arr'] : array();
    print "after parse_str recs = " . count($arr) . "{$wrap} {$wrap}";

    $i = 0;
    foreach ($arr as $str) {
        $str = clean_str($str);
        $str = str_ireplace("< /i>", "</i>", $str);
        $i++;

        // get sciname, e.g. <b><i>Abrus precatorius</i></b>
        $beg = '<b>';
        $end1 = '</i></b>';
        $end2 = '</i>';
        $end3 = '</b>';
        $sciname = strip_tags(trim(parse_html($str, $beg, $end1, $end2, $end3, $end1, "")));
        $sciname = str_ireplace(chr(13), "", $sciname);
        $sciname = str_ireplace(chr(10), "", $sciname);
        $sciname = trim($sciname);

        // get desc: everything after the name; "xxx" is appended as an end marker
        $str .= "xxx";
        $beg = '</i></b>';
        $end1 = 'xxx';
        $desc = strip_tags(trim(parse_html($str, $beg, $end1, $end1, $end1, $end1, "")));
        $last_char_of_desc = substr($desc, strlen($desc) - 1, 1);
        if ($last_char_of_desc == ",") {
            $desc = substr($desc, 0, strlen($desc) - 1);
        }
        $desc .= ".";

        if ($sciname == "") {
            print "jjj";
        }
        print "{$i}. {$sciname} {$wrap}";
        prepare_agent_rights($doc_id, $sciname, $desc);
    } //main loop
}
/**
 * Page through the Tropicos master name list and write every NameId, one per
 * line, to "{TEMP_FILE_PATH}tropicos_ids.txt".
 *
 * Each request asks for $pagesize records starting at $startid; the next page
 * starts one past the last NameId received. The loop stops when a request
 * fails or after 1300 requests.
 *
 * @return void
 */
function build_id_list()
{
    $ids_file = $this->TEMP_FILE_PATH . "tropicos_ids.txt";
    if (!($OUT = fopen($ids_file, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $ids_file);
        return;
    }

    $startid = 0; // debug orig value 0; 1600267 with mediaURL and <location>; 1201245 with thumbnail size images
    // pagesize is the no. of records returned from Tropicos master list service
    $pagesize = 1000; // debug orig value 1000

    // The wait time belongs inside the options array, like every other
    // get_remote_file() call; as a separate positional argument it took the
    // place of the options and the timeout/attempt settings were not applied there.
    $download_options = array('download_wait_time' => DOWNLOAD_WAIT_TIME, 'timeout' => 4800, 'download_attempts' => 5);

    for ($count = 1; $count <= 1300; $count++) {
        $url = TROPICOS_API_SERVICE . "List?startid={$startid}&PageSize={$pagesize}&apikey=" . TROPICOS_API_KEY . "&format=json";
        echo "\n[{$count}] {$url}";

        $json_ids = Functions::get_remote_file($url, $download_options);
        if (!$json_ids) {
            echo "\n --server not accessible-- \n";
            break;
        }

        $ids = json_decode($json_ids, true);
        if (!is_array($ids)) {
            // Undecodable page: nothing to record for this request.
            echo "\n invalid JSON response";
            $ids = array();
        }

        $str = "";
        foreach ($ids as $id) {
            if (is_array($id) && !empty($id["NameId"])) {
                $str .= $id["NameId"] . "\n";
                $startid = $id["NameId"];
            } else {
                echo "\n nameid undefined";
            }
        }
        $startid++; // to avoid duplicate ids, set next id to get

        if ($str != "") {
            fwrite($OUT, $str);
        }
    }
    fclose($OUT);
}
/**
 * Download a zip archive, extract it into a new temporary directory and return
 * the locations of the expected files.
 *
 * @param string $zip_path         location of the archive
 * @param array  $download_options options forwarded to Functions::get_remote_file()
 * @param array  $files            base names (without extension) expected in the archive
 * @param string $extension        extension appended to every base name, e.g. ".txt"
 * @return array|null map of base name => extracted path; an empty array when
 *                    the download fails; NULL when the archive cannot be
 *                    stored locally or the first expected file is missing
 */
private function load_zip_contents($zip_path, $download_options, $files, $extension)
{
    $text_path = array();
    $temp_path = create_temp_dir();

    $file_contents = Functions::get_remote_file($zip_path, $download_options);
    if (!$file_contents) {
        debug("\n\n Connector terminated. Remote files are not ready.\n\n");
        return $text_path;
    }

    $parts = pathinfo($zip_path);
    $temp_file_path = $temp_path . "/" . $parts["basename"];
    if (!($TMP = Functions::file_open($temp_file_path, "w"))) {
        return;
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);

    // The archive name is derived from the caller-supplied $zip_path, so both
    // arguments are quoted before reaching the shell.
    shell_exec("unzip " . escapeshellarg($temp_file_path) . " -d " . escapeshellarg($temp_path));

    // The first expected file is the marker for a successful extraction.
    if (!isset($files[0]) || !file_exists($temp_path . "/" . $files[0] . $extension)) {
        return;
    }
    foreach ($files as $file) {
        $text_path[$file] = $temp_path . "/" . $file . $extension;
    }
    return $text_path;
}
<?php
namespace php_active_record;

/*
 Refreshes resource 22: downloads the gzip-compressed Animal Diversity Web
 export, unpacks it under DOC_ROOT/temp, unwraps bare <a>...</a> elements, makes
 sure the document starts with an XML declaration and writes the result to
 CONTENT_RESOURCE_LOCAL_PATH/22.xml.
*/
include_once dirname(__FILE__) . "/../../config/environment.php";

$new_resource_path = DOC_ROOT . "temp/22.xml.gz";
$new_resource = Functions::get_remote_file("http://animaldiversity.ummz.umich.edu/XML/adw_eol.xml.gz");
// $new_resource = Functions::get_remote_file("http://localhost/eol_php_code/applications/content_server/resources/adw_eol.xml.gz");

// Stop before touching any file: continuing with an empty download would end
// up overwriting the published 22.xml with stale or empty content.
if (!$new_resource) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't download the ADW resource");
    return;
}

if (!($OUT = fopen($new_resource_path, "w+"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $new_resource_path);
    return;
}
fwrite($OUT, $new_resource);
fclose($OUT);

shell_exec("gunzip -f " . escapeshellarg($new_resource_path));
$new_resource_path = DOC_ROOT . "temp/22.xml";

$xml = is_file($new_resource_path) ? file_get_contents($new_resource_path) : false;
if ($xml === false || $xml === "") {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't read file: " . $new_resource_path);
    return;
}

// drop <a> wrappers that carry no attributes, keeping their text
$xml = preg_replace("/<a>([^<]+)<\\/a>/", "\\1", $xml);
if (substr_count($xml, "<?xml") == 0) {
    $xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" . $xml;
}

$old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . "22.xml";
if (!($OUT = fopen($old_resource_path, "w+"))) {
    debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $old_resource_path);
    return;
}
fwrite($OUT, $xml);
fclose($OUT);

// remove the unpacked temporary copy (no shell needed for this)
unlink($new_resource_path);
/**
 * Parse one checklist page and register every species row it lists.
 *
 * The page is cut at each "<tr valign=bottom>"; every piece after the first
 * describes one family (name inside <b>...</b>, optionally "ORDER: Family")
 * followed by its species rows. Each accepted row is stored in
 * $this->names_in_families[taxon name] with its family, common name, avibaseid
 * and conservation status; "ORDER: Family" headings also fill
 * $this->family_orders. A row is skipped when the taxon is already known with
 * a different family or a different avibaseid. When $this->for_testing is set,
 * processing stops after the first family that has a name.
 *
 * @param string $url address of the checklist page
 * @return void
 */
function get_taxa_from_html($url)
{
    // 20mins download timeout, 5 retry attempts
    $page = Functions::get_remote_file($url, array('timeout' => 1200, 'download_attempts' => 5));

    $family_blocks = explode("<tr valign=bottom>", $page);
    // the first block doesn't contain name information so remove it
    array_shift($family_blocks);

    // each block corresponds to a Family and its species
    foreach ($family_blocks as $block) {
        // the last block will also have the tail end of the HTML which we also don't need
        if (preg_match("/^(.*?)<\\/table>/ims", $block, $before_table_end)) {
            $block = $before_table_end[1];
        }

        // pull out the family
        if (!preg_match("/<b>(.*?)<\\/b>/ims", $block, $heading)) {
            continue;
        }
        $family = trim($heading[1]);

        // sometimes the Family is really => ORDER: Family
        // Families can be Incertae Sedis, Genera Incertae Sedis, Genus Incertae Sedis, ...
        if (preg_match("/^([a-z]+): (.+)\$/ims", $family, $order_and_family)) {
            $family = ucfirst(strtolower($order_and_family[2]));
            $this->family_orders[$family] = ucfirst(strtolower($order_and_family[1]));
        }

        $row_pattern = "/<tr><td>(.*?)<\\/td><td><a href=\"species.jsp\\?avibaseid=(.*?)\">(.*?)<\\/a><\\/td><td>(.*?)<\\/td><\\/tr>/ims";
        if (preg_match_all($row_pattern, $block, $rows, PREG_SET_ORDER)) {
            foreach ($rows as $row) {
                $common_name = trim($row[1]);
                $avibaseid = trim($row[2]);
                $taxon_name = trim($row[3]);
                $conservation_status = trim($row[4]);

                if (preg_match("/<i>(.*?)<\\/i>/ims", $taxon_name, $italic)) {
                    $taxon_name = trim($italic[1]);
                }

                $known = isset($this->names_in_families[$taxon_name]) ? $this->names_in_families[$taxon_name] : null;
                if ($known) {
                    // this means that in one regional checklist they place this taxon in a different family
                    if ($known['family'] != $family) {
                        debug("Family Conflict with {$taxon_name}\n");
                        continue;
                    }
                    // this means that in one regional checklist they use a different URL for the taxon
                    if ($known['avibaseid'] != $avibaseid) {
                        debug("ID Conflict with {$taxon_name}\n");
                        continue;
                    }
                }

                $this->names_in_families[$taxon_name] = array(
                    'taxon_name' => $taxon_name,
                    'family' => $family,
                    'common_name' => $common_name,
                    'avibaseid' => $avibaseid,
                    'conservation_status' => $conservation_status,
                );
            }
        }

        if ($this->for_testing) {
            break;
        }
    }
}
/**
 * Download the IRMNG archive at $this->zip_path, unzip it into a fresh
 * temporary directory and record the two CSV locations in $this->text_path.
 *
 * $this->download_options is used for the download with the timeout raised to
 * 999999. The CSV files are looked for directly in the temporary directory and
 * then in a sub-directory named after the archive (".zip" removed);
 * $this->TEMP_FILE_PATH is updated to whichever location holds them.
 *
 * @return bool true when IRMNG_DWC_20140131.csv was found, false otherwise
 */
private function load_zip_contents()
{
    $this->TEMP_FILE_PATH = create_temp_dir() . "/";

    $options = $this->download_options;
    $options['timeout'] = 999999;
    $file_contents = Functions::get_remote_file($this->zip_path, $options);
    if (!$file_contents) {
        debug("\n\n Connector terminated. Remote files are not ready.\n\n");
        return false;
    }

    $parts = pathinfo($this->zip_path);
    $temp_file_path = $this->TEMP_FILE_PATH . "/" . $parts["basename"];
    if (!($TMP = fopen($temp_file_path, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $temp_file_path);
        // Report failure as false like the other failure paths, not NULL.
        return false;
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);

    // Quote both paths; the archive name is derived from $this->zip_path.
    shell_exec("unzip " . escapeshellarg($temp_file_path) . " -d " . escapeshellarg($this->TEMP_FILE_PATH));

    if (!file_exists($this->TEMP_FILE_PATH . "/IRMNG_DWC_20140131.csv")) {
        // Try a sub-directory named after the archive before giving up.
        $this->TEMP_FILE_PATH = str_ireplace(".zip", "", $temp_file_path);
        if (!file_exists($this->TEMP_FILE_PATH . "/IRMNG_DWC_20140131.csv")) {
            return false;
        }
    }

    $this->text_path["IRMNG_DWC"] = $this->TEMP_FILE_PATH . "/IRMNG_DWC_20140131.csv";
    $this->text_path["IRMNG_DWC_SP_PROFILE"] = $this->TEMP_FILE_PATH . "/IRMNG_DWC_SP_PROFILE_20140131.csv";
    return true;
}
<?php
namespace php_active_record;

/* Tropical Lichens connector
estimated execution time: 8 seconds
Partner provides a service that resembles an EOL XML.
The response is stored unchanged as the resource file for resource 69. */
include_once dirname(__FILE__) . "/../../config/environment.php";

$timestart = time_elapsed();
$resource_id = 69;
$url = 'http://www.tropicallichens.net/eolclient.aspx';

if ($xml_content = Functions::get_remote_file($url)) {
    $resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    if (!($OUT = fopen($resource_path, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
        return;
    }
    fwrite($OUT, $xml_content);
    fclose($OUT);
} else {
    // The message used to interpolate $i, which is never defined in this
    // script; report the address that returned nothing instead.
    print "\n no contents from {$url}";
}

$elapsed_time_sec = time_elapsed() - $timestart;
echo "\n";
echo "elapsed time = " . $elapsed_time_sec . " seconds \n";
echo "elapsed time = " . $elapsed_time_sec / 60 . " minutes \n";
echo "elapsed time = " . $elapsed_time_sec / 60 / 60 . " hours \n";
echo "\n\n Done processing.";
/**
 * Download the WoRMS record for one taxon and return its <taxon> element.
 *
 * The document at WORMS_TAXON_API . $id must parse as XML; the text from the
 * first "<taxon>" through the first "</taxon>" (inclusive) is returned and the
 * elapsed time is added to $this->exec_time_in_seconds. On any failure the id
 * is appended to the comma-separated list in $GLOBALS['WORMS_bad_id'].
 *
 * @param mixed $id WoRMS taxon identifier
 * @return string|false the trimmed <taxon>...</taxon> text, or false on failure
 */
function process($id)
{
    $timestart = time_elapsed();
    echo "\n start timer";
    $file = WORMS_TAXON_API . $id;
    echo "{$file}\n";

    $contents = Functions::get_remote_file($file, array('timeout' => 600, 'download_attempts' => 5));
    if ($contents && simplexml_load_string($contents)) {
        $pos1 = stripos($contents, "<taxon>");
        $pos2 = stripos($contents, "</taxon>");
        // stripos() yields an int offset or false. Comparing that with ""
        // gives different answers on PHP 7 and PHP 8 and, on PHP 7, rejects a
        // perfectly valid offset of 0 - so test for false explicitly.
        if ($pos1 !== false && $pos2 !== false && $pos2 > $pos1) {
            // 8 == strlen("</taxon>"), so the closing tag is kept
            $contents = trim(substr($contents, $pos1, $pos2 - $pos1 + 8));
            $elapsed_time_sec = time_elapsed() - $timestart;
            echo "\n";
            echo "elapsed time = " . $elapsed_time_sec . " seconds \n";
            $this->exec_time_in_seconds += $elapsed_time_sec;
            return $contents;
        }
    }

    if (!isset($GLOBALS['WORMS_bad_id'])) {
        $GLOBALS['WORMS_bad_id'] = "";
    }
    $GLOBALS['WORMS_bad_id'] .= $id . ",";
    return false;
}
/**
 * Scrape every "fruit facts" page and return one record per page.
 *
 * Page paths come from self::get_fruit_paths(); the quoted part of each entry
 * is appended to $this->fruit_links["domain_fruit_facts"]. From each page the
 * method takes the taxon and family (first and second "<h2><i>" headings), the
 * first src="..." value as image, the copyright line as rightsHolder, and one
 * text section per entry of $this->search_terms (heading label => record key).
 * Line breaks are replaced by spaces, values are trimmed and empty values are
 * dropped.
 *
 * @return array|null records keyed by taxon name; NULL when no paths are available
 */
private function process_fruit_facts()
{
    $fruit_paths = self::get_fruit_paths();
    if (!$fruit_paths) {
        return;
    }
    $this->search_terms = self::get_search_terms();

    $records = array();
    foreach ($fruit_paths as $path) {
        $record = array();
        if (preg_match("/\"(.*?)\"/ims", $path, $arr2)) {
            if ($arr2[1] == "ff/edible-palms.html") {
                continue; // mulitiple taxa in a page
            }
            if ($arr2[1] == "ff/apricot_low_chill.html") {
                continue; // different HTML structure
            }

            $url = $this->fruit_links["domain_fruit_facts"] . $arr2[1];
            if ($html = Functions::get_remote_file($url, array('download_wait_time' => 3000000, 'timeout' => 240, 'download_attempts' => 2, 'delay_in_minutes' => 1))) {
                $record["source_url"] = $url;

                //manual adjustments
                $html = str_replace("<b>Soils</b>", "<b>Soils:</b>", $html);
                $html = str_replace("<b>Related species</b>", "<b>Related species:</b>", $html);
                $html = str_replace("<b>Adaptation</b>", "<b>Adaptation:</b>", $html);
                $html = str_replace("<b>Pruning</b>", "<b>Pruning:</b>", $html);

                if (preg_match_all("/<h2><i>(.*?)<\\/i>/ims", $html, $arr3)) {
                    $record["taxon"] = trim(strip_tags($arr3[1][0]));
                    $record["taxon_id"] = str_replace(" ", "_", $record["taxon"]);
                    // A page may carry only one such heading; the family is
                    // then simply absent instead of raising an undefined offset.
                    if (isset($arr3[1][1])) {
                        $record["family"] = strip_tags($arr3[1][1]);
                    }
                }
                if (preg_match("/src=\"(.*?)\"/ims", $html, $arr3)) {
                    $record["image"] = $arr3[1];
                }
                if (preg_match("/&\\#169;(.*?)Questions or comments/ims", $html, $arr3)) {
                    $holder = "© " . trim($arr3[1]);
                    $holder = strip_tags($holder, "<a>");
                    $holder = str_ireplace('href="/index.html"', 'href="' . $this->site_domain . 'index.html"', $holder);
                    $record["rightsHolder"] = strip_tags($holder);
                }

                // actual text descriptions
                $end_strings = array("<p>", "<h"); // possible end strings
                foreach ($this->search_terms as $label => $field) {
                    foreach ($end_strings as $end_string) {
                        if (isset($record[$field])) {
                            break;
                        }
                        if (preg_match("/{$label}<\\/b>(.*?){$end_string}/ims", $html, $arr3)) {
                            $record[$field] = $arr3[1];
                        }
                    }
                }

                foreach ($record as $field => $text) {
                    $record[$field] = str_ireplace(array("\n"), " ", $text);
                }
                $record = array_filter(array_map('trim', $record));
            }
        }

        print "\n count: " . count($record) . "\n";
        if ($record) {
            // A record without a taxon name is still stored under the "" key,
            // but without reading an undefined index.
            $taxon_key = isset($record["taxon"]) ? $record["taxon"] : "";
            $records[$taxon_key] = $record;
        }
    } // end foreach
    return $records;
}
/**
 * Download a page, falling back to self::curl_get_file_contents() when
 * Functions::get_remote_file() yields nothing.
 *
 * @param string $url address to download
 * @return mixed the page contents, or false when both attempts fail
 */
private function get_html($url)
{
    $download_options = array(
        'download_wait_time' => 1000000,
        'timeout' => 9600,
        'download_attempts' => 2,
        'delay_in_minutes' => 2,
    );

    $html = Functions::get_remote_file($url, $download_options);
    if ($html) {
        return $html;
    }

    $html = self::curl_get_file_contents($url);
    if ($html) {
        echo "\n Got it using 'curl_get_file_contents()' \n";
        return $html;
    }

    return false;
}
<?php exit; /* AntWeb is now giving us a complete resource XML with <mediaURL> element for their image objects */ include_once dirname(__FILE__) . "/../../config/environment.php"; $mysqli =& $GLOBALS['mysqli_connection']; $file = trim(Functions::get_remote_file("http://antweb.org/getEOL.do")); //$file = trim(Functions::get_remote_file("../../temp/ants.xml")); //echo "$file"; $xml = simplexml_load_string($file); $taxon_index = 0; foreach ($xml->taxon as $taxon) { $i = 0; $label_index = 0; foreach ($taxon->dataObject as $dataObject) { $i++; $dataObject_dc = $dataObject->children("http://purl.org/dc/elements/1.1/"); if ($identifier = @$dataObject_dc->identifier) { if (preg_match("/^\\/images\\//", $identifier, $arr)) { if (preg_match("/_l_[0-9]{1,}_high\\.jpg/", $identifier)) { $label_index = $i; continue; } //echo "$identifier\n"; $dataObject->addChild("mediaURL", "http://www.antweb.org" . str_replace(" ", "%20", $identifier)); } } } if ($label_index) { $label_index -= 1; //echo "Unsetting taxon[$taxon_index]->dataObject[$label_index]\n";
/**
 * Builds $this->dataset_metadata for the Catalogue of Life source datasets.
 *
 * Runs only when "<archive_path>dataset" is a directory containing col.xml.
 * Each file in that directory is parsed for title, editors and abbreviated
 * name; the record is stored under the abbreviated name and, for files named
 * "<digits>.xml", aliased (by reference) under that numeric id.
 * Citations are then scraped from the Catalogue of Life "cite" page and
 * attached to the matching records.
 *
 * Side effects: when no citation was found for "Catalogue of Life" or
 * "FishBase" a message is echoed and logged and the script exits.
 *
 * @return void
 */
private function collect_dataset_attribution()
{
    $this->dataset_metadata = array();
    $archive_path = $this->harvest_event->resource->archive_path();
    if (is_dir($archive_path . "dataset") && file_exists($archive_path . "dataset/col.xml")) {
        foreach (glob($archive_path . "dataset/*") as $filename) {
            // Reset for every file: without this a non-numeric file (e.g. col.xml)
            // inherits the id of the previously processed file and hijacks its alias.
            $dataset_id = null;
            if (preg_match("/\\/([0-9]+)\\.xml\$/", $filename, $arr)) {
                $dataset_id = $arr[1];
            }

            $xml = simplexml_load_file($filename);
            if ($xml === false) {
                // Unparseable file: nothing to read, skip it instead of dereferencing false.
                continue;
            }

            $title = trim($xml->dataset->title);
            if (preg_match("/^(.*) in the Catalogue of Life/", $title, $arr)) {
                $title = trim($arr[1]);
            }
            // NOTE(review): search and replacement look identical here; presumably this was
            // meant to normalise a double or non-breaking space - confirm against the repository copy.
            $title = str_replace(" ", " ", $title);

            $editors = trim($xml->additionalMetadata->metadata->sourceDatabase->authorsAndEditors);
            if (preg_match("/^(.*)\\. For a full list/", $editors, $arr)) {
                $editors = trim($arr[1]);
            }
            if (preg_match("/^(.*); for detailed information/", $editors, $arr)) {
                $editors = trim($arr[1]);
            }
            $editors = str_replace(" ", " ", $editors);

            $abbreviatedName = trim($xml->additionalMetadata->metadata->sourceDatabase->abbreviatedName);
            $this->dataset_metadata[$abbreviatedName]['title'] = $title;
            $this->dataset_metadata[$abbreviatedName]['editors'] = $editors;
            $this->dataset_metadata[$abbreviatedName]['abbreviatedName'] = $abbreviatedName;
            $this->dataset_metadata[$abbreviatedName]['datasetID'] = $dataset_id;
            if ($dataset_id !== null) {
                // Same record reachable by numeric id; a reference so later citation updates show in both.
                $this->dataset_metadata[$dataset_id] =& $this->dataset_metadata[$abbreviatedName];
            }
        }

        // now go grab the citation information from the COL website
        $url = "http://www.catalogueoflife.org/col/info/cite";
        $options_for_log_harvest = array('resource_id' => $this->harvest_event->resource->id);
        $html = Functions::get_remote_file($url, $options_for_log_harvest);
        preg_match_all("/<p><strong>(.*?)<\\/strong><br\\/>(.*?)<\\/p>/ims", $html, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $dataset_name = $match[1];
            if (preg_match("/^(.*) via ITIS/", $dataset_name, $arr)) {
                $dataset_name = trim($arr[1]);
            }
            $citation = $match[2];
            if (isset($this->dataset_metadata[$dataset_name])) {
                $this->dataset_metadata[$dataset_name]['citation'] = $citation;
            } elseif ($dataset_name == "Species 2000 Common Names" && isset($this->dataset_metadata["Catalogue of Life"])) {
                // This entry on the cite page supplies the citation for the umbrella dataset.
                $this->dataset_metadata["Catalogue of Life"]['citation'] = $citation;
            }
        }

        // Both citations are treated as mandatory; abort the harvest without them.
        if (!isset($this->dataset_metadata["Catalogue of Life"]['citation']) || !isset($this->dataset_metadata["FishBase"]['citation'])) {
            echo "Tried getting attribution for Catalogue of Life datasets, but there was a problem\n";
            write_to_resource_harvesting_log("Tried getting attribution for Catalogue of Life datasets, but there was a problem");
            exit;
        }
    }
}
/**
 * Scrapes every not-yet-processed species page listed in $GLOBALS['taxon'] and stores
 * the scientific name, common names, agents, family, text sections and literature
 * block back into $GLOBALS['taxon'][<name>].
 *
 * Entries that already have a truthy 'sciname' or 'texts' are skipped. Each page is
 * tried up to four times with a 30 second pause between attempts.
 *
 * @param string|null $type       "hardwoods" and "conifers" enable page-specific markup fixes
 * @param mixed       $taxon_name ignored: the variable is overwritten by the loop key. It now has
 *                                a default so that the optional $type no longer precedes a
 *                                required parameter (deprecated since PHP 8.0).
 * @return false|null false as soon as a heading section without body text is met
 *                    (NOTE(review): this aborts all remaining taxa - confirm that is intended);
 *                    otherwise nothing
 */
function get_title_description($type = null, $taxon_name = null)
{
    foreach ($GLOBALS['taxon'] as $taxon_name => $value) {
        if (@$GLOBALS['taxon'][$taxon_name]['sciname'] || @$GLOBALS['taxon'][$taxon_name]['texts']) {
            continue;
        }
        sleep(5); //debug
        $url = $value['html'];
        if ($this->debug_info) {
            print "\n\n {$url} -- {$taxon_name}";
        }

        // download with retries
        $trials = 1;
        $success = 0;
        while ($success == 0 && $trials < 5) {
            if ($html = Functions::get_remote_file($url)) {
                $success = 1;
            } else {
                $trials++;
                print "\n Down: {$url}";
                print "\n Will wait for 30 seconds and will try again. Trial #" . $trials;
                sleep(30);
            }
        }
        if ($trials >= 5) {
            print "\n Will skip to the next species after {$trials} unsuccessful trials";
            continue;
        }

        // scientific name: the +3 heading is preferred, +2 is the fallback
        if (preg_match("/<FONT SIZE=\"\\+3\">(.*?)<\\/FONT>/ims", $html, $arr)) {
            $GLOBALS['taxon'][$taxon_name]['sciname'] = self::clean_str(strip_tags($arr[1]));
        } elseif (preg_match("/<FONT SIZE=\"\\+2\">(.*?)<\\/FONT>/ims", $html, $arr)) {
            $GLOBALS['taxon'][$taxon_name]['sciname'] = self::clean_str(strip_tags($arr[1]));
        } elseif (!isset($GLOBALS['taxon'][$taxon_name]['sciname'])) {
            // No heading matched: define the key so the reads below do not hit an undefined index.
            $GLOBALS['taxon'][$taxon_name]['sciname'] = "";
        }
        // NOTE(review): search and replacement are identical as written; presumably an
        // HTML-entity ampersand was intended as the search string - confirm.
        $GLOBALS['taxon'][$taxon_name]['sciname'] = str_ireplace('&', '&', $GLOBALS['taxon'][$taxon_name]['sciname']);
        if (preg_match("/<FONT SIZE=\"\\+4\">(.*?)<\\/FONT>/ims", $html, $arr)) {
            $GLOBALS['taxon'][$taxon_name]['comnames'][] = self::clean_str($arr[1]);
        }

        // hard-coded name replacements; common names are discarded for these two species
        if ($GLOBALS['taxon'][$taxon_name]['sciname'] == "Didymopanax morototoni (Aubl.) Decne. & Planch.") {
            $GLOBALS['taxon'][$taxon_name]['sciname'] = "Schefflera morototoni";
            $GLOBALS['taxon'][$taxon_name]['comnames'] = array();
        } elseif ($GLOBALS['taxon'][$taxon_name]['sciname'] == "Chamaecyparis nootkatensis (D. Don) Spach") {
            $GLOBALS['taxon'][$taxon_name]['sciname'] = "Cupressus nootkatensis";
            $GLOBALS['taxon'][$taxon_name]['comnames'] = array();
        }

        // manual adjustment: put an <xxx> marker in front of every heading so that
        // "heading ... <xxx>" captures exactly one section in the patterns below
        $html = str_ireplace('<H2></H2>', '', $html); //only for hardwoods
        $html = str_ireplace('<H3>', '<xxx><H3>', $html);
        $html = str_ireplace('<H2>', '<xxx><H2>', $html);
        $html = str_ireplace('<H1>', '<xxx><H1>', $html); //only for hardwoods
        $html = str_ireplace('<H4>', '<xxx><H4>', $html); //only for hardwoods
        if ($type == "hardwoods") {
            $html = str_ireplace('<P><FONT><B></B></FONT></P>', '', $html); //only for hardwoods
            $html = str_ireplace('<P><FONT><B>', '<P><B>', $html); //only for hardwoods
            $html = str_ireplace('</B></FONT></P>', '</B></P>', $html); //only for hardwoods
            $html = str_ireplace('Damaging Agents-Robusta </B>', 'Damaging Agents-</B> Robusta ', $html);
            $html = str_ireplace('Growth and Yield-Black </B>', 'Growth and Yield-</B> Black ', $html);
            $html = str_ireplace('Rooting Habit-Aigeiros-</B>', 'Rooting Habit-</B> Aigeiros- ', $html);
            $html = str_ireplace('Growth and Yield-Bitternut </B>', 'Growth and Yield-</B> Bitternut ', $html);
        } elseif ($type == "conifers") {
            $html = str_ireplace('<P><B>Vegetative Reproduction-White-cedar </B>', '<P><B>Vegetative Reproduction-</B> White-cedar ', $html); //only for conifers
            $html = str_ireplace('<P><B>Seedling Development-Germination </B>', '<P><B>Seedling Development-</B> Germination ', $html); //only for conifers
        }
        $html = str_ireplace('<P><B><FONT SIZE="+1">Native Range</FONT></B></P>', '<H3>Native Range</H3>', $html);
        $html = str_ireplace('<H2>Native Range</H2>', '<H3>Native Range</H3>', $html); // manual adjustment - hardwoods
        $html = str_ireplace(array("<H1></H1>", "<H2></H2>"), "", trim($html)); // to properly get 'brief summary'

        $texts = array();

        // brief summary - start ---------------------
        $brief_summary = "";
        if ($type == "hardwoods") {
            if (preg_match("/<\\/FONT><\\/H1>(.*?)<H/ims", $html, $match)) {
                $brief_summary = $match[1];
            }
            if ($brief_summary == "") {
                if (preg_match("/<\\/B><\\/P>(.*?)<H/ims", $html, $match)) {
                    $brief_summary = trim($match[1]);
                }
            }
        } else {
            if (preg_match("/<\\/B><\\/P>(.*?)<H/ims", $html, $match)) {
                $brief_summary = trim($match[1]);
            }
        }
        $brief_summary_with_all_tags = str_ireplace('<xxx>', '', $brief_summary);
        $brief_summary = strip_tags($brief_summary_with_all_tags, "<p><i>");
        $brief_summary = str_ireplace('ALIGN="CENTER"', '', $brief_summary);
        if ($brief_summary) {
            $texts[] = array("title" => "brief summary", "description" => $brief_summary);
        }
        // agents and family are read from the untouched (tagged) summary
        $agents_and_family_info = self::get_agents_and_family($brief_summary_with_all_tags, trim($url));
        $GLOBALS['taxon'][$taxon_name]['agents'] = $agents_and_family_info['agents'];
        $GLOBALS['taxon'][$taxon_name]['ancestry']['family'] = $agents_and_family_info['family'];
        // brief summary - end ---------------------

        // get "<H2>Special Uses</H2>" and "<H2>Genetics</H2>" independently
        $special_uses = "";
        $genetics = "";
        if (preg_match("/<H2>Special Uses<\\/H2>(.*?)<xxx>/ims", $html, $match) || preg_match("/<H1>Special Uses<\\/H1>(.*?)<xxx>/ims", $html, $match)) {
            $special_uses = $match[1];
        }
        if (preg_match("/<H2>Genetics<\\/H2>(.*?)<xxx>/ims", $html, $match) || preg_match("/<H1>Genetics<\\/H1>(.*?)<xxx>/ims", $html, $match)) {
            $genetics = trim($match[1]);
        }
        if ($genetics == "") {
            //http://www.na.fs.fed.us/spfo/pubs/silvics_manual/Volume_1/larix/occidentalis.htm
            if (preg_match("/<H2>Genetics<\\/H2>(.*?)<H2>Literature Cited/ims", $html, $match)) {
                $genetics = strip_tags(trim($match[1]), "<P><I>");
            }
        }

        if (preg_match("/<H3>Native Range<\\/H3>(.*?)<xxx>/ims", $html, $match) || preg_match("/Native Range<\\/FONT><\\/H4>(.*?)<xxx>/ims", $html, $match) || preg_match("/Range<\\/FONT><\\/H4>(.*?)<xxx>/ims", $html, $match)) {
            $native_range = $match[1];
            if (preg_match("/<IMG SRC\\=\"(.*?)\"/ims", $match[1], $map)) {
                // turn the relative map image into a URL next to the species page
                $path_parts = pathinfo($url);
                $map_url = $path_parts['dirname'] . "/" . $map[1];
                $texts[] = array("title" => "maps tab", "description" => $map_url);
                $native_range = str_ireplace($map[1], $map_url, $native_range);
            }
            $texts[] = array("title" => "Native Range", "description" => $native_range);
        }
        if (preg_match("/<H3>Climate<\\/H3>(.*?)<xxx>/ims", $html, $match) || preg_match("/<H2>Climate<\\/H2>(.*?)<xxx>/ims", $html, $match) || preg_match("/Climate<\\/FONT><\\/H4>(.*?)<xxx>/ims", $html, $match)) {
            $texts[] = array("title" => "Climate", "description" => $match[1]);
        }
        if (preg_match("/<H3>Soils and Topography<\\/H3>(.*?)<xxx>/ims", $html, $match) || preg_match("/<H2>Soils and Topography<\\/H2>(.*?)<xxx>/ims", $html, $match) || preg_match("/Soils and Topography<\\/FONT><\\/H4>(.*?)<xxx>/ims", $html, $match)) {
            $texts[] = array("title" => "Soils and Topography", "description" => $match[1]);
        }
        if (preg_match("/<H3>Associated Forest Cover<\\/H3>(.*?)<xxx>/ims", $html, $match) || preg_match("/<H2>Associated Forest Cover<\\/H2>(.*?)<xxx>/ims", $html, $match) || preg_match("/Associated Forest Cover<\\/FONT><\\/H4>(.*?)<xxx>/ims", $html, $match)) {
            $texts[] = array("title" => "Associated Forest Cover", "description" => $match[1]);
        }

        // every <H3> section: the "eee" sentinel anchors the start/end of the captured chunk
        if (preg_match_all("/<H3>(.*?)<xxx>/ims", $html, $matches)) {
            foreach ($matches[1] as $match) {
                $title = "";
                $description = "";
                if (preg_match("/eee(.*?)<\\/H3>/ims", "eee" . $match, $arr)) {
                    $title = strip_tags(trim($arr[1]));
                }
                if (preg_match("/<\\/H3>(.*?)eee/ims", $match . "eee", $arr)) {
                    $description = trim($arr[1]);
                    $texts = self::divide_whole_text_to_texts($description, $texts);
                } else {
                    echo "\n 111 walang text within texts...\n";
                    return false;
                }
                /* this is if you want to get the entire text section as 1 <dataObject>
                if($title) $texts[] = array("title" => $title, "description" => $description);
                */
            }
        }
        if ($type == "hardwoods") {
            // e.g. Acer macrophyllum
            if (preg_match_all("/<H2>(.*?)<xxx>/ims", $html, $matches)) {
                foreach ($matches[1] as $match) {
                    $title = "";
                    $description = "";
                    if (preg_match("/eee(.*?)<\\/H2>/ims", "eee" . $match, $arr)) {
                        $title = strip_tags(trim($arr[1]));
                    }
                    if (preg_match("/<\\/H2>(.*?)eee/ims", $match . "eee", $arr)) {
                        $description = trim($arr[1]);
                        $texts = self::divide_whole_text_to_texts($description, $texts);
                    } else {
                        echo "\n 222 walang text within texts...\n";
                        return false;
                    }
                }
            }
            // e.g. Acer nigrum
            if (preg_match_all("/<H4>(.*?)<xxx>/ims", $html, $matches)) {
                foreach ($matches[1] as $match) {
                    $title = "";
                    $description = "";
                    if (preg_match("/eee(.*?)<\\/H4>/ims", "eee" . $match, $arr)) {
                        $title = strip_tags(trim($arr[1]));
                    }
                    if (preg_match("/<\\/H4>(.*?)eee/ims", $match . "eee", $arr)) {
                        $description = trim($arr[1]);
                        $texts = self::divide_whole_text_to_texts($description, $texts);
                    } else {
                        echo "\n 333 walang text within texts...\n";
                        return false;
                    }
                }
            }
        }
        if ($genetics) {
            $texts[] = array("title" => "Genetics", "description" => $genetics);
        }
        if ($special_uses) {
            $texts[] = array("title" => "Special Uses", "description" => $special_uses);
        }
        $GLOBALS['taxon'][$taxon_name]['texts'] = $texts;

        // literature block: everything after the "Literature Cited" heading, whatever tag wraps it
        $html = str_ireplace("Literature Cited </H2>", "Literature Cited</H2>", $html);
        if (preg_match("/Literature Cited<\\/H2>(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        } elseif (preg_match("/Literature Cited<\\/H1>(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        } elseif (preg_match("/Literature Cited<\\/H3>(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        } elseif (preg_match("/Literature Cited<\\/H4>(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        } elseif (preg_match("/Literature Cited<\\/FONT><\\/H4>(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        } elseif (preg_match("/<\\/B>Literature Cited(.*?)<\\/BODY>/ims", $html, $match)) {
            $GLOBALS['taxon'][$taxon_name]['taxon_ref'] = $match[1];
        }
    }
}
/**
 * Fetches the association text for every entry of $GLOBALS['taxon'] that has
 * neither an 'association' nor a 'gendesc' value yet.
 *
 * For each such entry the page URL is built from $this->path and the entry's
 * 'html' value, stored back into the entry, downloaded, and the text between
 * "<big>" and "</td>" becomes the 'association'. A bracketed token inside that
 * text, when present, is stored as the family.
 *
 * @param string|null $type 'insects' removes the "/insects/" segment from the URL
 * @return void
 */
function get_title_description($type = null)
{
    $download_options = array('download_wait_time' => 1000000, 'timeout' => 600, 'download_attempts' => 5);

    foreach ($GLOBALS['taxon'] as $taxon_name => $value) {
        // if($taxon_name != "Hylaeus affinis") continue; //debug
        if (@$value['association'] != "" || @$value['gendesc'] != "") {
            continue; // already filled in
        }

        $url = $this->path . '/insects/' . $value['html'];
        if ($type == 'insects') {
            $url = str_ireplace("/insects/", "/", $url);
        }
        $GLOBALS['taxon'][$taxon_name]['html'] = $url;
        if ($this->debug_info) {
            echo "\n {$url} -- {$taxon_name}";
        }

        $html = Functions::get_remote_file($url, $download_options);
        if (!$html) {
            echo "\n\n Content partner's server is down4, {$url}\n";
            // placeholder value so the entry is not empty after a failed download
            $GLOBALS['taxon'][$taxon_name]['association'] = 'no object';
            continue;
        }

        if (!preg_match("/<big>(.*?)<\\/td>/ims", $html, $match)) {
            continue;
        }

        // clean, keep only <BR>/<I>, clean again, then convert to UTF-8
        $desc = utf8_encode(self::clean_str(strip_tags(self::clean_str($match[1]), "<BR><I>")));
        $GLOBALS['taxon'][$taxon_name]['association'] = $desc;
        $GLOBALS['taxon'][$taxon_name]['association_title'] = "Plant-Feeding Insects of <i>{$taxon_name}</i> in Illinois";

        if (preg_match("/\\[(.*?)\\]/ims", $desc, $string_match)) {
            $GLOBALS['taxon'][$taxon_name]['ancestry']['family'] = $string_match[1];
        }
    }
}
/**
 * Scrapes one map page and returns what could be extracted from it.
 *
 * Keys that may be present in the result: 'vernacular' (first <b>), 'sciname'
 * (first <i>), 'map' (image URL prefixed with $this->sina_domain),
 * 'computer_gen_map', 'caption', 'as_of' and 'link_back' (always the page URL).
 *
 * @param string $url address of the map page
 * @return array the record; an empty array when no map image could be located.
 *               When the download itself fails only 'link_back' is set.
 */
private function get_map_data($url)
{
    $rec = array();
    if ($html = Functions::get_remote_file($url, array('timeout' => 999999, 'download_attempts' => 2, 'delay_in_minutes' => 2))) {
        // manual adjustment: this page needs an extra closing </div> before each centered <div>
        if ($url == "http://entnemdept.ufl.edu/walker/buzz/334m.htm") {
            $html = str_ireplace('<div align="center">', '</div><div align="center">', $html);
        }

        if (preg_match("/<b>(.*?)<\\/b>/ims", $html, $arr)) {
            $rec["vernacular"] = strip_tags($arr[1]);
        } else {
            echo "\n investigate no vernacular [{$url}]";
        }
        if (preg_match("/<i>(.*?)<\\/i>/ims", $html, $arr)) {
            $rec["sciname"] = strip_tags($arr[1]);
        } else {
            echo "\n investigate no sciname [{$url}]";
        }

        if (preg_match_all("/<div align=\"center\">(.*?)<\\/div>/ims", $html, $arr)) {
            $temp = $arr[1];
            // Start from an empty caption so it is defined even when both captured
            // blocks are empty; previously the variable was read while undefined.
            $caption = "";
            if (@$temp[1]) {
                $caption = $temp[1]; // the second centered block is preferred
            } elseif (@$temp[0]) {
                $caption = $temp[0];
            }
            //http://entnemdept.ufl.edu/walker/buzz/302m.htm
            if (preg_match("/<img src=\"(.*?)\"/ims", $caption, $arr)) {
                $map_image = $arr[1];
                $rec["map"] = $this->sina_domain . $map_image;
            } else {
                if ($map_image = self::get_map_image_retry($html)) {
                    $rec["map"] = $this->sina_domain . $map_image;
                    echo "\n retry successfull\n";
                } else {
                    echo "\n investigate no map image [{$url}]\n";
                    echo "\n investigate retry still no map 1 [{$url}]\n";
                    return array();
                }
            }

            // reduce the caption to text plus <br>/<a>, on a single line, without a leading <br>
            $caption = trim(strip_tags($caption, "<br><a>"));
            $caption = str_ireplace(array("\n", chr(13), chr(10), "\t"), "", $caption);
            if (substr($caption, 0, 4) == "<br>") {
                $caption = trim(substr($caption, 4, strlen($caption)));
            }
            $caption = str_ireplace(array("<br> "), "<br>", $caption);
            $caption = str_ireplace('"> Computer-generated', '">Computer-generated', $caption);

            if (preg_match("/<a href=\"(.*?)\">Computer-generated/ims", $caption, $arr)) {
                $rec["computer_gen_map"] = $this->sina_domain . $arr[1];
            } elseif (preg_match("/<a href=\"(.*?)\"> Computer-generated/ims", $caption, $arr)) {
                $rec["computer_gen_map"] = $this->sina_domain . $arr[1];
            } elseif (preg_match("/<a href=\"(.*?)\">County-level distribution map/ims", $caption, $arr)) {
                $rec["computer_gen_map"] = $this->sina_domain . $arr[1];
            }
            // else echo "\n investigate no computer gen map [$url]\n"; acceptable case

            //further check for 'computer_gen_map' e.g. http://entnemdept.ufl.edu/walker/buzz/123m.htm or 318m.htm
            // The non-greedy capture above can swallow earlier anchors; keep peeling off
            // leading '<a href="' fragments (at most 11 passes) until only the last link is left.
            if (is_numeric(stripos(@$rec["computer_gen_map"], "href="))) {
                for ($x = 0; $x <= 10; $x++) {
                    if (preg_match("/<a href=\"(.*?)xxx/ims", $rec["computer_gen_map"] . "xxx", $arr)) {
                        $rec["computer_gen_map"] = $this->sina_domain . $arr[1];
                    } else {
                        break;
                    }
                }
            }

            $caption = str_ireplace('href="', 'href="' . $this->sina_domain, $caption);
            $caption = str_ireplace('Computer-generated distribution map', '<br>See also this computer-generated U.S. distribution map', $caption);
            $rec["caption"] = $caption;
            echo "\n caption: [{$caption}]\n";
            $rec["as_of"] = self::get_as_of_date($caption);
        } else {
            // e.g. http://entnemdept.ufl.edu/walker/buzz/401m.htm
            echo "\n investigate no <div> [{$url}]\n";
            if ($map_image = self::get_map_image_retry($html)) {
                $rec["map"] = $this->sina_domain . $map_image;
                // caption fallback: first paragraph up to its first full stop
                if (preg_match("/<p>(.*?)\\./ims", $html, $arr)) {
                    $caption = strip_tags($arr[1]) . ".";
                    $rec["caption"] = $caption;
                    $rec["as_of"] = self::get_as_of_date($caption);
                }
                echo "\n retry successfull\n";
            } else {
                echo "\n investigate retry still no map 2 [{$url}]\n";
                return array();
            }
        }
    } else {
        echo "\n investigate 03 [{$url}]";
    }
    $rec["link_back"] = $url;
    return $rec;
}
/**
 * Follows one species link of a listing page and collects the sub-pages linked from
 * the species' main menu: images, classification, strict synonymy, references,
 * common names, skeletons, biological associations and nematocysts.
 *
 * Relies on the globals $wrap (printed in front of every progress message) and
 * $site_url (prefix for every relative link). parse_html() is presumably a
 * "text between markers" helper - it is defined elsewhere; verify there.
 *
 * @param string $str HTML fragment containing a "speciesdetail.cfm?" or
 *                    "speciesdetail_for_nosyn.cfm?" anchor
 * @return array list of, in order: taxon name, main menu URL, classification,
 *               images, skeleton HTML, skeleton URL, biological association HTML,
 *               biological association URL, common names, references,
 *               nematocyst HTML, nematocyst URL
 */
function parse_contents($str)
{
    global $wrap;
    global $site_url;

    /* it can be:
       <a href="speciesdetail.cfm?genus=Abyssopathes&subgenus=&species=lyra&subspecies=&synseniorid=9266&validspecies=Abyssopathes%20lyra&authorship=%28Brook%2C%201889%29">Abyssopathes lyra (Brook, 1889)</a>
       or
       <a href="speciesdetail_for_nosyn.cfm?species=dentata&genus=Sandalolitha&subgenus=&subspecies=">Sandalolitha dentata Quelch, 1884</a>
    */
    $beg = 'speciesdetail.cfm?';
    $end1 = '</a>';
    $temp = trim(parse_html($str, $beg, $end1, $end1, $end1, $end1, ""));
    if ($temp == '') {
        $beg = 'speciesdetail_for_nosyn.cfm?';
        $end1 = '</a>';
        $temp = trim(parse_html($str, $beg, $end1, $end1, $end1, $end1, ""));
    }
    //anemone2/speciesdetail_for_nosyn.cfm?spe
    // rebuild a complete anchor so URL and label can be cut out of it below
    $temp = '<a href="' . $site_url . '' . $beg . $temp . "</a>";

    //get url_for_main_menu
    $beg = '="';
    $end1 = '">';
    $url_for_main_menu = trim(parse_html($temp, $beg, $end1, $end1, $end1, $end1, ""));
    //print"$wrap [<a href='$url_for_main_menu'>url_for_main_menu</a>]";
    //end url_for_main_menu

    //get sciname
    $beg = '">';
    $end1 = '</a>';
    $taxa = trim(parse_html($temp, $beg, $end1, $end1, $end1, $end1, ""));
    print "{$wrap} taxa[{$taxa}]";
    //end sciname

    $main_menu = Functions::get_remote_file($url_for_main_menu);

    //get url for images page
    $url_for_images_page = "";
    //"images.cfm?&genus=Abyssopathes&subgenus=&species=lyra&subspecies=&seniorid=9266&validspecies=Abyssopathes%20lyra&authorship=%28Brook%2C%201889%29">Images</a>
    $beg = 'images.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    $arr_images = array();
    if ($temp != "") {
        $url_for_images_page = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_images_page'>images</a>]";
        $arr_images = get_images($url_for_images_page);
    } else {
        print "{$wrap} no images";
    }
    //end url for images page

    //get url for classification
    $url_for_classification = "";
    // Defined up front like the other result arrays; it used to be returned
    // undefined when the main menu had no classification link.
    $arr_classification = array();
    //"showclassification2.cfm?synseniorid=2914&genus=Aiptasiogeton&subgenus=&species=eruptaurantia&subspecies=&origgenus=Actinothoe&origspecies=eruptaurantia&origsubspecies=&origsubgenus=&&validspecies=Aiptasiogeton%20eruptaurantia&authorship=%28Field%2C%201949%29">Classification</a>
    $beg = 'showclassification2.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    if ($temp == "") {
        //http://hercules.kgs.ku.edu/hexacoral/anemone2/classification_path_no_syn.cfm?genus=Astr%C3%A6a&subgenus=&species=abdita&subspecies=
        $beg = 'classification_path_no_syn.cfm';
        $end1 = '">';
        $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    }
    if ($temp != "") {
        $url_for_classification = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_classification'>classification</a>]";
        $arr_classification = get_tabular_data($url_for_classification, "classification");
        if ($arr_classification) {
            $arr_classification = parse_classification($arr_classification);
        }
    } else {
        print "{$wrap} no classification";
    }
    //end url for classification

    //get url for strict_synonymy
    $url_for_strict_synonymy = "";
    //"synonymy_strict.cfm?seniorid=2914&validspecies=Aiptasiogeton%20eruptaurantia&authorship=%28Field%2C%201949%29">Strict synonymy</a>
    $beg = 'synonymy_strict.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    if ($temp != "") {
        $url_for_strict_synonymy = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_strict_synonymy'>strict_synonymy</a>]";
        // NOTE(review): the result is fetched but is not part of the returned list.
        $arr_synonyms = get_tabular_data($url_for_strict_synonymy, "synonyms");
    } else {
        print "{$wrap} no strict_synonymy";
    }
    //end url for strict_synonymy

    //get url for references
    $url_for_references = "";
    //"all_mentions_of_names2.cfm?species...
    $beg = 'all_mentions_of_names.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    if ($temp == "") {
        $beg = 'all_mentions_of_names2.cfm';
        $end1 = '">';
        $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    }
    $arr_references = array();
    if ($temp != "") {
        $url_for_references = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_references'>references</a>]";
        $arr_references = get_tabular_data($url_for_references, "references");
        //start process: join each row with '.', make detail links absolute, de-duplicate via array keys
        $arr = array();
        foreach ($arr_references as $value) {
            $temp = "";
            foreach ($value as $item) {
                $temp .= "." . $item;
            }
            $temp = trim(substr($temp, 1, strlen($temp))); //to remove the '.' on the first char
            //<a href="reference_detail.cfm?ref_number=58&type=Article">
            $temp = str_ireplace("reference_detail.cfm", $site_url . "reference_detail.cfm", $temp);
            //if we want to remove the anchor
            //$temp = get_str_from_anchor_tag($temp);
            $arr["{$temp}"] = 1;
        }
        $arr_references = array_keys($arr);
    } else {
        print "{$wrap} no references";
    }
    //end url for references

    //get url for common_names
    $url_for_common_names = "";
    //"common.cfm?seniorid=2914&validspecies=Aiptasiogeton%20eruptaurantia&authorship=%28Field%2C%201949%29">Strict synonymy</a>
    $beg = 'common.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    $arr_common_names = array();
    if ($temp != "") {
        $url_for_common_names = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_common_names'>common_names</a>]";
        $arr_common_names = get_tabular_data($url_for_common_names, "common_names");
        //start process: first column of each row, anchor removed, de-duplicated via array keys
        $arr = array();
        foreach ($arr_common_names as $value) {
            //$temp = strtolower($value[0]); //not a good idea especially for special chars
            $temp = $value[0];
            $temp = trim(get_str_from_anchor_tag($temp));
            //print"[$temp]";
            $arr["{$temp}"] = 1;
        }
        $arr_common_names = array_keys($arr);
    } else {
        print "{$wrap} no common_names";
    }
    //end url for common_names

    //get url for skeletons
    //e.g. for species (Favites abdita) with skeleton
    $url_for_skeletons = "";
    //http://hercules.kgs.ku.edu/hexacoral/anemone2/skeleton.cfm?genus=Favites&subgenus=&species=abdita&subspecies=
    $beg = 'skeleton.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    $html_skeletons = "";
    if ($temp != "") {
        $url_for_skeletons = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_skeletons'>skeletons</a>]";
        $arr_skeletons = get_tabular_data($url_for_skeletons, "skeletons");
        if ($arr_skeletons) {
            $arr_fields = array("Author", "Skeleton?", "Mineral or Organic?", "Mineral", "Percent Magnesium");
            $html_skeletons = arr2html($arr_skeletons, $arr_fields, $url_for_main_menu);
            $html_skeletons = "<div style='font-size : small;'>{$html_skeletons}</div>";
        }
    } else {
        print "{$wrap} no skeletons";
    }
    //end url for skeleton

    //get url for biological_associations
    $url_for_biological_associations = "";
    $beg = 'symbiont_info.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    $html_biological_associations = "";
    if ($temp != "") {
        $url_for_biological_associations = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_biological_associations'>biological_associations</a>]";
        $arr_biological_associations = get_tabular_data($url_for_biological_associations, "biological_associations");
        $arr_fields = array("Algal symbionts");
        $html_biological_associations = arr2html($arr_biological_associations, $arr_fields, $url_for_main_menu);
        $html_biological_associations = "<div style='font-size : small;'>{$html_biological_associations}</div>";
    } else {
        print "{$wrap} no biological_associations";
    }
    //end url for biological_associations

    //get url for nematocysts
    $url_for_nematocysts = "";
    $beg = 'cnidae_information.cfm';
    $end1 = '">';
    $temp = trim(parse_html($main_menu, $beg, $end1, $end1, $end1, $end1, ""));
    $html_nematocysts = "";
    if ($temp != "") {
        $url_for_nematocysts = $site_url . $beg . $temp;
        //print"$wrap [<a href='$url_for_nematocysts'>nematocysts</a>]";
        $arr_nematocysts = get_tabular_data($url_for_nematocysts, "nematocysts");
        $arr_fields = array("Location", "Image", "Cnidae Type", "Range of <br> Lengths (m)", " ", "Range of <br >Widths (m)", "n", "N", "State");
        $html_nematocysts = arr2html($arr_nematocysts, $arr_fields, $url_for_main_menu);
        $html_nematocysts = "<div style='font-size : small;'>{$html_nematocysts}</div>";
        //to have the 2nd row have colspan=9
        $html_nematocysts = str_ireplace("</th></tr><tr><td>", "</th></tr><tr><td colspan='9'>", $html_nematocysts);
    } else {
        print "{$wrap} no nematocysts";
    }
    //end url for nematocysts

    // Specimen harvesting ('all_specimens_xml.cfm') is disabled. The retired code fetched the
    // "specimens" table and kept the de-duplicated values of column 5 of every row.
    //print"<hr>$main_menu";

    //========================================================================================
    return array(
        $taxa,
        $url_for_main_menu,
        $arr_classification,
        $arr_images,
        $html_skeletons,
        $url_for_skeletons,
        $html_biological_associations,
        $url_for_biological_associations,
        $arr_common_names,
        $arr_references,
        $html_nematocysts,
        $url_for_nematocysts
    );
}
foreach ($resource->attributes() as $a => $b) { $attributes[$a] = $b; } $file_names[trim($attributes["name"])] = 1; } } $start = false; $i = 0; krsort($file_names); foreach ($file_names as $file_name => $v) { $i++; //if($file_name!="5959_tx.xml") continue; //if($file_name!="2006_Huveneers_gg1_tx.xml") continue; echo "{$file_name}<br>\n"; $url = $prefix . $file_name; $file_contents = Functions::get_remote_file($url); if (!$file_contents) { echo "downloading failed\n"; } $file_contents = str_replace("<xhtml:p xmlns:xhtml=\"http://www.w3.org/1999/xhtml\">", htmlspecialchars("<p>"), $file_contents); $file_contents = str_replace("</xhtml:p>", htmlspecialchars("</p>"), $file_contents); if (!($OUT = fopen($download_cache_path, "w+"))) { debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $download_cache_path); return; } fwrite($OUT, $file_contents); fclose($OUT); if (filesize($download_cache_path)) { clearstatcache(); echo "{$file_name} - " . filesize($download_cache_path) . "<br>\n"; echo "<hr>Parsing Document {$file_name}<hr>\n";
/**
 * Downloads a species profile page and renders its "Current Listing Status
 * Summary" table as a small HTML snippet.
 *
 * @param mixed $taxon_id identifier appended to self::SPECIES_PROFILE_PAGE
 * @return string|null the snippet (one Status / Date Listed / Lead Region / Where
 *                     Listed group per table row); nothing when the page is
 *                     unavailable, has no summary table, or yields no rows
 */
private function get_species_info_from_site($taxon_id)
{
    $download_options = array('download_wait_time' => 5000000, 'timeout' => 20000, 'download_attempts' => 2);
    $page = Functions::get_remote_file(self::SPECIES_PROFILE_PAGE . $taxon_id, $download_options);
    if (!$page) {
        echo "\n investigate taxon page down: [{$taxon_id}]\n";
        return;
    }

    if (!preg_match("/Current Listing Status Summary<\\/caption>(.*?)<\\/table>/ims", $page, $summary)) {
        echo "\n No Listing Status Summary - {$taxon_id} \n";
        return;
    }

    // Odd and even rows carry different class names; unify them so one pattern finds both.
    $table = str_ireplace(array("displaytagOddRow", "displaytagEvenRow"), "displaytagRow", trim($summary[1]));
    if (!preg_match_all("/<tr class\\=\"displaytagRow\">(.*?)<\\/tr>/ims", $table, $row_matches)) {
        return;
    }

    $desc = "";
    foreach ($row_matches[1] as $row) {
        echo "\n ============";
        if (!preg_match_all("/<td>(.*?)<\\/td>/ims", $row, $cell_matches)) {
            continue;
        }
        $cells = $cell_matches[1];

        // The status cell may wrap its text in a displayListingStatus("...") call; unwrap it.
        $status = $cells[0];
        if (preg_match("/displayListingStatus\\(\"(.*?)\"/ims", $status, $status_match)) {
            $status = $status_match[1];
        }

        $desc .= "Status: {$status}<br>"
            . "Date Listed: {$cells[1]}<br>"
            . "Lead Region: " . strip_tags($cells[2]) . "<br>"
            . "Where Listed: {$cells[3]}<br><br>";
    }

    if ($desc) {
        return "<b>Current Listing Status Summary</b><br><br>" . $desc . "<br>";
    }
}
/**
 * Downloads a source page and extracts the reference text and its link.
 *
 * The reference is whatever self::get_string_between() returns for the
 * getActiveText()/<nonexplicit> delimiters; the link is the href of the last
 * "<a" fragment found before the first closing "</a>" of that same region.
 *
 * @param string $dc_source address of the page to read
 * @return array two elements: the reference text and the URL
 */
public static function get_ref_from_site($dc_source)
{
    $page = Functions::get_remote_file($dc_source);

    // Both extractions start at the same (regex-escaped) marker.
    $opening = '\\"getActiveText\\(\\)\\"><nonexplicit>';
    //$beg='"getActiveText()"><nonexplicit>';$end='</nonexplicit>';
    $ref = self::get_string_between($opening, '<\\/nonexplicit>', $page);

    //$beg='"getActiveText()"><nonexplicit>';$end='</a>';
    // "xxx" is a sentinel appended so the next extraction has an end marker.
    $fragment = self::get_string_between($opening, '<\\/a>', $page) . "xxx";

    //$beg='<a';$end='xxx';
    $anchor = "<a" . self::get_string_between('<a', 'xxx', $fragment);

    return array($ref, self::get_href_from_anchor_tag($anchor));
}