/**
 * Looks up the wiki content of each record's page and archives pages that list taxa.
 *
 * For every record, the wiki API is queried for the revision content of the page named by
 * $rec->title. Content that parse_wiki_content() can parse and that carries a
 * 'Taxa Found in Page' => 'text' entry is passed to create_archive(); other parsed pages
 * are reported on stdout.
 *
 * @param array $recs records exposing a ->title property
 * @return void
 */
private function process_pages($recs)
{
    foreach ($recs as $rec) {
        // NOTE(review): debug-only filter left active -- only title 16059324 is processed.
        // Other titles used while debugging: 42194843, 42194845 (without licensor),
        // 33870179 (with copyrightstatus), 13128418 and 30413122 (with licensor).
        if ($rec->title != "16059324") {
            continue;
        }

        echo "\n" . $rec->title;
        $url = $this->wikipedia_api . "?action=query&titles=" . urlencode($rec->title) . "&format=json&prop=revisions&rvprop=content";
        $json = Functions::lookup_with_cache($url, array('expire_seconds' => true)); //this expire_seconds should always be true
        $arr = json_decode($json, true);

        // A failed download or malformed JSON yields no page list; skip instead of iterating a non-array.
        if (!isset($arr['query']['pages']) || !is_array($arr['query']['pages'])) {
            continue;
        }

        foreach ($arr['query']['pages'] as $page) {
            $val = isset($page['revisions'][0]['*']) ? $page['revisions'][0]['*'] : null;
            if (!$val) {
                continue;
            }
            $data = self::parse_wiki_content($val);
            if (!$data) {
                continue;
            }
            if (isset($data['Taxa Found in Page']['text'])) {
                self::create_archive($data);
            } else {
                $page_id = isset($data['Page Summary']['PageID']) ? $data['Page Summary']['PageID'] : '';
                echo "\n[no taxa found for wiki: " . $page_id . "]\n";
            }
        }
    }
}
/**
 * Downloads an XML document and extracts source, citation, identifier and agents from it.
 *
 * The raw XML is still echoed (as before) for inspection. The collected metadata, which the
 * previous version built and then discarded, is now returned as well.
 *
 * @param string $url location of the XML document
 * @return array|null metadata keyed by 'source', 'citation', 'identifier', 'data_type',
 *                    'mime_type', 'license' and 'agents' (list of [name, role] pairs);
 *                    null when the document cannot be downloaded or parsed
 */
public static function get_metadata($url)
{
    $xml = Functions::lookup_with_cache($url, array('validation_regex' => 'xmlns:'));
    if (!$xml) {
        return null;
    }

    // Compare against false explicitly: a parsed element without children is also falsy.
    $simple_xml = simplexml_load_string($xml);
    if ($simple_xml === false) {
        return null;
    }

    $params = array();
    $dcterms = $simple_xml->children("http://dublincore.org/documents/dcmi-terms/");
    $params['source'] = (string) $dcterms->identifier;

    $data_object = $simple_xml->dataObject;
    $dcterms = $data_object->children("http://dublincore.org/documents/dcmi-terms/");
    $params['citation'] = (string) $dcterms->bibliographicCitation;
    $params['identifier'] = (string) $dcterms->identifier;
    $params['data_type'] = "http://purl.org/dc/dcmitype/Text";
    $params['mime_type'] = "text/html";
    $params['license'] = "not applicable";

    $params['agents'] = array();
    foreach ($data_object->agent as $agent) {
        $attr = $agent->attributes();
        $agent_role = isset($attr['role']) ? (string) $attr['role'] : '';
        $params['agents'][] = array((string) $agent, $agent_role);
    }

    print_r($xml);
    // print_r($params);
    echo "\n\n\n";

    return $params;
}
/**
 * Scrapes the image list page for the gallery links of the allowed contributors.
 *
 * Anchors inside the <ul id="scbar"> element are kept when they mention one of the allowed
 * contributor names (case-insensitive). For each kept anchor the first double-quoted value
 * is taken as the path and the text after </i> as the contributor name.
 *
 * @return array|false map of contributor name => $this->domain-prefixed path, or false when
 *                     the page, the list or its anchors cannot be found
 */
private function get_contributors()
{
    $allowed = array("Robertson Ross", "Robertson &", "Bryant Kevin", "Cox Carol & Bob", "Garin James");

    $html = Functions::lookup_with_cache($this->image_list_page, $this->download_options);
    if (!$html) {
        return false;
    }
    if (!preg_match("/<ul id=\"scbar\"(.*?)<\\/ul>/ims", $html, $list)) {
        return false;
    }
    if (!preg_match_all("/<a href=(.*?)<\\/a>/ims", $list[1], $anchors)) {
        return false;
    }

    // Keyed by anchor text so duplicate anchors collapse into one entry.
    $lines = array();
    foreach ($anchors[1] as $line) {
        foreach ($allowed as $str) {
            if (stripos($line, $str) !== false) {
                $lines[$line] = '';
                break;
            }
        }
    }

    $contributors = array();
    foreach (array_keys($lines) as $line) {
        // Skip anchors without a quoted path; previously the path of the preceding anchor
        // (or an undefined variable) was silently reused for them.
        if (!preg_match("/\"(.*?)\"/ims", $line, $match)) {
            continue;
        }
        $path = $this->domain . trim($match[1]);
        if (preg_match("/<\\/i>(.*?)xxx/ims", $line . "xxx", $match)) {
            $contributors[trim($match[1])] = $path;
        }
    }

    print_r($contributors);
    return $contributors;
}
/**
 * Pages through the specimen service and finalizes the archive.
 *
 * Requests are issued in pages of $limit records until a page returns fewer than $limit
 * records. A page that cannot be downloaded or decoded counts as zero records and therefore
 * ends the loop; previously the record count of the preceding page (or an undefined
 * variable) was used in that case.
 *
 * NOTE(review): processing of the fetched records is commented out and the start offset is
 * hard-coded to 2552000 (original: offset=0), so as written this only warms the cache from
 * that offset onward. Confirm before a production run.
 *
 * @return void
 */
function get_all_taxa()
{
    $this->uris = self::get_uris();
    // print_r($this->uris); exit;

    // offset = 927 k while caching...
    $limit = 500;
    $offset = 2552000; //orig limit=500 offset=0

    while (true) {
        $returned = 0;
        $url = $this->service['specimen'] . "&limit={$limit}&offset={$offset}";
        if ($contents = Functions::lookup_with_cache($url, $this->download_options)) {
            $json = json_decode($contents);
            // print_r($json); exit();
            if (isset($json->result->records) && is_array($json->result->records)) {
                $returned = count($json->result->records);
            }
            echo "\ncount: [{$returned}]\n";
            // self::process_specimen_records($json);
            // break;
        }
        $offset += $limit;
        if ($returned < $limit) {
            break;
        }
    }
    // exit;
    $this->archive_builder->finalize(TRUE);
}
/**
 * Collects the GGBN records of one kingdom across all result pages.
 *
 * The first result page states how many entries were found; get_number_of_pages() turns
 * that figure into a page count. Every page is handed to process_html() and the merged
 * records are passed to create_instances_from_taxon_object(), which is called even when
 * nothing was found.
 *
 * @param string $kingdom kingdom name appended to the GGBN kingdom service URL
 * @return void
 */
private function query_kingdom_GGBN_info($kingdom)
{
    $records = array();
    $rec["source"] = $this->kingdom_service_ggbn . $kingdom;
    $rec["taxon_id"] = $kingdom;

    $html = Functions::lookup_with_cache($rec["source"], $this->download_options);
    $entries_found = false;
    if ($html) {
        $entries_found = preg_match("/<b>(.*?) entries found/ims", $html, $arr)
            || preg_match("/<b>(.*?) entry found/ims", $html, $arr);
    }

    if ($entries_found) {
        print "\n {$kingdom}: " . $arr[1] . "\n";
        $pages = self::get_number_of_pages($arr[1]);
        print "\n pages to access: [{$pages}]\n";

        $i = 1;
        while ($i <= $pages) {
            echo "\n {$i} of {$pages} ";
            // Page 1 is already loaded; later pages are fetched on demand.
            if ($i > 1) {
                $rec["source"] = $this->kingdom_service_ggbn . $kingdom . "&page={$i}";
                $html = Functions::lookup_with_cache($rec["source"], $this->download_options);
            }
            $temp = self::process_html($html, $rec["source"]);
            if ($temp) {
                $records = array_merge($records, $temp);
            }
            $i++;
        }
    }

    self::create_instances_from_taxon_object($records);
}
/**
 * Downloads a page, normalizes its markup and parses it.
 *
 * @param string $url  page to download
 * @param mixed  $type passed through unchanged to parse_page()
 * @return mixed the result of parse_page(), or false when the page cannot be downloaded
 */
private function process_html($url, $type)
{
    $html = Functions::lookup_with_cache($url, $this->download_options);
    if (!$html) {
        return false;
    }
    // Centered cells are rewritten to plain <td> before parsing.
    $normalized = str_ireplace('<td align=center>', '<td>', self::clean_html($html));
    return self::parse_page($normalized, $type);
}
/**
 * Converts a Flickr photoset into archive entries.
 *
 * The first page of the photoset listing supplies the total number of photos; every page is
 * then fetched and each photo whose title yields a scientific name, and which is public,
 * downloadable and carries a license listed in $GLOBALS["flickr_licenses"], is turned into
 * data objects that are passed to create_archive(). The archive is finalized at the end
 * whether or not the listing could be read.
 *
 * @param array $params expects 'photoset_id' and 'flickr_user_id'
 * @return void
 */
function convert_to_dwca($params)
{
    require_library('FlickrAPI');
    $auth_token = NULL;
    // if(FlickrAPI::valid_auth_token(FLICKR_AUTH_TOKEN)) $auth_token = FLICKR_AUTH_TOKEN;

    $per_page = 500;
    $url = $this->service['photosets'] . '&photoset_id=' . $params['photoset_id'] . '&user_id=' . $params['flickr_user_id'] . '&per_page=' . $per_page;

    $page_no = 1;
    $listing = Functions::lookup_with_cache($url . '&page=' . $page_no, $this->download_options);
    if ($listing) {
        $summary = json_decode(str_replace("\\'", "'", $listing));
        $total_pages = ceil($summary->photoset->total / $per_page);
        echo "\ntotal_pages = {$total_pages}\n";

        for ($page_no = 1; $page_no <= $total_pages; $page_no++) {
            $json = Functions::lookup_with_cache($url . '&page=' . $page_no, $this->download_options);
            if (!$json) {
                continue;
            }
            $obj = json_decode(str_replace("\\'", "'", $json));
            $total_photos = count($obj->photoset->photo);
            $k = 0;
            foreach ($obj->photoset->photo as $rec) {
                $k++;
                echo "\n{$page_no} of {$total_pages} - {$k} of {$total_photos}";

                $sciname = self::get_sciname_from_title($rec->title);
                if (!$sciname) {
                    continue;
                }

                $photo_response = FlickrAPI::photos_get_info($rec->id, $rec->secret, $auth_token, $this->download_options);
                $photo = @$photo_response->photo;
                // Only public, downloadable photos are considered.
                if (!$photo || $photo->visibility->ispublic != 1 || $photo->usage->candownload != 1) {
                    continue;
                }
                if (@(!$GLOBALS["flickr_licenses"][$photo->license])) {
                    continue;
                }

                foreach (FlickrAPI::get_data_objects($photo, $params['flickr_user_id']) as $do) {
                    self::create_archive($sciname, $do);
                }
            }
            // break; //debug
        }
    }

    $this->archive_builder->finalize(TRUE);
}
/**
 * Extracts life form, authorship citation and fire-regime habitats from a page.
 *
 * The fire regime appendix is located with the first of several anchor-name variants that
 * matches. Within it, a table row with a single cell sets the current index; rows with more
 * cells contribute the term derived from their first cell to $final['habitat'][index],
 * without duplicates. The first row and header-like rows are ignored.
 *
 * @param string $url page to download
 * @return array empty when the page cannot be downloaded; otherwise 'source', 'life_form',
 *               'authorship_citation' and, when found, 'habitat'
 */
private function parse_html($url)
{
    $final = array();
    $html = Functions::lookup_with_cache($url, $this->download_options);
    if (!$html) {
        return $final;
    }

    $html = str_ireplace("APPENDIX: FIRE REGEIME TABLE", "APPENDIX: FIRE REGIME TABLE", $html);
    $final['source'] = $url;
    $final['life_form'] = self::get_Raunkiaer_life_form($html, $url);
    $final['authorship_citation'] = self::get_authorship_citation($html);

    // Anchor variants are tried in this order; the first match wins.
    $appendix_patterns = array(
        "/<a name=\"AppendixFireRegimeTable\"(.*?)<a name=\"AppendixB\">/ims",
        "/<a name='AppendixFireRegimeTable'(.*?)<a name='AppendixB'>/ims",
        "/<a name='APPENDIX: FIRE REGIME TABLE'(.*?)<a name='REFERENCES'>/ims",
        "/<a name=\"APPENDIX: FIRE REGIME TABLE\"(.*?)<a name=\"REFERENCES\">/ims",
        "/<a name=\"APPENDIX: FIRE REGIME TABLE\"(.*?)<a name='REFERENCES'>/ims",
        "/<a name=\"AppendixFireRegimeTable\"(.*?)<a name='REFERENCES'>/ims",
        "/<a name='AppendixFireRegimeTable'(.*?)<a name='REFERENCES'>/ims",
    );
    $appendix = null;
    foreach ($appendix_patterns as $pattern) {
        if (preg_match($pattern, $html, $arr)) {
            $appendix = $arr[1];
            break;
        }
    }
    if ($appendix === null) {
        // APPENDIX: FIRE REGIME TABLE not found
        return $final;
    }

    if (!preg_match_all("/<tr>(.*?)<\\/tr>/ims", $appendix, $arr2)) {
        // No <tr>s
        return $final;
    }

    $exclude = array(">Vegetation Community", ">Percent of fires", ">Surface or low", ">Mixed<", "vegetation communities");
    foreach ($arr2[1] as $position => $tr) {
        if ($position == 0) {
            continue; //exclude first <tr>
        }
        if (!preg_match_all("/<td(.*?)<\\/td>/ims", $tr, $arr3)) {
            continue;
        }
        $cells = $arr3[1];
        if (self::needle_occurs_in_this_haystack($cells[0] . "<", $exclude)) {
            continue;
        }

        // A single-cell row starts a new group; its text becomes the index for later rows.
        if (count($cells) == 1) {
            $index = self::clean_html(strip_tags("<td" . $cells[0]));
            continue;
        }
        if (!isset($index)) {
            continue;
        }

        $to_be_added = self::get_term_to_be_added($cells[0]);
        if (!$to_be_added) {
            continue;
        }
        /* a good way to catch/debug:
        if($to_be_added == "Pacific Northwest") { print_r($cells); echo "\nindex[$index]\n"; }
        */
        if (isset($final['habitat'][$index]) && in_array($to_be_added, @$final['habitat'][$index])) {
            continue; // already recorded under this index
        }
        @($final['habitat'][$index][] = $to_be_added);
    }

    return $final;
}
/**
 * Writes one HTML file of article links per topic.
 *
 * For each topic the search results are paged through and every <h1> entry is written to
 * "<html_dir><topic with underscores>.html". Link texts of fewer than three words get the
 * topic appended in parentheses, and repeated link texts get a running number.
 *
 * The topic is now URL-encoded before it is placed in the query string; several topics
 * contain '&' and spaces, which previously cut the q parameter short.
 *
 * @return void exits the script when an output file cannot be opened
 */
function start()
{
    $topics = array("About the EoE", "Agricultural & Resource Economics", "Biodiversity", "Biology", "Climate Change", "Ecology", "Environmental & Earth Science", "Energy", "Environmental Law & Policy", "Environmental Humanities", "Food", "Forests", "Geography", "Hazards & Disasters", "Health", "Mining & Materials", "People", "Physics & Chemistry", "Pollution", "Society & Environment", "Water", "Weather & Climate", "Wildlife");
    // $topics = array("Biodiversity");

    foreach ($topics as $topic) {
        $this->count = array(); //it initializes every topic

        $OUT = Functions::file_open($this->html_dir . str_replace(" ", "_", $topic) . ".html", "w");
        if (!$OUT) {
            exit("\nFile access problem.\n");
        }

        $url = $this->search_url . "&q=" . urlencode($topic);
        $html = Functions::lookup_with_cache($url, $this->download_options);
        if ($html && preg_match("/page 1 of (.*?)<\\/title>/ims", $html, $arr)) {
            $count = $arr[1];
            for ($i = 1; $i <= $count; $i++) {
                $html = Functions::lookup_with_cache($url . "&page={$i}", $this->download_options);
                if (!$html) {
                    continue;
                }
                if (!preg_match_all("/<h1>(.*?)<\\/h1>/ims", $html, $arr)) {
                    continue;
                }
                print_r($arr[1]);
                foreach ($arr[1] as $t) {
                    if (preg_match("/>(.*?)<\\/a>/ims", $t, $arr2)) {
                        $new_link_text = $arr2[1];
                        // Short titles are ambiguous on their own, so the topic is appended.
                        if (str_word_count($new_link_text) < 3) {
                            $new_link_text .= " ({$topic})";
                        }

                        // Number repeated link texts: first occurrence plain, then " 2", " 3", ...
                        if (!isset($this->count[$new_link_text])) {
                            $this->count[$new_link_text] = 0;
                        }
                        $this->count[$new_link_text]++;
                        $c = $this->count[$new_link_text] > 1 ? $this->count[$new_link_text] : '';

                        $t = str_replace($arr2[1], $new_link_text . " {$c}", $t);
                    }
                    fwrite($OUT, $t . "<br>");
                }
            }
        }

        fclose($OUT);
    }
}
/**
 * For $type 'usercontrib', queries the StudentContributions wiki API for a user's
 * contributions in the namespace selected by $params['article_type'] and stores the
 * rendered rows in $this->body. Other types leave the object untouched.
 *
 * @param string $type   only 'usercontrib' is handled here
 * @param array  $params reads 'server' and 'article_type' ('ForReview' or 'Published')
 */
function __construct($type, $params)
{
    // exit("\n[$value]\n");
    if ($type == 'usercontrib') {
        // Namespace ids passed as ucnamespace.
        $namespace['ForReview'] = 5000;
        $namespace['Published'] = 0;
        // NOTE(review): the `"******"` run after `ucuser=` is not valid PHP; it looks like a
        // scrubbed expression (presumably the user name taken from $params -- confirm against
        // the upstream file). It is left exactly as found and must be restored before this runs.
        $url = $params['server'] . "/StudentContributions/api.php?action=query&list=usercontribs&ucuser="******"&uclimit=100&ucdir=older&format=json&ucnamespace=" . $namespace[$params['article_type']] . "&ucshow=top";
        // expire_seconds is 0 for this lookup.
        $json = Functions::lookup_with_cache($url, array('expire_seconds' => 0));
        $arr = json_decode($json);
        $titles = array();
        foreach ($arr->query->usercontribs as $item) {
            $titles[] = array('page_title' => $item->title, 'server' => $params['server']);
        }
        // Each title is rendered by render_page_row and the rows are concatenated.
        $this->body = implode(array_map('api_reader_controller::render_page_row', $titles));
    }
}
/**
 * Downloads an archive into a fresh temp directory and unpacks it.
 *
 * Supported are .tar.gz / .tgz (via tar) and .zip or "mcz_for_eol" sources (via unzip).
 * After extraction, $check_file_or_folder_name is looked for directly in the temp directory
 * and then in the directory named after the archive.
 *
 * Changes from the previous version: paths are shell-escaped before being handed to
 * tar/unzip, the ".tgz" suffix is stripped when deriving the archive path (only ".tar.gz"
 * was), and the dot in "tar.gz" is matched literally.
 *
 * @param string $dwca_file                 URL or path of the archive
 * @param string $check_file_or_folder_name file or folder expected inside the archive
 * @param array  $download_options          options passed to Functions::lookup_with_cache()
 * @return array|null array('archive_path' => ..., 'temp_dir' => ...) on success; null when
 *                    the download, the temp file, the archive type or the extraction check fails
 */
function extract_archive_file($dwca_file, $check_file_or_folder_name, $download_options = array('timeout' => 172800, 'expire_seconds' => 0))
{
    debug("Please wait, downloading resource document...");
    $path_parts = pathinfo($dwca_file);
    $filename = $path_parts['basename'];
    $temp_dir = create_temp_dir() . "/";
    debug($temp_dir);

    $file_contents = Functions::lookup_with_cache($dwca_file, $download_options);
    if (!$file_contents) {
        debug("Connector terminated. Remote files are not ready.");
        return;
    }

    $temp_file_path = $temp_dir . "" . $filename;
    debug("temp_dir: {$temp_dir}");
    debug("Extracting... {$temp_file_path}");
    if (!($TMP = Functions::file_open($temp_file_path, "w"))) {
        return;
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);
    sleep(5); // NOTE(review): pause kept from the original; its purpose is not visible here.

    if (preg_match("/^(.*)\\.(tar\\.gz|tgz)\$/", $dwca_file)) {
        // tar extracts into the current directory, so switch to the temp dir for the call.
        $cur_dir = getcwd();
        chdir($temp_dir);
        shell_exec("tar -zxvf " . escapeshellarg($temp_file_path));
        chdir($cur_dir);
        $archive_path = str_ireplace(array(".tar.gz", ".tgz"), "", $temp_file_path);
    } elseif (preg_match("/^(.*)\\.(zip)\$/", $dwca_file) || preg_match("/mcz_for_eol(.*?)/ims", $dwca_file)) {
        shell_exec("unzip -ad " . escapeshellarg($temp_dir) . " " . escapeshellarg($temp_file_path));
        $archive_path = str_ireplace(".zip", "", $temp_file_path);
    } else {
        debug("-- archive not gzip or zip. [{$dwca_file}]");
        return;
    }
    debug("archive path: [" . $archive_path . "]");

    if (file_exists($temp_dir . $check_file_or_folder_name)) {
        return array('archive_path' => $temp_dir, 'temp_dir' => $temp_dir);
    }
    if (file_exists($archive_path . "/" . $check_file_or_folder_name)) {
        return array('archive_path' => $archive_path, 'temp_dir' => $temp_dir);
    }
    debug("Can't extract archive file. Program will terminate.");
    return;
}
/**
 * Walks the 58 pages of EOL collection 94950 and reports images that cannot be fetched.
 *
 * Each listed data object's thumbnail URL is requested (using a dedicated cache path); URLs
 * that yield nothing are collected as data object id => URL and printed at the end.
 *
 * @return void
 */
function check_if_image_is_broken()
{
    $options = array('download_wait_time' => 1000000, 'timeout' => 900, 'download_attempts' => 1); // 15mins timeout
    $broken = array();

    for ($i = 1; $i <= 58; $i++) {
        $url = "http://eol.org/collections/94950/images?page={$i}&sort_by=3&view_as=3";
        $listing = Functions::lookup_with_cache($url, $options);
        echo "\n{$i}. [{$url}]";

        // <a href="/data_objects/26326917"><img alt="84925_88_88" height="68" src="http://media.eol.org/content/2013/09/13/13/84925_88_88.jpg" width="68" /></a>
        if (!preg_match_all("/<a href=\"\\/data_objects\\/(.*?)<\\/a>/ims", $listing, $arr)) {
            continue;
        }

        $rows = $arr[1];
        $total_rows = count($rows);
        foreach ($rows as $position => $row) {
            $k = $position + 1;
            echo "\n{$i} of 58 - {$k} of {$total_rows}";

            // The data object id is everything up to the first double quote of the row.
            if (preg_match("/_xxx(.*?)\"/ims", "_xxx" . $row, $match)) {
                $id = $match[1];
            }
            if (!preg_match("/src=\"(.*?)\"/ims", "_xxx" . $row, $match)) {
                continue;
            }

            $image_url = $match[1];
            // Image lookups use their own cache location; it is removed again afterwards.
            $options['cache_path'] = "/Volumes/Eli blue/eol_cache_2/";
            if (Functions::lookup_with_cache($image_url, $options)) {
                echo "\nexists:[{$image_url}]";
            } else {
                echo "\nbroken: [{$image_url}]";
                $broken[$id] = $image_url;
            }
            unset($options['cache_path']);
        }
        // if($i >= 3) break; //debug
    }

    print_r($broken);
}
/**
 * Collects the ids of all members by paging through the members service 50 at a time.
 *
 * Paging stops at the first page that decodes to nothing. If any page cannot be downloaded
 * the whole run is abandoned and an empty list is returned.
 *
 * @return array unique user ids
 */
function get_list_of_user_ids()
{
    // return array("30860816", "5810611");
    // Laura F. [5810611], Eli Agbayani [30860816] , (User: [70505] - Ben Fawkes) has 100+ audio files
    $user_ids = array();
    debug("\n Getting all members... " . $this->EOL_members);

    $offset = 0;
    $users = true;
    while ($users) {
        $json = Functions::lookup_with_cache($this->EOL_members . "&offset={$offset}", $this->download_options);
        if (!$json) {
            debug("\n Connector terminated. Down: " . $this->EOL_members . "\n");
            return array();
        }
        $offset += 50;

        $users = json_decode($json);
        debug("\n members: " . count($users));
        if ($users) {
            // Ids are used as keys so duplicates across pages collapse.
            foreach ($users as $user) {
                $user_ids[(string) $user->id] = 1;
            }
        }
    }

    return array_keys($user_ids);
}
/**
 * Second attempt at finding a usage key: looks through the 'alternatives' of the verbose
 * taxon lookup for entries whose canonical name equals $sciname.
 *
 * @param string $sciname scientific name appended to the taxon info service URL
 * @return mixed the smallest usage key among the matching alternatives, or false when the
 *               lookup fails, has no alternatives, or none of them matches
 */
private function get_usage_key_again($sciname)
{
    $json = Functions::lookup_with_cache($this->gbif_taxon_info . $sciname . "&verbose=true", $this->download_options);
    if (!$json) {
        return false;
    }

    $response = json_decode($json);
    if (!isset($response->alternatives)) {
        return false;
    }

    $keys_by_rank = array();
    $usagekeys = array();
    foreach ($response->alternatives as $alternative) {
        if ($alternative->canonicalName != $sciname) {
            continue;
        }
        $keys_by_rank[$alternative->rank][] = $alternative->usageKey;
        $usagekeys[] = $alternative->usageKey;
    }

    /* from NCBIGGIqueryAPI.php connector
    if(isset($options["FAMILY"])) return min($options["FAMILY"]);
    else return min($usagekeys);
    */
    return $keys_by_rank ? min($usagekeys) : false;
}
/**
 * Parses the activities page into a lookup of "<text before ' ='>" => full item text.
 *
 * The page is reduced to <BR>-separated text; each segment between two <BR> tags that
 * contains " =" becomes one entry.
 *
 * @return array empty when the page cannot be downloaded or nothing matches
 */
private function get_activities()
{
    $items = array();
    $html = Functions::lookup_with_cache($this->path['activities'], $this->download_options);
    if (!$html) {
        return $items;
    }

    //manual adjustment: two places in the page need explicit <BR> separators
    $html = self::clean_str(functions::remove_whitespace($html));
    $html = str_ireplace(
        array('insect visitors</FONT></P>', '<P ALIGN="LEFT"><FONT FACE="Times New Roman">prf'),
        array('insect visitors<BR><BR>', '<BR><BR>prf'),
        $html
    );
    $html = strip_tags($html, "<BR>");

    if (!preg_match_all("/<BR>(.*?)<BR>/ims", $html, $segments)) {
        return $items;
    }
    foreach ($segments[1] as $item) {
        if (preg_match("/xxx(.*?) =/ims", "xxx" . $item, $key)) {
            $items[trim($key[1])] = $item;
        }
    }
    return $items;
}
/**
 * Parses one alien-plant fact sheet and writes its taxon (or taxa) and text sections.
 *
 * Two genus-level fact sheets (Tamarix, Lonicera) cover several species and are written from
 * fixed lists. Every other sheet is expected to describe a single taxon whose common name,
 * scientific name and family are scraped from the page. Afterwards the Native Range,
 * Description, Ecological Threat, Distribution, Habitat, Background and Biology sections are
 * extracted and written through write_alien_text(); a "****<SECTION>" marker is echoed for
 * each part that cannot be found.
 *
 * Fixes relative to the previous version:
 *  - the Lonicera genus entry passed $references in the common-name position of write_taxon();
 *  - the loops that collapse repeated spaces could never terminate as written and are
 *    replaced by a single preg_replace();
 *  - $taxon_id is initialised so a page without a recognisable scientific name returns
 *    cleanly instead of reading an undefined variable.
 *
 * @param string $url fact sheet URL
 * @return false|null false when the page lists several comma-separated names; null otherwise
 *                    (including when no scientific name or taxon id could be derived)
 */
public function write_alien_taxon($url)
{
    $taxa_page_html = utf8_encode(Functions::lookup_with_cache($url, array('validation_regex' => '<body')));

    // Simplified copy of the page: images removed, green comment paragraphs and bold section
    // headings replaced by <GREEN_COMMENT> / <MARK> tokens that the section regexes key on.
    $transformed_html = preg_replace("/<IMG( .*?)\\/?>/ims", "", $taxa_page_html);
    $transformed_html = preg_replace("/<P CLASS=\"style1\">(<B>)?(<I>)?<FONT (SIZE=\"-1\" )?COLOR=\"#(117711|007700|006600)\".*?>/", "<GREEN_COMMENT>", $transformed_html);
    $transformed_html = preg_replace("/<P (ALIGN=\"LEFT\" )?CLASS=\"style1( style1)*\">(<FONT COLOR=\"#000000\">){0,3}(<SPAN CLASS=\"style1\">)? ?<B>/", "<MARK>", $transformed_html);
    $references = $this->get_factsheet_references($transformed_html);

    // Genus-level fact sheets: taxon id => array(scientific name, common name).
    $genus_pages = array(
        "fact/tama1.htm" => array(
            'taxon_id' => "tamarix",
            'family' => "Tamaricaceae",
            'taxa' => array(
                "tamarix" => array("Tamarix", ""),
                "tamarix_aphylla" => array("Tamarix aphylla", ""),
                "tamarix_chinensis" => array("Tamarix chinensis", ""),
                "tamarix_gallica" => array("Tamarix gallica", ""),
                "tamarix_parviflora" => array("Tamarix parviflora", ""),
                "tamarix_ramosissima" => array("Tamarix ramosissima", ""),
            ),
        ),
        "fact/loni1.htm" => array(
            'taxon_id' => "lonicera",
            'family' => "Caprifoliaceae",
            'taxa' => array(
                "lonicera" => array("Lonicera", ""),
                "lonicera_fragrantissima" => array("Lonicera fragrantissima", "fragrant honeysuckle"),
                "lonicera_maackii" => array("Lonicera maackii", "Amur honeysuckle"),
                "lonicera_morrowii" => array("Lonicera morrowii", "Morrow's honeysuckle"),
                "lonicera_standishii" => array("Lonicera standishii", "Standish's honeysuckle"),
                "lonicera_tatarica" => array("Lonicera tatarica", "Tartarian honeysuckle"),
                "lonicera_xylosteum" => array("Lonicera xylosteum", "European fly honeysuckle"),
                "lonicera_x_bella" => array("Lonicera X bella", "pretty honeysuckle"),
            ),
        ),
    );

    $genus_page = null;
    foreach ($genus_pages as $path => $candidate) {
        if ($url == self::ALIEN_TAXA_PREFIX . $path) {
            $genus_page = $candidate;
            break;
        }
    }

    $taxon_id = null;
    if ($genus_page !== null) {
        $taxon_ids = array();
        foreach ($genus_page['taxa'] as $id => $names) {
            // Scientific name and canonical form are identical for these fixed entries.
            $taxon_ids[] = $this->write_taxon($id, $names[0], $names[0], $genus_page['family'], $names[1], $references);
        }
        $taxon_id = $genus_page['taxon_id'];
    } else {
        $common_name = null;
        $scientific_name = null;
        $canonical_form = null;
        $family = null;

        if (preg_match("/body\" -->.*?<IMG.*? ALT=\"(.*?)\"/ims", $taxa_page_html, $arr)) {
            $common_name = trim($arr[1]);
        } else {
            echo "****COMMON\n";
        }

        if (preg_match("/<FONT.*?SIZE=\"\\+1\">(.*?)<IMG/ims", $taxa_page_html, $arr)) {
            $scientific_name = trim(html_entity_decode($arr[1], ENT_QUOTES, 'UTF-8'));
            $scientific_name = trim(str_replace("\r", " ", $scientific_name));
            $scientific_name = trim(str_replace("\n", " ", $scientific_name));

            // The heading also carries "<...> <x> family (<Family>)"; split that off.
            if (preg_match("/^(.*)<.*?>[a-z- ]* family *\\((.*?)(\\)|, formerly)/i", $scientific_name, $arr)) {
                $scientific_name = trim($arr[1]);
                $family = trim($arr[2]);
            } else {
                echo "****FAMILY\n";
            }

            $scientific_name = str_replace("</FONT></I>", " ", $scientific_name);
            $scientific_name = str_replace("<I><BR>", "<I>", $scientific_name);
            $scientific_name = preg_replace("/<FONT.*?\\+1\">/ims", "", $scientific_name);

            // Cut the name at the first closing FONT, line break or "(previously" note.
            if (preg_match("/^(.*?)<\\/FONT>/ims", $scientific_name, $arr)) {
                $scientific_name = trim($arr[1]);
            }
            if (preg_match("/^(.*?)<BR>/ims", $scientific_name, $arr)) {
                $scientific_name = trim($arr[1]);
            }
            if (preg_match("/^(.*?)\\(previously/ims", $scientific_name, $arr)) {
                $scientific_name = trim($arr[1]);
            }

            // Collapse runs of spaces into one.
            $scientific_name = preg_replace("/ {2,}/", " ", $scientific_name);

            $scientific_name = str_replace(" </I>", "</I>", $scientific_name);
            $scientific_name = str_replace("<I> ", "<I>", $scientific_name);
            $scientific_name = preg_replace("/<\\/I>([^ ])/ims", "</I> \\1", $scientific_name);
            $scientific_name = str_replace(".<", ". <", $scientific_name);

            // <EM> nested in <I> is redundant; a lone <EM> is treated as italics.
            if (preg_match("/<I><EM>/", $scientific_name)) {
                $scientific_name = str_replace("<EM>", "", $scientific_name);
                $scientific_name = str_replace("</EM>", "", $scientific_name);
            } else {
                $scientific_name = str_replace("<EM>", "<I>", $scientific_name);
                $scientific_name = str_replace("</EM>", "</I>", $scientific_name);
            }

            // too many names
            if (preg_match("/,/", $scientific_name)) {
                return false;
            }
            list($scientific_name, $canonical_form, $taxon_id) = self::evaluate_scientific_name($scientific_name);
        } else {
            echo "****SCIENTIFIC\n";
        }

        if (!$scientific_name || !$taxon_id) {
            return;
        }
        $this->write_taxon($taxon_id, $scientific_name, $canonical_form, $family, $common_name, $references);
        $taxon_ids = array($taxon_id);
    }

    echo "\n{$url}<br/>\n";
    $authors = $this->get_factsheet_authors($transformed_html);
    $editors = $this->get_factsheet_editors($transformed_html);
    $write_options = array('taxon_ids' => $taxon_ids, 'url' => $url, 'authors' => $authors, 'editors' => $editors);

    // Native range is a short inline value and gets its own clean-up.
    if (preg_match("/NATIVE.*?RANGE(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(<\\/FONT>)?(.*?)</ims", $transformed_html, $arr)) {
        $native_range = trim(html_entity_decode($arr[3], ENT_QUOTES, 'UTF-8'));
        $native_range = trim(str_replace("\r", " ", $native_range));
        $native_range = trim(str_replace("\n", " ", $native_range));
        $native_range = preg_replace("/ {2,}/", " ", $native_range);
        $this->write_alien_text('Native Range', $native_range, 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#Distribution', $taxon_id . "/alien_range", $write_options);
    } else {
        echo "****NATIVE\n";
    }

    // Remaining sections share one shape: heading regex, capture group holding the body,
    // title, subject URI, identifier suffix and the marker echoed when the section is missing.
    $sections = array(
        array("/> *DESCRIPTION(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 2, 'Description', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#Morphology', "/alien_description", "****DESCRIPTION\n"),
        array("/> *ECOLOGICAL.*?THREAT(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 2, 'Ecological Threat in the United States', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#RiskStatement', "/alien_threat", "****ECOLOGICAL\n"),
        array("/> *DISTRIBUTION.*?IN.*?THE.*?UNITED.*?STATES(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 2, 'Distribution in the United States', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#Distribution', "/alien_distribution", "****DISTRIBUTION\n"),
        array("/> *HABITAT.*?IN.*?THE.*?UNITED.*?STATES(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 2, 'Habitat in the United States', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#Habitat', "/alien_habitat", "****HABITAT\n"),
        array("/> *BACKGROUND(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 2, 'History in the United States', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#TaxonBiology', "/alien_background", "****BACKGROUND\n"),
        array("/> *BIOLOGY.*?(&|and).*?SPREAD(<BR>.*?<\\/B>|<\\/B>.*?<BR>)(.*?)<MARK>/ims", 3, 'Biology and Spread', 'http://rs.tdwg.org/ontology/voc/SPMInfoItems#Reproduction', "/alien_biology", "****BIOLOGY\n"),
    );
    foreach ($sections as $section) {
        if (preg_match($section[0], $transformed_html, $arr)) {
            $text = self::cleanse_alien_description($arr[$section[1]]);
            $this->write_alien_text($section[2], $text, $section[3], $taxon_id . $section[4], $write_options);
        } else {
            echo $section[5];
        }
    }
}
$taxa[] = new \SchemaTaxon($taxonParameters); //if($i >= 5) break; //debug } $new_resource_xml = \SchemaDocument::get_taxon_xml($taxa); $old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml"; if (!($OUT = Functions::file_open($old_resource_path, "w+"))) { return; } fwrite($OUT, $new_resource_xml); fclose($OUT); Functions::set_resource_status_to_force_harvest($resource_id); shell_exec("rm " . $new_resource_path); //-------- // 0x73 0x20 0x68 0x61 $xml_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml"; if ($xml = Functions::lookup_with_cache($xml_path, array('timeout' => 1200, 'download_attempts' => 5, 'expire_seconds' => true))) { // $xml = str_replace(chr(0x73)." ".chr(0x20)." ".chr(0x73)." ".chr(0x6B), " ", $xml); // $xml = str_replace(array(chr(0x73), chr(0x20), chr(0x68), chr(0x61)), " ", $xml); $xml = str_replace(array(0x73, 0x20, 0x73, 0x6b), " ", $xml); $xml = str_replace(array(0x32, 0x35, 0x2e, 0x35), " ", $xml); $xml = str_replace(array(0x32, 0x33, 0x20, 0x6d), " ", $xml); $xml = str_replace(chr(0x32) . " " . chr(0x33) . " " . chr(0x20) . " " . chr(0x6d), " ", $xml); $xml = str_replace(array(0x20, 0x4e, 0x61, 0x74), " ", $xml); $xml = str_replace(array(0x73, 0x20, 0x68, 0x6f), " ", $xml); $xml = str_replace(chr(0x73) . " " . chr(0x20) . " " . chr(0x68) . " " . chr(0x6f), " ", $xml); $xml = str_replace(array(0x77, 0x65, 0x72, 0x65), " ", $xml); $xml = str_replace(array(0xe2, 0x80, 0xc2, 0xa6), " ", $xml); $xml = str_replace(array(0x6e, 0x20, 0x32, 0x30), " ", $xml); $xml = str_replace(array(0x67, 0x75, 0x65, 0x7a), " ", $xml); $xml = str_replace(array(0x73, 0x20, 0x61, 0x6e), " ", $xml); $xml = str_replace(array(0x74, 0x7a, 0x3c, 0x2f), " ", $xml);
/**
 * Records whether, and how many, GenBank sequences exist for a family.
 *
 * When the service reports a positive Count, two rows (sequence count and a "yes" flag) are
 * saved to the current dump file of $database and true is returned. Otherwise, unless the
 * name is a subfamily, zero/"no" values are added and the family name is cross-checked
 * against EOL; check_for_sub_family() is then called and false is returned.
 *
 * @param string $family       family name, also used as taxon id
 * @param bool   $is_subfamily suppresses the zero/"no" bookkeeping when true
 * @param string $database     key into $this->ggi_text_file
 * @return bool true when sequences were found
 */
private function query_family_NCBI_info($family, $is_subfamily, $database)
{
    $rec["family"] = $family;
    $rec["source"] = $this->family_service_ncbi . $family;
    $rec["taxon_id"] = $family;

    $contents = Functions::lookup_with_cache($rec["source"], $this->download_options);
    $xml = simplexml_load_string($contents);
    if ($xml && $xml->Count > 0) {
        // First row: the number of sequences.
        $rec["object_id"] = "_no_of_seq_in_genbank";
        $rec["count"] = $xml->Count;
        $rec["label"] = "Number Of Sequences In GenBank";
        $rec["measurement"] = "http://eol.org/schema/terms/NumberOfSequencesInGenBank";
        self::save_to_dump($rec, $this->ggi_text_file[$database]["current"]);

        // Second row: the yes/no flag.
        $rec["object_id"] = "SequenceInGenBank";
        $rec["count"] = "http://eol.org/schema/terms/yes";
        $rec["label"] = "SequenceInGenBank";
        $rec["measurement"] = "http://eol.org/schema/terms/SequenceInGenBank";
        self::save_to_dump($rec, $this->ggi_text_file[$database]["current"]);
        return true;
    }

    if ($is_subfamily == false) {
        $rec["object_id"] = "_no_of_seq_in_genbank";
        self::add_string_types($rec, "Number Of Sequences In GenBank", 0, "http://eol.org/schema/terms/NumberOfSequencesInGenBank", $family);
        $rec["object_id"] = "SequenceInGenBank";
        self::add_string_types($rec, "SequenceInGenBank", "http://eol.org/schema/terms/no", "http://eol.org/schema/terms/SequenceInGenBank", $family);
        self::has_diff_family_name_in_eol_api($family, $database);
    }

    self::check_for_sub_family($family);
    return false;
}
/**
 * Fetches (and thereby caches) one Tropicos API response.
 *
 * An unknown $type now returns false straight away; previously the lookup was attempted with
 * an undefined URL.
 *
 * @param string $type one of id_list, taxon_name, taxonomy, synonyms, taxon_ref,
 *                     distribution, images, chromosome
 * @param mixed  $id   start id for 'id_list', otherwise the name id placed in the URL
 * @return mixed response contents, or false when the type is unknown or the lookup fails
 */
private function create_cache($type, $id)
{
    if ($type == "id_list") {
        $pagesize = 1000; // debug orig value max size is 1000; pagesize is the no. of records returned from Tropicos master list service
        $url = TROPICOS_API_SERVICE . "List?startid={$id}&PageSize={$pagesize}&apikey=" . TROPICOS_API_KEY . "&format=json";
    } elseif ($type == "taxon_name") {
        $url = TROPICOS_API_SERVICE . $id . "?format=json&apikey=" . TROPICOS_API_KEY;
    } else {
        // The remaining types are XML sub-resources of a name id.
        $xml_resources = array(
            "taxonomy" => "HigherTaxa",
            "synonyms" => "Synonyms",
            "taxon_ref" => "References",
            "distribution" => "Distributions",
            "images" => "Images",
            "chromosome" => "ChromosomeCounts",
        );
        if (!is_string($type) || !isset($xml_resources[$type])) {
            return false;
        }
        $url = TROPICOS_API_SERVICE . $id . "/" . $xml_resources[$type] . "?format=xml&apikey=" . TROPICOS_API_KEY;
    }

    $contents = Functions::lookup_with_cache($url, $this->download_options);
    return $contents ? $contents : false;
}
/**
 * Walks every contributor gallery and processes the media of the taxa found there.
 *
 * Contributor links are read from the <ol class="unstyled"> list of the galleries page.
 * The "naturesongs" contributor is treated as sounds, all others as pictures; the
 * "habitat_images" contributor is skipped.
 *
 * @return void
 */
private function prepare_contributor_galleries()
{
    // get urls for each contributor (keyed by url so duplicates collapse)
    $urls = array();
    $html = Functions::lookup_with_cache($this->adw_page["contributor_galleries"], $this->download_options);
    if ($html
        && preg_match("/<ol class=\"unstyled\">(.*?)<\\/ol>/ims", $html, $list)
        && preg_match_all("/<li>(.*?)<\\/li>/ims", $list[1], $entries)
    ) {
        foreach ($entries[1] as $block) {
            if (preg_match("/<a href=\"(.*?)\"/ims", $block, $temp)) {
                $urls[$temp[1]] = '';
            }
        }
    }

    //manual adjustment, not images of taxa but of habitats
    $excluded = array("/collections/contributors/habitat_images/");

    // loop to each contributor and get all media
    foreach (array_keys($urls) as $url) {
        if (in_array($url, $excluded)) {
            continue;
        }
        $type = ($url == "/collections/contributors/naturesongs/") ? "sounds" : "pictures";

        echo "\ncontributor: [{$url}]\n";
        $taxa_with_media = self::get_taxa_with_media($this->domain . $url);
        self::get_media_data($taxa_with_media, $type);
    }
}
/**
 * Reads one gallery page: returns its album links and records thumbnails, gallery
 * thumbnails and pagination links on the object.
 *
 * Side effects: appends to $this->site_links_info[$url], sets $this->site_thumbnails[$url],
 * merges into $this->pagination_links and fills $this->site_thumbnails_gallery[$url].
 *
 * @param string $url page to read; blank urls yield an empty list
 * @return array first quoted value of each album title block
 */
private function get_links($url)
{
    $url = trim($url);
    if (!$url) {
        return array(); // for blank urls
    }

    $links = array();
    $html = Functions::lookup_with_cache($url, $this->download_options);
    if (!$html) {
        return $links;
    }

    if (preg_match_all("/<div class=\"ngg-albumtitle\">(.*?)<\\/div>/ims", $html, $albums)) {
        foreach ($albums[1] as $line) {
            $this->site_links_info[$url][] = $line;
            if (preg_match("/\"(.*?)\"/ims", $line, $quoted)) {
                $links[] = $quoted[1];
            }
        }
    }

    if (preg_match_all("/<div class=\"ngg-thumbnail\">(.*?)<p><\\/p>/ims", $html, $thumbnails)) {
        $this->site_thumbnails[$url] = $thumbnails[1];
    }

    if (preg_match_all("/<div class=\"ngg-gallery-thumbnail\" >(.*?)<\\/div>/ims", $html, $gallery)) {
        if (preg_match("/<div class='ngg-navigation'>(.*?)<\\/div>/ims", $html, $navigation)) {
            $urls = self::get_pagination_links($navigation[1]);
            $this->pagination_links = array_merge($this->pagination_links, $urls);
        }
        // <h1 class="category-title">Blue Sponge; Haliclona species.</h1>
        if (preg_match("/<h1 class=\"category-title\">(.*?)<\\/h1>/ims", $html, $title)) {
            $this->site_thumbnails_gallery[$url]["title"] = $title[1];
            $this->site_thumbnails_gallery[$url]["rekords"] = $gallery[1];
        } else {
            echo "\n investigate [{$url}] no title \n";
        }
    }

    return $links;
}
/**
 * Builds the taxa list from the "fullname" anchors of the species list page.
 *
 * Values are now parsed afresh for every anchor. Previously an anchor lacking a species
 * code, name or href silently inherited the value of the preceding anchor (or used an
 * undefined variable). Anchors without a species code or name are skipped.
 *
 * @return array list of array('taxon_id' => ..., 'sciname' => ..., 'source' => ...);
 *               'source' is '' when the anchor has no href
 */
private function get_taxa_list()
{
    $taxa = array();
    $html = Functions::lookup_with_cache($this->species_list, $this->download_options);
    if (!$html) {
        return $taxa;
    }
    if (!preg_match_all("/<a class=\"fullname\"(.*?)<\\/a>/ims", $html, $anchors)) {
        return $taxa;
    }

    foreach (array_map('trim', $anchors[1]) as $row) {
        $id = null;
        $name = null;
        $source = '';

        if (preg_match("/speciesCode=(.*?)\"/ims", $row, $match)) {
            $id = $match[1];
        }
        // "xxx" is appended as an end marker so the lazy match runs to the end of the row.
        if (preg_match("/\">(.*?)xxx/ims", $row . "xxx", $match)) {
            $name = trim($match[1]);
        }
        if (preg_match("/href=\"(.*?)\"/ims", $row, $match)) {
            $source = $match[1];
        }

        if ($id === null || $name === null) {
            continue;
        }
        $taxa[] = array("taxon_id" => $id, "sciname" => $name, "source" => $source);
    }

    return $taxa;
}
/**
 * Extracts common names (grouped by language label) and the authorship from a taxon page.
 *
 * When no "Citation:" block is found the cached copy is assumed to be wrong: the page is
 * downloaded again with expire_seconds = 0 so that a later run sees a fresh copy. The fresh
 * copy is not parsed in this call.
 *
 * 'authorship' is now explicitly null when the page or the citation is missing; previously
 * an undefined variable was returned in that case.
 *
 * @param string $url taxon page
 * @return array array('comnames' => list of array('lang' => ..., 'comnames' => [...]),
 *               'authorship' => string|null)
 */
private function parse_taxon_page($url)
{
    $final = array();
    $authorship = null;
    $options = $this->download_options;

    $html = Functions::lookup_with_cache($url, $options);
    if (!$html) {
        return array('comnames' => $final, 'authorship' => $authorship);
    }

    //get comnames
    if (preg_match("/<b>Other synonyms<\\/b>(.*?)<\\/font>/ims", $html, $arr)) {
        foreach (explode("<br>", $arr[1]) as $t) {
            $rec = array();
            if (preg_match("/<b>(.*?)<\\/b>/ims", $t, $label)) {
                $rec['lang'] = trim(str_ireplace(":", "", $label[1]));
            }
            // get string right side of '</b>'
            $parts = explode("</b>", $t);
            if (isset($parts[1]) && $parts[1]) {
                $rec['comnames'] = array_map('trim', explode(",", $parts[1]));
            }
            if ($rec) {
                $final[] = $rec;
            }
        }
    }

    //get authorship
    if (preg_match("/Citation:(.*?)<\\/p>/ims", $html, $arr)) {
        $authorship = Functions::remove_whitespace(strip_tags($arr[1]));
        // NOTE(review): as written this strips the character inside the quotes from the whole
        // string; verify whether the literal is a plain space or e.g. a non-breaking space.
        $authorship = str_ireplace(' ', '', $authorship);
    } else {
        // no author! this assumes that a wrong file is cached; this merits a 2nd run of the connector
        $options['expire_seconds'] = 0;
        $html = Functions::lookup_with_cache($url, $options);
        echo "\nconnector has to run again\n";
    }

    return array('comnames' => $final, 'authorship' => $authorship);
}
/**
 * Collects known-URI rows from the saved "URIs for Data on EOL" HTML pages (numbered 1 to 15).
 *
 * For every table row whose opening tag starts with "<tr id='known_uri" or
 * "<tr class='hidden' id='known_uri", the content of the cell with class 'uri'
 * (or 'excluded uri') becomes a key of the result, and the content of the first
 * plain <td> in that row (empty string when there is none) becomes its value.
 * Processing stops at the first page that cannot be fetched and whatever has been
 * collected so far is returned.
 *
 * @param array|false $download_options options passed to Functions::lookup_with_cache();
 *                                      a default set is used when falsy
 * @return array map built as described above; empty array when nothing was collected
 */
public static function get_eol_defined_uris($download_options = false)
{
    if (!$download_options) {
        $download_options = array('resource_id' => 'URIs', 'download_wait_time' => 1000000, 'timeout' => 900, 'expire_seconds' => 86400, 'download_attempts' => 1); //expires in 24 hours
    }

    // Initialised here: previously undefined at the return when the first page failed or nothing matched.
    $rec = array();
    $row_openers = array("<tr class='hidden' id='known_uri", "<tr id='known_uri");

    for ($i = 1; $i <= 15; $i++) {
        $urls = array();
        // $urls[] = "http://localhost/cp/TraitRequest/measurements/URIs for Data on EOL - Encyclopedia of Life" . $i . ".html";
        $urls[] = "https://dl.dropboxusercontent.com/u/7597512/TraitRequest/measurements/URIs for Data on EOL - Encyclopedia of Life" . $i . ".html";

        foreach ($urls as $url) {
            $html = Functions::lookup_with_cache($url, $download_options);
            if (!$html) {
                return $rec;
            }
            $html = str_ireplace("<wbr/>", "", $html);

            foreach ($row_openers as $param) {
                if (!preg_match_all("/" . $param . "(.*?)<\\/tr>/ims", $html, $arr)) {
                    continue;
                }
                foreach ($arr[1] as $t) {
                    if (preg_match("/<td class='uri'>(.*?)<\\/td>/ims", $t, $arr2) || preg_match("/<td class='excluded uri'>(.*?)<\\/td>/ims", $t, $arr2)) {
                        $val = '';
                        if (preg_match("/<td>(.*?)<\\/td>/ims", $t, $arr3)) {
                            $val = $arr3[1];
                        }
                        $rec[$arr2[1]] = $val;
                    }
                }
            }
        }
    }

    return $rec;
}
/**
 * Reports whether the barcode image at $src looks usable.
 *
 * The resource is fetched through the cache and treated as unavailable when its
 * content contains "ERROR:" (case-insensitive). Known error bodies:
 *   ERROR: Only species level taxids are accepted
 *   ERROR: Unable to retrieve sequence
 *
 * @param string $src URL of the barcode image
 * @return bool false when the response contains "ERROR:", true otherwise
 */
private function barcode_image_available($src)
{
    $response = Functions::lookup_with_cache($src, $this->download_options);
    $error_at = stripos($response, "ERROR:");

    // An integer offset means the marker was found somewhere in the response.
    return !is_int($error_at);
}
/**
 * Gathers every src="..." value from all image pages of one species.
 *
 * Page 1 is fetched first; when it contains ">1 of N</font" the captured N is used
 * as the number of pages, otherwise a single page is assumed. Pages 2..N are then
 * fetched in turn; a page that cannot be fetched is skipped.
 *
 * @param array $rec record holding the species name under the "Species" key
 * @return array all captured src values, in page order
 */
private function get_image_urls($rec)
{
    $mediaURLs = array();
    $page_url = $this->images_path . "&species=" . $rec["Species"] . "&navi=";

    $first_page = Functions::lookup_with_cache($page_url . "1", $this->download_options);
    if (!$first_page) {
        return $mediaURLs;
    }

    $page_count = 1;
    if (preg_match("/>1 of (.*?)<\\/font/ims", $first_page, $counter)) {
        $page_count = trim($counter[1]);
    }

    for ($page = 1; $page <= $page_count; $page++) {
        // Page 1 is already in hand; every later page is downloaded on demand.
        $markup = ($page == 1)
            ? $first_page
            : Functions::lookup_with_cache($page_url . $page, $this->download_options);

        if ($markup && preg_match_all("/src=\"(.*?)\"/ims", $markup, $found)) {
            $mediaURLs = array_merge($mediaURLs, $found[1]);
        }
    }

    return $mediaURLs;
}
/* Expects: $params
Array
(
    [search_type] => gen_archive_all
    [archive_id] => BHL_lit_2016_07_21_01_35_41
)

Calls the mediawiki connector for the given archive_id and renders the outcome:
a success message with a link to the generated .tar.gz and a link to the DWC-A
validator when the connector response contains "[SUCCESS]", an error message otherwise.
URLs are HTML-escaped before being written into the page because they are built
from $params['archive_id'].
*/
// echo "<pre>"; print_r($params); echo "</pre>";
$url = EOL_PHP_CODE . "update_resources/connectors/mediawiki.php?archive_id=" . urldecode($params['archive_id']);
?>
<div id="accordion_open2">
<h3>Generate EOL DWC-A</h3>
<div>
<?php
$val = Functions::lookup_with_cache($url, array('expire_seconds' => true));
if ($val && strpos($val, "[SUCCESS]") !== false) {
    self::display_message(array('type' => "highlight", 'msg' => "EOL DWC-A successfully generated."));

    // Location of the generated archive; ":" and " " in the id are replaced by "_" in the file name.
    $url = EOL_PHP_CODE . "applications/content_server/resources/" . str_replace(array(":", " "), "_", $params['archive_id']) . ".tar.gz";
    $archive_url_html = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    echo "<br>You can copy the URL below and use it as a resource in an EOL Content Partner resource account (<a target='eol' href='http://eol.org'>eol.org</a>).";
    echo "<br><br><a href='" . $archive_url_html . "'>{$archive_url_html}</a>";

    // Link to the validator, pointing it at the archive just generated.
    $url = EOL_PHP_CODE . "applications/dwc_validator/index.php?file_url=" . $url;
    $validator_url_html = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    echo "<br><br>You can also try to validate the archive file <a target='_blank' href='" . $validator_url_html . "'>here</a>.";
} else {
    // Covers both an empty connector response and a response without the success marker.
    self::display_message(array('type' => "error", 'msg' => "Process un-successful."));
    // echo "<br>[$val]<br>"; //debug
}
/**
 * Builds a lookup array from a published Google spreadsheet (HTML output).
 *
 * Each table row of the form <tr style='height:1px;'>...</tr> is split into its
 * "<td ...</td>" cells. The text that follows the first ">" of the third cell becomes
 * the key and the text that follows the first ">" of the second cell becomes the value.
 * Rows that do not have those cells, or whose cells do not contain a ">", are skipped
 * instead of producing an entry under an empty key.
 *
 * @return array map of third-cell text => second-cell text; empty when the sheet
 *               cannot be fetched or has no matching rows
 */
function generate_licensor_title_list()
{
    $recs = array();
    $url = "https://docs.google.com/spreadsheets/u/1/d/1ExBu0Q9yLXsYVNzXdIrDYt2Go6blwftAEEb5kJk-dfk/pub?output=html";
    $html = Functions::lookup_with_cache($url, array('expire_seconds' => 86400, 'download_wait_time' => 1000000)); //expires every 24 hours
    if (!$html) {
        return $recs;
    }
    if (!preg_match_all("/<tr style\\=\\'height\\:1px\\;\\'>(.*?)<\\/tr>/ims", $html, $rows)) {
        return $recs;
    }

    foreach ($rows[1] as $row) {
        if (!preg_match_all("/<td (.*?)<\\/td>/ims", $row, $cells)) {
            continue;
        }
        $cell = $cells[1];
        if (!isset($cell[1], $cell[2])) {
            continue; // row too short to carry both columns
        }

        // Each captured cell is "attributes>content"; index 1 after the split is the content.
        $value_parts = explode(">", $cell[1]);
        $key_parts = explode(">", $cell[2]);
        if (!isset($value_parts[1], $key_parts[1])) {
            continue;
        }

        $recs[$key_parts[1]] = $value_parts[1];
    }

    return $recs;
}
/**
 * Downloads the zip at $this->zip_path, unpacks it into a fresh temp directory and
 * records the paths of the expected text files in $this->text_path[1..11].
 *
 * The archive is first looked for directly under the temp directory; if
 * "Hds1-Hymenoptera-Final.txt" is not there, a sub-directory named after the zip
 * (its path without ".zip") is tried. $this->TEMP_FILE_PATH is updated to whichever
 * location is used.
 *
 * @return bool true when the files were unpacked and located; false when the download
 *              failed, the temp file could not be written, or the expected file is missing
 */
function load_zip_contents()
{
    $this->TEMP_FILE_PATH = create_temp_dir() . "/";

    $file_contents = Functions::lookup_with_cache($this->zip_path, array('timeout' => 172800, 'download_attempts' => 5));
    if (!$file_contents) {
        debug("\n\n Connector terminated. Remote files are not ready.\n\n");
        return false;
    }

    $parts = pathinfo($this->zip_path);
    $temp_file_path = $this->TEMP_FILE_PATH . "/" . $parts["basename"];
    if (!($TMP = fopen($temp_file_path, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $temp_file_path);
        return false; // was a bare return (null); every other failure path returns false
    }
    fwrite($TMP, $file_contents);
    fclose($TMP);

    // Quote both paths: the zip name comes from $this->zip_path and may contain shell-significant characters.
    shell_exec("unzip " . escapeshellarg($temp_file_path) . " -d " . escapeshellarg($this->TEMP_FILE_PATH));

    if (!file_exists($this->TEMP_FILE_PATH . "/Hds1-Hymenoptera-Final.txt")) {
        // Fall back to a directory named after the archive.
        $this->TEMP_FILE_PATH = str_ireplace(".zip", "", $temp_file_path);
        if (!file_exists($this->TEMP_FILE_PATH . "/Hds1-Hymenoptera-Final.txt")) {
            return false;
        }
    }

    for ($i = 1; $i <= 10; $i++) {
        $this->text_path[$i] = $this->TEMP_FILE_PATH . "/Hds" . $i . "-Hymenoptera-Final.txt";
    }
    $this->text_path[11] = $this->TEMP_FILE_PATH . "/HymEcoParDone.txt";

    return true;
}
/**
 * Fetches the FishBase zip ($this->fishbase_data), unpacks it into a new temp
 * directory and fills $this->text_path with the locations of the extracted files.
 *
 * "taxon.txt" is looked for directly under the temp directory first and then in a
 * sub-directory named after the zip (its path without ".zip"); $this->TEMP_FILE_PATH
 * ends up pointing at whichever location is used. The method returns nothing on any
 * path; on a failed download it prints a termination message.
 */
function load_zip_contents()
{
    $this->TEMP_FILE_PATH = create_temp_dir() . "/";

    $options = $this->download_options;
    $options['expire_seconds'] = 1728000; // expire_seconds = 20 days in normal operation 1728000

    $zip_bytes = Functions::lookup_with_cache($this->fishbase_data, $options);
    if (!$zip_bytes) {
        echo "\n\n Connector terminated. Remote files are not ready.\n\n";
        return;
    }

    $temp_file_path = $this->TEMP_FILE_PATH . "/fishbase.zip";
    $handle = Functions::file_open($temp_file_path, "w");
    if (!$handle) {
        return;
    }
    fwrite($handle, $zip_bytes);
    fclose($handle);

    shell_exec("unzip {$temp_file_path} -d {$this->TEMP_FILE_PATH}");

    if (!file_exists($this->TEMP_FILE_PATH . "/taxon.txt")) {
        // Try the directory named after the archive before giving up.
        $this->TEMP_FILE_PATH = str_ireplace(".zip", "", $temp_file_path);
        if (!file_exists($this->TEMP_FILE_PATH . "/taxon.txt")) {
            return;
        }
    }

    // text_path key => file name inside the unpacked archive
    $extracted_files = array(
        'TAXON_PATH' => "/taxon.txt",
        'TAXON_COMNAMES_PATH' => "/taxon_comnames.txt",
        'TAXON_DATAOBJECT_PATH' => "/taxon_dataobject.txt",
        'TAXON_DATAOBJECT_AGENT_PATH' => "/taxon_dataobject_agent.txt",
        'TAXON_DATAOBJECT_REFERENCE_PATH' => "/taxon_dataobject_reference.txt",
        'TAXON_REFERENCES_PATH' => "/taxon_references.txt",
        'TAXON_SYNONYMS_PATH' => "/taxon_synonyms.txt",
    );
    foreach ($extracted_files as $key => $file_name) {
        $this->text_path[$key] = $this->TEMP_FILE_PATH . $file_name;
    }
}