/**
 * Collapses a string onto a single line.
 *
 * The input is passed through Functions::remove_whitespace(), trimmed, and
 * every line feed that is still present is replaced by one space.
 *
 * @param string $string Text to normalize.
 * @return string The normalized text.
 */
private function format_string($string)
{
    $normalized = trim(Functions::remove_whitespace($string));

    // "\n" has no letter case, so a case-sensitive replace gives the same result.
    return str_replace("\n", " ", $normalized);
}
/**
 * Extracts common names and the authorship string from a taxon page.
 *
 * The page is fetched through Functions::lookup_with_cache() using
 * $this->download_options. Common names are read from the block between
 * "<b>Other synonyms</b>" and "</font>": each "<br>"-separated entry yields
 * a record with an optional 'lang' (the bold label, colons removed) and an
 * optional 'comnames' list (the comma-separated text after "</b>").
 * Authorship is the tag-stripped text between "Citation:" and "</p>".
 *
 * When the page is fetched but has no citation block, the page is fetched
 * again with 'expire_seconds' set to 0 and a notice is printed; the second
 * response is not parsed, so authorship stays null for this call.
 *
 * @param string $url Address of the taxon page.
 * @return array array('comnames' => array of records, 'authorship' => string|null)
 */
private function parse_taxon_page($url)
{
    $final = array();
    // Defined up front so the return statement never reads an undefined
    // variable when the fetch fails or no citation block is present.
    $authorship = null;
    $options = $this->download_options;

    $html = Functions::lookup_with_cache($url, $options);
    if (!$html) {
        return array('comnames' => $final, 'authorship' => $authorship);
    }

    // get comnames
    if (preg_match("/<b>Other synonyms<\\/b>(.*?)<\\/font>/ims", $html, $synonyms)) {
        foreach (explode("<br>", $synonyms[1]) as $entry) {
            $rec = array();
            if (preg_match("/<b>(.*?)<\\/b>/ims", $entry, $label)) {
                $rec['lang'] = trim(str_ireplace(":", "", $label[1]));
            }
            // the names are whatever follows the first '</b>'
            $pieces = explode("</b>", $entry);
            if (!empty($pieces[1])) {
                $rec['comnames'] = array_map('trim', explode(",", $pieces[1]));
            }
            if ($rec) {
                $final[] = $rec;
            }
        }
    }

    // get authorship
    if (preg_match("/Citation:(.*?)<\\/p>/ims", $html, $citation)) {
        $authorship = Functions::remove_whitespace(strip_tags($citation[1]));
        // NOTE(review): the search string renders as a single blank; it is
        // presumably a non-breaking space rather than U+0020 - confirm.
        $authorship = str_ireplace(' ', '', $authorship);
    } else {
        // no author! this assumes that a wrong file is cached; this merits a 2nd run of the connector
        $options['expire_seconds'] = 0;
        Functions::lookup_with_cache($url, $options); // called only to refresh the cached copy
        echo "\nconnector has to run again\n";
    }

    return array('comnames' => $final, 'authorship' => $authorship);
}
/**
 * Strips control characters from an HTML fragment and normalizes whitespace.
 *
 * The input is trimmed; line feeds, carriage returns, tabs, NUL bytes and
 * vertical tabs are removed; the result is passed through
 * Functions::remove_whitespace().
 *
 * The previous needle list contained "\o" and "\xOB" (letter O). Neither is
 * a PHP escape sequence, so they matched the literal text backslash-o and
 * backslash-x-O-B instead of NUL and vertical tab. They are written here as
 * "\0" and "\x0B". The duplicated "\t" entries are dropped.
 *
 * @param string $html Raw HTML fragment.
 * @return string The cleaned fragment.
 */
private function clean_html($html)
{
    $control_chars = array("\n", "\r", "\t", "\0", "\x0B");
    $html = str_replace($control_chars, "", trim($html));
    return Functions::remove_whitespace($html);
}
/**
 * Normalizes a free-text type-status value to a canonical term.
 *
 * Steps:
 *  1. The value is whitespace-normalized and trimmed. If it contains a space
 *     or a slash, that form is kept as 'measurement_remarks'; otherwise the
 *     remarks are an empty string.
 *  2. The value is upper-cased, the characters "[", "]" and "!" are removed,
 *     and a few spelling variants are unified (" ?" -> "?", "TYPES" ->
 *     "TYPE", "PROBABLE" -> "POSSIBLE", "NEOTYPE COLLECTION" -> "NEOTYPE",
 *     "TYPE." -> "TYPE").
 *  3. The first matching prefix rule wins; if none matches, an exact-match
 *     table is consulted. Values matching neither are returned as produced
 *     by step 2.
 *
 * The literals "POSS./PROB. PARALECTOTYPE" and "TOPOTYPE? + PARATYPE?" are
 * written on one line here; they previously carried an embedded line break.
 *
 * @param string $value Raw type-status text.
 * @return array array("type_status" => string, "measurement_remarks" => string)
 */
private function format_typeStatus($value)
{
    // Values starting with one of these prefixes collapse to the bare term.
    // Checked in order, before the exact-match table.
    static $prefix_rules = array(
        "TYPE OF " => "TYPE",
        "SYNTYPE OF " => "SYNTYPE",
        "SCHIZOSYNTYPE OF " => "SCHIZOSYNTYPE",
        "SCHIZOPARATYPE OF " => "SCHIZOPARATYPE",
    );

    // Exact (already upper-cased) variant => canonical term.
    static $exact_rules = array(
        "TYPE, NO. 15 = LECTOTYPE" => "LECTOTYPE",
        "TYPE, NO.17 = LECTOTYPE" => "LECTOTYPE",
        "LECTOTYPE/TYPE" => "LECTOTYPE",
        "LECTOTYPE, TYPE" => "LECTOTYPE",

        "SYNTYTPE" => "SYNTYPE",
        "SYTNTYPE" => "SYNTYPE",
        "SYNYPES" => "SYNTYPE",
        "SYNTPE" => "SYNTYPE",
        "SYNTYPE MAMILLATA" => "SYNTYPE",

        "PARALECTO" => "PARALECTOTYPE",
        "PARALECTOYPES" => "PARALECTOTYPE",
        "PARALECTOYPE" => "PARALECTOTYPE",

        "SYNTYPE OR HOLOTYPE?" => "SYNTYPE? + HOLOTYPE?",
        "?HOLOTYPE OR SYNTYPE" => "SYNTYPE? + HOLOTYPE?",
        "HOLOTYPE OR SYNTYPE" => "SYNTYPE? + HOLOTYPE?",
        "SYNTYPE OR HOLOTYPE" => "SYNTYPE? + HOLOTYPE?",

        "POSS./PROB. PARALECTOTYPE" => "PARALECTOTYPE?",
        "PARALECTOTYPE (POSSIBLE)" => "PARALECTOTYPE?",
        "POSSIBLE PARALECTOTYPE" => "PARALECTOTYPE?",
        "?PARALECTOTYPE" => "PARALECTOTYPE?",

        "PT OF HOLOTYPE" => "HOLOTYPE FRAGMENT",
        "PART OF HOLOTYPE" => "HOLOTYPE FRAGMENT",
        "HOLOTYPE (PART)" => "HOLOTYPE FRAGMENT",

        "PART OF TYPE" => "TYPE FRAGMENT",
        "PT OF TYPE" => "TYPE FRAGMENT",
        "PART OF TYPE MATERIAL" => "TYPE FRAGMENT",
        "PT OF TYPE MATERIAL" => "TYPE FRAGMENT",
        "TYPE (PART)" => "TYPE FRAGMENT",

        "?PT OF TYPE?" => "UNCONFIRMED TYPE",
        "?PT OF TYPE OF REGULARIS?" => "UNCONFIRMED TYPE",

        "TYPE - HOLOTYPE" => "HOLOTYPE",
        "HOLOTYPE LIMNOTRAGUS SELOUSI" => "HOLOTYPE",
        "COTYPE (HOLOTYPE" => "HOLOTYPE",
        "HOLOTYPE, TYPE" => "HOLOTYPE",

        "SYNYTPE" => "SYNTYPE",
        "FIGURED SYNTYPE" => "SYNTYPE",

        "TOPTYPE" => "TOPOTYPE",
        "TOPOTYPICAL" => "TOPOTYPE",

        "COTYPUS" => "COTYPE",
        "CO-TYPE" => "COTYPE",

        "POSSIBLE COTYPE (FIDE M. R. BROWNING)" => "POSSIBLE COTYPE",
        "SYNTYPE OR PARALECTOTYPE" => "SYNTYPE? + PARALECTOTYPE?",
        "SYNTYPE OR LECTOTYPE" => "SYNTYPE? + LECTOTYPE?",
        "TOPOTYPE (STATED BY THE DONOR TO BE PARATYPE)" => "TOPOTYPE? + PARATYPE?",
        "HOLOTYPE/PARATYPE?" => "HOLOTYPE + PARATYPE?",
        "HOLOTYPE/SYNTYPE" => "HOLOTYPE + SYNTYPE",
        "SYNTYPE/HOLOTYPE" => "HOLOTYPE + SYNTYPE",
        "HOLOTYPE/LECTOTYPE" => "HOLOTYPE + LECTOTYPE",
        "NEOTYPE (POSSIBLE)" => "NEOTYPE?",
        "LECTOTYPE (POSSIBLE)" => "LECTOTYPE?",
        "ALLOTYPE (POSSIBLE)" => "ALLOTYPE?",
        "ORIGINAL MATERIAL." => "ORIGINALMATERIAL",
        "PART OF LECTOTYPE" => "LECTOTYPE FRAGMENT",
        "PART OF PARATYPE" => "PARATYPE FRAGMENT",
        "ISTOTYPE" => "ISOTYPE",
        "PARATYPE (ALLOTYPE)" => "ALLOTYPE",

        "PARATYPE #5" => "PARATYPE",
        "PARATYPE V" => "PARATYPE",
        "PARATYPE I" => "PARATYPE",
        "PARATYPE II" => "PARATYPE",
        "PARATYPE #2" => "PARATYPE",
        "PARATYPE #3" => "PARATYPE",
        "PARATYPE (NO.52)" => "PARATYPE",
        "PARATYPE #1" => "PARATYPE",
        "PARATYPE #9" => "PARATYPE",
        "PARATYPE III" => "PARATYPE",
        "PARATYPE II AND III" => "PARATYPE",
        "PARATYPE III AND IV" => "PARATYPE",
        "PARATYPE #10" => "PARATYPE",
        "PARATYPE #7" => "PARATYPE",
        "PARATYPE #4" => "PARATYPE",
        "PARATYPE #6" => "PARATYPE",
        "PARATYPE (NO.65)" => "PARATYPE",
        "PARATYPE #8" => "PARATYPE",
        "PARAYPE" => "PARATYPE",
        "PARATYPE)" => "PARATYPE",
    );

    $value = trim(Functions::remove_whitespace($value));

    // Multi-word or slash-separated input is preserved verbatim as remarks.
    $has_detail = strpos($value, " ") !== false || strpos($value, "/") !== false;
    $measurement_remarks = $has_detail ? $value : "";

    $value = trim(strtoupper($value));
    $value = str_ireplace(array("[", "]", "!"), "", $value);
    $value = str_ireplace(" ?", "?", $value);
    $value = str_ireplace("TYPES", "TYPE", $value);
    $value = str_ireplace("PROBABLE", "POSSIBLE", $value);
    $value = str_ireplace("NEOTYPE COLLECTION", "NEOTYPE", $value);
    $value = str_ireplace("TYPE.", "TYPE", $value);

    $matched_prefix = false;
    foreach ($prefix_rules as $prefix => $canonical) {
        if (strpos($value, $prefix) === 0) {
            $value = $canonical;
            $matched_prefix = true;
            break;
        }
    }

    // Exact rules apply only when no prefix rule fired.
    if (!$matched_prefix && isset($exact_rules[$value])) {
        $value = $exact_rules[$value];
    }

    return array("type_status" => $value, "measurement_remarks" => $measurement_remarks);
}
/**
 * Extracts a list of reference strings from a page's HTML.
 *
 * If the HTML has a "references:" section ending at "<hr", only that section
 * is examined. Paragraphs under "scientific articles:" are collected first
 * and appended to whichever of the three strategies below produces a result:
 *
 *  1. anchors of the form html#... (URL-decoded), e.g. Cancer_gracilis.html
 *  2. the first line of each "<br> "-separated chunk, tags stripped and
 *     de-duplicated, e.g. Paranemertes_peregrina.html
 *  3. the cleaned body of each "<p>"-separated chunk, e.g.
 *     Neognathophausia_ingens.html (this result is also de-duplicated)
 *
 * @param string $html Page HTML.
 * @return array List of reference strings; empty when nothing was found.
 */
private function assign_reference($html)
{
    $headings = array("Dichotomous Keys:", "General References:", "Scientific Articles:", "Web sites:");

    if (preg_match("/references:(.*?)<hr/ims", $html, $found)) {
        $html = $found[1];
    }

    // process scientific articles
    $scientific_articles = array();
    if (preg_match("/scientific articles:(.*?)xxx/ims", $html . "xxx", $found)) {
        // NOTE(review): the second needle renders as a plain blank; presumably
        // it is a non-breaking space - confirm.
        $section = str_replace(array("\n", " "), " ", $found[1]);
        $section = str_ireplace($headings, "", $section);
        $section = Functions::remove_whitespace($section);
        if (preg_match_all("/<p>(.*?)<\\/p>/ims", $section, $paragraphs)) {
            $items = array_map('trim', array_map('strip_tags', $paragraphs[1]));
            $items = array_values(array_filter($items)); // drop empties, reindex
            if ($items) {
                $scientific_articles = $items;
            }
        }
    }
    // end scientific articles

    // 1st option: the one with #, e.g. Cancer_gracilis.html
    $anchored = array();
    if (preg_match_all("/html\\#(.*?)\"/ims", $html, $found)) {
        $anchored = array_map('urldecode', $found[1]);
    }
    if ($anchored) {
        return array_merge($anchored, $scientific_articles);
    }

    // 2nd option: the one without hyperlinks, e.g. Paranemertes_peregrina.html
    // Array keys are used to de-duplicate while keeping first-seen order.
    $seen = array();
    foreach (array_map('strip_tags', explode("<br> ", $html)) as $chunk) {
        $lines = explode("\n", $chunk);
        $seen[trim($lines[0])] = '';
    }
    $plain = array_filter(array_keys($seen)); // remove empty values
    if ($plain) {
        return array_merge($plain, $scientific_articles);
    }

    // 3rd option: the one with actual reference body, e.g. Neognathophausia_ingens.html
    $bodies = array();
    foreach (array_map('strip_tags', explode("<p>", $html)) as $chunk) {
        $chunk = self::clean_string($chunk);
        $chunk = str_ireplace($headings, "", $chunk);
        $bodies[] = trim($chunk);
    }
    $bodies = array_filter($bodies); // remove empty values
    if ($bodies) {
        $merged = array_filter(array_merge($bodies, $scientific_articles));
        return array_values(array_unique($merged));
    }

    return array();
}
/**
 * Walks the LifeDesk taxon pages and collects their articles per scientific name.
 *
 * For every page returned by self::get_nodes_or_pages("taxa", ...):
 *  - the page HTML is fetched from http://{name}.lifedesks.org/pages/{page};
 *  - each <h3 class="taxonpage"> section is stored as title => body;
 *  - the <h1 class="taxonpage"> text is used as the scientific name under which
 *    the articles and the page id are recorded;
 *  - when $params['scratchpad_biblio'] is set, every biblio title found in the
 *    page is linked to the name in $this->biblio_taxa (DATA-1552);
 *  - when $params['scratchpad_taxonomy'] is set, every "biblio/view/..." id in
 *    the page is collected in $this->taxonomy_biblio (DATA-1554).
 *
 * At the end the collected records are handed to self::save_taxon_articles_to_text().
 *
 * @param array $params Reads the keys 'name', 'scratchpad_biblio' and
 *                      'scratchpad_taxonomy'; also passed on to get_nodes_or_pages().
 * @return void
 */
private function get_taxon_descriptions_from_LD_taxon_pages($params)
{
    // for biblio spreadsheet
    $biblios = self::get_row_from_spreadsheet(@$params['scratchpad_biblio'], "Title");
    // $headers = self::get_column_headers($this->file_importer_xls["text"]);
    $headers = $this->lifedesk_fields["text"];
    // for stats
    // NOTE(review): $dump_file is computed but not read anywhere else in this method.
    $parts = pathinfo($this->text_path["eol_xml"]);
    $dump_file = $parts["dirname"] . "/images_not_in_xls2.txt";
    $options = $this->download_options;
    $options['expire_seconds'] = $this->LD_nodes_pages_expire_seconds; // lookup to LifeDesk page should not expire unless requested to have a fresh export to scratchpad
    // NOTE(review): unit of this wait time is not visible here (presumably microseconds) - confirm.
    $options['download_wait_time'] = 2000000;
    // NOTE(review): $topics is only filled by the commented-out block below, so it stays empty.
    $topics = array();
    $records = array();
    //start accessing individual taxon page in LD
    if ($pages = self::get_nodes_or_pages("taxa", $params, $options)) {
        $total = count($pages);
        $i = 0;
        foreach ($pages as $page) {
            $i++;
            echo "\n{$i} of {$total} --- page: [{$page}]";
            // stays false when the page has no <h1 class="taxonpage">; the biblio steps are then skipped
            $sciname = false;
            // <h3 class="taxonpage">Distribution</h3>
            if ($html = Functions::lookup_with_cache("http://" . $params["name"] . ".lifedesks.org/pages/{$page}", $options)) {
                /* getting just topics -- working
                if(preg_match_all("/<h3 class=\"taxonpage\">(.*?)<\/h3>/ims", $html, $arr)) {
                    $topics = array_merge($topics, $arr[1]);
                    $topics = array_unique($topics);
                }
                */
                // /*
                $rec = array();
                // give the last section an explicit end marker so the regex below captures it too
                $html = str_ireplace('<div class="taxonpage-children">', '<div class="sub-chapter"><div class="taxonpage-children">', $html);
                if (preg_match_all("/<h3 class=\"taxonpage\">(.*?)<div class=\"sub-chapter\">/ims", $html, $arr)) {
                    $sections = $arr[1];
                    foreach ($sections as $section) {
                        $str = strip_tags($section, "<p><em><h3>");
                        $str = str_ireplace(array('Comment (0)', "\n"), '', $str);
                        $str = Functions::remove_whitespace($str);
                        // text before '</h3>' is the section title, text after it is the body
                        // (this reuses $parts, which held the pathinfo() result above)
                        $parts = explode("</h3>", $str);
                        $parts = array_map('trim', $parts);
                        $rec[$parts[0]] = $parts[1];
                    }
                }
                // else echo "\nno articles\n"; working...
                if (preg_match("/<h1 class=\"taxonpage\">(.*?)<\\/h1>/ims", $html, $arr)) {
                    $sciname = trim(strip_tags($arr[1]));
                    $records[$sciname]["articles"] = $rec; // assigning objects to sciname
                    $records[$sciname]["page"] = $page; // assigning page to sciname
                }
                // */
                // /*
                // DATA-1552
                if (@$params['scratchpad_biblio'] && $sciname) {
                    // if(preg_match("/<h2 class=\"taxonpage\">References<\/h2>(.*?)title=\"About this site\">About this site<\/a>/ims", $html, $arr))
                    if (true) {
                        // $html = $arr[1];
                        foreach ($biblios as $biblio) {
                            // the title is searched with and without tags on both sides
                            if (is_numeric(stripos($html, $biblio)) || is_numeric(stripos(strip_tags($html), $biblio)) || is_numeric(stripos($html, strip_tags($biblio))) || is_numeric(stripos(strip_tags($html), strip_tags($biblio)))) {
                                $this->biblio_taxa[$biblio][] = $sciname;
                                echo "\nwith biblio taxa 1\n";
                            } else {
                                // second attempt after removing line feeds; note that $html stays
                                // modified for the remaining titles and for the DATA-1554 step
                                $html = str_ireplace(array("\n"), "", $html);
                                if (is_numeric(stripos($html, $biblio)) || is_numeric(stripos(strip_tags($html), $biblio)) || is_numeric(stripos($html, strip_tags($biblio))) || is_numeric(stripos(strip_tags($html), strip_tags($biblio)))) {
                                    $this->biblio_taxa[$biblio][] = $sciname;
                                    echo "\nwith biblio taxa 2\n";
                                }
                            }
                        }
                    }
                }
                // */
                // /*
                // DATA-1554
                if (@$params['scratchpad_taxonomy'] && $sciname) {
                    if (preg_match_all("/biblio\\/view\\/(.*?)\"/ims", $html, $arr)) {
                        // append to the ids already collected for this name, if any
                        if ($val = @$this->taxonomy_biblio[$sciname]) {
                            $this->taxonomy_biblio[$sciname] = array_merge($val, $arr[1]);
                        } else {
                            $this->taxonomy_biblio[$sciname] = $arr[1];
                        }
                    }
                }
                // */
            }
        } //foreach page
    }
    $topics = array_unique($topics);
    if ($records) {
        self::save_taxon_articles_to_text($records, $headers);
    }
}
/**
 * Splits an opening anchor fragment into its link target and label.
 *
 * Example input: <a href="taxa/scaritinae">Scaritinae Bonelli, 1810
 *
 * 'href' is the value of the first href="..." attribute. 'name' is everything
 * after the first ">" (up to an "xxx" sentinel appended to the input), passed
 * through Functions::remove_whitespace(). A key is omitted when its pattern
 * does not match.
 *
 * @param string $str Anchor fragment.
 * @return array Array with optional keys 'href' and 'name'.
 */
private function parse_href_string($str)
{
    $parsed = array();

    $has_href = preg_match("/href=\"(.*?)\"/ims", $str, $href_match);
    if ($has_href) {
        $parsed['href'] = $href_match[1];
    }

    // The appended sentinel lets the lazy group run to the end of the input.
    $has_name = preg_match("/>(.*?)xxx/ims", $str . 'xxx', $name_match);
    if ($has_name) {
        $parsed['name'] = Functions::remove_whitespace($name_match[1]);
    }

    return $parsed;
}
/**
 * Creates the taxon, image and text objects for each species page.
 *
 * For every entry in $taxa the method:
 *  - writes an \eol_schema\Taxon through $this->archive_builder, once per
 *    taxon id (tracked in $this->taxa_ids);
 *  - fetches $this->acg_domain . $taxon['href'] and isolates the article body;
 *  - collects the <p> paragraphs that do not embed an image as text, and the
 *    <td><img> / <p><img> cells as images with their captions;
 *  - maps figure numbers found in captions to image URLs and passes that map
 *    to self::add_image_links() for every paragraph;
 *  - hands images and the joined text to self::create_image_text_objects().
 *
 * Changes from the previous version:
 *  - $total_txt and $caption are initialised for every taxon. They used to be
 *    set only inside conditionals, which raised undefined-variable warnings
 *    and let the previous taxon's text or caption leak onto the current one.
 *  - The paragraph loop no longer reuses $t, which overwrote the Taxon object.
 *  - The family guard runs before $taxon['family'] is read.
 *  - The fourth caption pattern ("/figura (.*?)\\./ims") is dropped: with the
 *    "i" flag it matched exactly what the first pattern matches, so it could
 *    never be reached.
 *
 * @param array $taxa List of arrays; the keys read here are 'sciname',
 *                    'family' and 'href'.
 * @return void
 */
private function prepare_texts($taxa)
{
    // Inline attributes removed from the article body, applied in this order.
    // The order matters: the "'Helvetica Neue'" fragment must be removed
    // before the shortened 12px style attribute can match.
    $noise_attributes = array(
        ' min-height: 14px;',
        ' style="margin: 0px; line-height: normal; font-family: Helvetica;"',
        ' class="p1"',
        ' style="text-align: justify;"', // e.g. Aellopos ceculus (Sphingidae)
        ' style="font: normal normal normal 12px/normal Helvetica; margin: 0px;"', // e.g. Calydna sturnula (Riodinidae)
        " 'Helvetica Neue'; margin: 0px;",
        ' style="font: normal normal normal 12px/normal"',
        ' style="font: normal normal normal 18px/normal"',
    );
    // Tried in order; the first one that matches supplies the figure number.
    $figure_patterns = array("/Figura (.*?)\\./ims", "/Fig.(.*?)\\./ims", "/Fig(.*?)\\./ims");

    $k = 0;
    foreach ($taxa as $taxon) {
        $k++;
        if ($k % 10 == 0) {
            echo "\n {$k} - ";
        }
        $taxon = array_map('trim', $taxon);

        // create taxon
        $taxon['sciname'] = self::clean_name($taxon['sciname']);
        $taxon['taxon_id'] = strtolower(str_replace(" ", "_", $taxon['sciname']));
        $t = new \eol_schema\Taxon();
        $t->taxonID = $taxon['taxon_id'];
        $t->scientificName = $taxon['sciname'];
        // A taxon without a family aborts the whole run after dumping the record.
        if (empty($taxon['family'])) {
            print_r($taxon);
            exit;
        }
        $t->family = $taxon['family'];
        $t->furtherInformationURL = $this->acg_domain . $taxon['href'];
        if (!isset($this->taxa_ids[$t->taxonID])) {
            $this->taxa_ids[$t->taxonID] = '';
            $this->archive_builder->write_object_to_file($t);
        }

        // start prepare objects - image, text
        $html = Functions::lookup_with_cache($this->acg_domain . $taxon['href'], $this->download_options);
        if (!$html) {
            echo "\ninvestigate: page not found \n";
            continue;
        }

        // NOTE(review): the first two needles both render as a plain blank;
        // presumably at least one is a non-breaking space - confirm.
        $html = str_ireplace(array(" ", " ", ' colspan="2"', ' rowspan="1"', ' style="margin-bottom: 0in;"', ' lang="es-CR"', ' style="text-align: center;"', ' style="line-height: 1.3em;"'), "", $html);
        $html = str_ireplace("Fig..", "Fig.", $html);

        if (!preg_match("/<h1 class=\"titulo-articulo nombreCientifico\">(.*?)<a class=\"top\" href=\"#arriba\">/ims", $html, $match)) {
            echo "\ninvestigate: no articulo sphomep \n";
            continue;
        }

        $str = strip_tags($match[1], "<p><td><tr><table><i><img>");
        $str = str_ireplace($noise_attributes, '', $str);

        // --- text paragraphs ---
        $total_txt = array();
        if (preg_match_all("/<p>(.*?)<\\/p>/ims", $str, $match2)) {
            $total_txt = $match2[1];
            // paragraphs that embed an image are handled as images, not text
            foreach ($total_txt as $idx => $paragraph) {
                if (stripos($paragraph, "src=") !== false) {
                    $total_txt[$idx] = "";
                }
            }
            if (count($total_txt) < 4) {
                echo "\ninvestigate txt: {$this->acg_domain}" . "{$taxon['href']} \n";
            }
            $total_txt = array_values(array_filter(array_map('trim', $total_txt)));
            // remove <p> in each array value
            foreach ($total_txt as $idx => $paragraph) {
                $total_txt[$idx] = str_ireplace("<p>", "", $paragraph);
            }
        }

        // --- images ---
        $total_img = array();
        if (preg_match_all("/<td><img(.*?)<\\/td>/ims", $str, $match2)) {
            $total_img = $match2[1];
        }
        if (preg_match_all("/<p><img(.*?)<\\/p>/ims", $str, $match2)) {
            $total_img = array_merge($total_img, $match2[1]);
        }
        if (count($total_img) < 4) {
            echo "\ninvestigate img: {$this->acg_domain}" . "{$taxon['href']} \n";
        }

        $caption_src = array();
        $final_images = array();
        // Reset for each taxon so a caption never carries over from another page.
        // NOTE(review): within one page an image without its own caption text
        // still inherits the previous image's caption, as before - confirm
        // whether that is intended.
        $caption = '';
        foreach ($total_img as $img) {
            $src = false;
            if (preg_match("/src=\"(.*?)\"/ims", $img, $match2)) {
                // drop the thumbnail prefix so the URL points at the full-size file
                $src = $this->acg_domain . trim(str_ireplace("miniaturas/peq_", "", $match2[1]));
            }
            if (preg_match("/\\/>(.*?)\\(Click en la imágen para expandir\\)/ims", $img, $match2)) {
                $caption = Functions::remove_whitespace($match2[1]);
            }
            // manual adjustment
            $caption = self::adjust_caption($caption);

            // skip cells without a source and inline data: images
            if (!$src || stripos($src, "http://www.acguanacaste.ac.crdata:image/png") !== false) {
                continue;
            }

            $final_images[] = array("src" => $src, "caption" => $caption);
            // assign Figure # with src
            foreach ($figure_patterns as $pattern) {
                if (preg_match($pattern, $caption, $match2)) {
                    $caption_src[self::get_number_only($match2[1])] = $src;
                    break;
                }
            }
        }
        if ($final_images) {
            self::create_image_text_objects($final_images, $taxon, "image");
        }

        // start adding image links at the end of text objects
        foreach ($total_txt as $idx => $txt) {
            $total_txt[$idx] = self::add_image_links($txt, $caption_src);
        }
        if ($total_txt) {
            $total_txt = array_map('trim', $total_txt);
            $desc = Functions::remove_whitespace(implode("<p>", $total_txt));
            self::create_image_text_objects(array(0 => $desc), $taxon, "text");
        }
    }
}
/**
 * Removes line breaks and tab characters from a string and normalizes its whitespace.
 *
 * When $type is "name" (loose comparison) HTML tags are stripped as well.
 * The result is passed through Functions::remove_whitespace() and trimmed.
 *
 * @param string $string Text to clean.
 * @param mixed  $type   Pass "name" to also strip tags; defaults to false.
 * @return string The cleaned text.
 */
private function clean_string($string, $type = false)
{
    // NOTE(review): the empty-string needle is a no-op for str_ireplace();
    // it may stand for a character lost in editing (possibly "\0") - confirm.
    $unwanted = array("\r\n", "\n", "\r", "\t", "", "\v", "\t");
    $cleaned = str_ireplace($unwanted, '', $string);

    if ($type == "name") {
        $cleaned = strip_tags($cleaned);
    }

    $cleaned = Functions::remove_whitespace($cleaned);
    return trim($cleaned);
}
/**
 * Normalizes whitespace in a wiki text fragment and removes line feeds and tabs.
 *
 * The fragment goes through Functions::remove_whitespace() first; any "\n"
 * or "\t" still present afterwards is deleted.
 *
 * @param string $substr Wiki text fragment.
 * @return string The cleaned fragment.
 */
private function format_wiki_substr($substr)
{
    $normalized = Functions::remove_whitespace($substr);
    $stripped = str_replace(array("\n", "\t"), "", $normalized);
    return $stripped;
}
/**
 * Processes species pages and creates taxon and length data for them.
 *
 * For every URL not listed in $this->stored_offline_urls the page is fetched
 * through Functions::lookup_with_cache(). URLs that cannot be fetched are
 * recorded with self::save_to_dump(). A page is used only when it contains a
 * ">Family: " marker and a name in an <h1>, <FONT FACE="Arial"> or <h2>
 * element (tried in that order). Names containing "cf.", "sp.",
 * "Unidentified Stonefly" or "Family" are skipped, as are names whose
 * canonical form has no space (i.e. above species level).
 *
 * Image parsing is currently switched off ($images is always empty), so a
 * record is created only when self::parse_texts() returns lengths.
 *
 * Changes from the previous version: the dead store into the undefined local
 * $rec and the unused capture of the "Family:" match are removed, the
 * exclusion scan stops at the first hit, "@" suppression is replaced by
 * isset(), and the nesting is flattened with guard clauses.
 *
 * @param array $urls List of page URLs.
 * @return void
 */
private function process_urls($urls)
{
    $to_exclude = array("cf.", "sp.", "Unidentified Stonefly", "Family");

    $i = 0;
    $total = count($urls);
    foreach ($urls as $url) {
        $i++;
        echo "\n - {$i} of {$total} [{$url}]\n";
        if (isset($this->stored_offline_urls[$url])) {
            continue;
        }

        $html = Functions::lookup_with_cache($url, $this->download_options);
        if (!$html) {
            self::save_to_dump($url, $this->current_offline_urls_dump_file);
            continue;
        }

        $html = trim(str_ireplace(array(' align="center"', ' class="style1"', ' class="style2"'), "", $html));

        // pages without a family line are not species pages
        if (!preg_match("/>Family: (.*?)xxx/ims", $html . "xxx")) {
            continue;
        }

        if (!(preg_match("/<h1>(.*?)<\\/h1>/ims", $html, $arr) || preg_match("/<FONT FACE=\"Arial\">(.*?)<\\/FONT>/ims", $html, $arr) || preg_match("/<h2>(.*?)<\\/h2>/ims", $html, $arr))) {
            echo "\n investigate: no sciname [{$url}]";
            continue;
        }

        $sciname = Functions::remove_whitespace(strip_tags($arr[1]));
        $sciname = trim(str_replace(array(chr(13), chr(10)), " ", $sciname));
        // manual adjustments
        if ($sciname == "Small Winter Stoneflies") {
            $sciname = "Allocapnia sp.";
        }
        if ($sciname == "Embioptera: Family Oligotomidae") {
            $sciname = "Oligotomidae";
        }
        $sciname = trim($sciname);

        $include = true;
        foreach ($to_exclude as $exclude) {
            if (stripos($sciname, $exclude) !== false) {
                $include = false;
                break;
            }
        }
        if (!$include) {
            continue;
        }

        $sciname = Functions::canonical_form($sciname);
        // only species-level
        if (stripos($sciname, " ") === false) {
            continue;
        }

        // $images = self::parse_images($html, $url); working... temporarily commented
        $images = array();
        $info = self::parse_texts($html, $url);
        $lengths = isset($info["lengths"]) ? $info["lengths"] : null;
        $wingspan = isset($info["wingspan"]) ? $info["wingspan"] : null;
        if (!$images && !$lengths) {
            continue;
        }

        $r = array();
        $r["sciname"] = $sciname;
        $r["taxon_id"] = str_replace(" ", "_", $r["sciname"]);
        $r["source"] = $url;
        // the taxon is created before images/lengths/wingspan are attached to the record
        self::create_instances_from_taxon_object($r);
        $r["images"] = $images;
        $r["lengths"] = $lengths;
        $r["wingspan"] = $wingspan;
        if ($lengths) {
            self::prepare_length_structured_data($r);
        }
        if ($images) {
            self::prepare_image_objects($r);
        }
    }
}
/**
 * Creates a data object for every distribution map listed under a heading.
 *
 * $str is inserted unescaped into a regular expression, and every match up
 * to the next "</li>" is inspected. Only entries containing "MapGen" are
 * treated as distribution maps. For each map a record is built (source URL,
 * enlarged media URL, description with legend, title, licence) and passed to
 * self::create_data_object(); entries without an image source are reported
 * on standard output instead.
 *
 * @param string $str   Pattern fragment that precedes the list entries.
 * @param string $html  Page HTML.
 * @param mixed  $t     Taxon record; "ID" and "Rank" are read from it and it
 *                      is passed to self::get_scientific_name().
 * @param mixed  $group Value stored under the record's "group" key.
 * @return void
 */
private function parse_distribution_map($str, $html, $t, $group)
{
    if (!preg_match_all("/" . $str . "(.*?)<\\/li>/ims", $html, $entries)) {
        return;
    }

    foreach ($entries[1] as $entry) {
        // only distribution maps will be processed
        if (strpos($entry, "MapGen") === false) {
            continue;
        }

        $map = array();
        $map["taxon_id"] = (string) $t["ID"];
        $map["media_url"] = "";

        if (preg_match("/href=\"(.*?)\"/ims", $entry, $hit)) {
            $map["source_url"] = $hit[1];
        }
        if (preg_match("/src=\"(.*?)\"/ims", $entry, $hit)) {
            // request the large rendering instead of the thumbnail
            $map["media_url"] = str_ireplace("Width=120", "Width=960", $hit[1]);
        }
        if (preg_match("/<\\/a>(.*?)xxx/ims", $entry . "xxx", $hit)) {
            $text = "Note: Distribution maps are often incomplete due to the workload of entering data.<br>" . trim($hit[1]) . "<br>";
            $text .= self::map_legend();
            $text = trim(str_replace(array("\t", "\n", "\r"), "", $text));
            $map["description"] = Functions::remove_whitespace($text);
        }

        $map["media_id"] = "map_" . $map["taxon_id"];
        $map["title"] = "Distribution for " . $t["Rank"] . " " . self::get_scientific_name($t);
        $map = array_map('trim', $map);

        // added after the trim pass, as in the original order
        $map["group"] = $group;
        $map["subtype"] = "Map";
        $map["citation"] = self::parse_citation($html);
        $map["CreativeCommons"] = "BY-SA"; // distribution maps are by default BY-SA, until discovered otherwise

        if ($map["media_url"]) {
            self::create_data_object($map);
            continue;
        }

        echo "\n investigate no map" . "\n {$str}" . "\n {$html}" . "\n [" . $map["taxon_id"] . "] [{$group}]\n";
    }
}