예제 #1
0
 /**
  * Flattens a string onto a single line.
  *
  * The input is passed through Functions::remove_whitespace(), trimmed, and
  * any newline still present is replaced by a single space.
  *
  * @param string $string Raw text.
  * @return string The flattened text.
  */
 private function format_string($string)
 {
     $normalized = Functions::remove_whitespace($string);
     $trimmed = trim($normalized);
     return str_ireplace(array("\n"), " ", $trimmed);
 }
예제 #2
0
 /**
  * Extracts common names and the authorship citation from a taxon page.
  *
  * Common names are read from the "Other synonyms" block: each "<br>"-separated
  * line contributes an optional language label (the text inside <b>...</b>,
  * colons removed) and the comma-separated names that follow the closing </b>.
  * Authorship is the tag-stripped text between "Citation:" and the next </p>.
  *
  * @param string $url Address handed to Functions::lookup_with_cache().
  * @return array array('comnames' => list of array('lang' => string, 'comnames' => string[]),
  *               'authorship' => string, or null when the page could not be
  *               fetched or carries no citation).
  */
 private function parse_taxon_page($url)
 {
     $final = array();
     // Defined up front: the return statement must never read an undefined variable
     // when the download fails or the page has no "Citation:" block.
     $authorship = null;
     $options = $this->download_options;
     if ($html = Functions::lookup_with_cache($url, $options)) {
         //get comnames
         if (preg_match("/<b>Other synonyms<\\/b>(.*?)<\\/font>/ims", $html, $synonyms)) {
             foreach (explode("<br>", $synonyms[1]) as $line) {
                 $rec = array();
                 if (preg_match("/<b>(.*?)<\\/b>/ims", $line, $label)) {
                     $rec['lang'] = trim(str_ireplace(":", "", $label[1]));
                 }
                 // get string right side of '</b>'
                 $pieces = explode("</b>", $line);
                 if (!empty($pieces[1])) {
                     $rec['comnames'] = array_map('trim', explode(",", $pieces[1]));
                 }
                 if ($rec) {
                     $final[] = $rec;
                 }
             }
         }
         //get authorship
         if (preg_match("/Citation:(.*?)<\\/p>/ims", $html, $citation)) {
             $authorship = Functions::remove_whitespace(strip_tags($citation[1]));
             $authorship = str_ireplace('&nbsp;', '', $authorship);
         } else {
             // no author! this assumes that a wrong file is cached; this merits a 2nd run of the connector.
             // The page is requested again with expire_seconds = 0 (presumably to refresh the cache —
             // confirm against lookup_with_cache); the fresh copy is not re-parsed in this run.
             $options['expire_seconds'] = 0;
             Functions::lookup_with_cache($url, $options);
             echo "\nconnector has to run again\n";
         }
     }
     return array('comnames' => $final, 'authorship' => $authorship);
 }
예제 #3
0
 /**
  * Removes control characters from an HTML fragment and normalises whitespace.
  *
  * The input is trimmed, then newline, carriage return, tab, NUL and vertical
  * tab are deleted before the result is handed to Functions::remove_whitespace().
  *
  * The previous search list spelled the last two entries with the letter "O"
  * ("\o", "\xOB"), which PHP treats as literal backslash sequences rather than
  * the NUL and vertical-tab characters, so those characters were never removed.
  *
  * @param string $html Raw HTML.
  * @return string Cleaned HTML.
  */
 private function clean_html($html)
 {
     // Same character set that trim() strips by default, minus the plain space.
     $control_chars = array("\n", "\r", "\t", "\0", "\x0B");
     $html = str_replace($control_chars, "", trim($html));
     return Functions::remove_whitespace($html);
 }
예제 #4
0
 /**
  * Normalises a free-text type-status label to a controlled upper-case term.
  *
  * Steps: whitespace clean-up, upper-casing, a fixed sequence of textual
  * substitutions, then either a prefix rule ("... OF <name>") or an exact-match
  * lookup of known misspellings and variants. Labels that match nothing are
  * returned as they stand after the substitutions.
  *
  * @param string $value Raw type-status text.
  * @return array array('type_status' => normalised label,
  *               'measurement_remarks' => the whitespace-cleaned original when it
  *               contains a space or a slash, otherwise an empty string).
  */
 private function format_typeStatus($value)
 {
     $value = trim(Functions::remove_whitespace($value));

     // Multi-word or slash-separated labels are preserved verbatim as remarks.
     $has_separator = stripos($value, " ") !== false || stripos($value, "/") !== false;
     $measurement_remarks = $has_separator ? $value : "";

     $value = trim(strtoupper($value));

     // Ordered substitutions; each pass works on the output of the previous one.
     $cleanups = array(
         array(array("[", "]", "!"), ""),
         array(" ?", "?"),
         array("TYPES", "TYPE"),
         array("PROBABLE", "POSSIBLE"),
         array("NEOTYPE COLLECTION", "NEOTYPE"),
         array("TYPE.", "TYPE"),
     );
     foreach ($cleanups as $pass) {
         list($search, $replacement) = $pass;
         $value = str_ireplace($search, $replacement, $value);
     }

     // "<STATUS> OF <taxon name>" collapses to the bare status.
     $prefix_rules = array(
         "TYPE OF " => "TYPE",
         "SYNTYPE OF " => "SYNTYPE",
         "SCHIZOSYNTYPE OF " => "SCHIZOSYNTYPE",
         "SCHIZOPARATYPE OF " => "SCHIZOPARATYPE",
     );
     foreach ($prefix_rules as $prefix => $status) {
         if (strncmp($value, $prefix, strlen($prefix)) === 0) {
             return array("type_status" => $status, "measurement_remarks" => $measurement_remarks);
         }
     }

     // Exact-match variants and misspellings, keyed by the label as it looks after the clean-up passes.
     $aliases = array(
         "TYPE, NO. 15 = LECTOTYPE" => "LECTOTYPE",
         "TYPE, NO.17 = LECTOTYPE" => "LECTOTYPE",
         "LECTOTYPE/TYPE" => "LECTOTYPE",
         "LECTOTYPE, TYPE" => "LECTOTYPE",
         "SYNTYTPE" => "SYNTYPE",
         "SYTNTYPE" => "SYNTYPE",
         "SYNYPES" => "SYNTYPE",
         "SYNTPE" => "SYNTYPE",
         "SYNTYPE MAMILLATA" => "SYNTYPE",
         "PARALECTO" => "PARALECTOTYPE",
         "PARALECTOYPES" => "PARALECTOTYPE",
         "PARALECTOYPE" => "PARALECTOTYPE",
         "SYNTYPE OR HOLOTYPE?" => "SYNTYPE? + HOLOTYPE?",
         "?HOLOTYPE OR SYNTYPE" => "SYNTYPE? + HOLOTYPE?",
         "HOLOTYPE OR SYNTYPE" => "SYNTYPE? + HOLOTYPE?",
         "SYNTYPE OR HOLOTYPE" => "SYNTYPE? + HOLOTYPE?",
         "POSS./PROB. PARALECTOTYPE" => "PARALECTOTYPE?",
         "PARALECTOTYPE (POSSIBLE)" => "PARALECTOTYPE?",
         "POSSIBLE PARALECTOTYPE" => "PARALECTOTYPE?",
         "?PARALECTOTYPE" => "PARALECTOTYPE?",
         "PT OF HOLOTYPE" => "HOLOTYPE FRAGMENT",
         "PART OF HOLOTYPE" => "HOLOTYPE FRAGMENT",
         "HOLOTYPE (PART)" => "HOLOTYPE FRAGMENT",
         "PART OF TYPE" => "TYPE FRAGMENT",
         "PT OF TYPE" => "TYPE FRAGMENT",
         "PART OF TYPE MATERIAL" => "TYPE FRAGMENT",
         "PT OF TYPE MATERIAL" => "TYPE FRAGMENT",
         "TYPE (PART)" => "TYPE FRAGMENT",
         "?PT OF TYPE?" => "UNCONFIRMED TYPE",
         "?PT OF TYPE OF REGULARIS?" => "UNCONFIRMED TYPE",
         "TYPE - HOLOTYPE" => "HOLOTYPE",
         "HOLOTYPE LIMNOTRAGUS SELOUSI" => "HOLOTYPE",
         "COTYPE (HOLOTYPE" => "HOLOTYPE",
         "HOLOTYPE, TYPE" => "HOLOTYPE",
         "SYNYTPE" => "SYNTYPE",
         "FIGURED SYNTYPE" => "SYNTYPE",
         "TOPTYPE" => "TOPOTYPE",
         "TOPOTYPICAL" => "TOPOTYPE",
         "COTYPUS" => "COTYPE",
         "CO-TYPE" => "COTYPE",
         "POSSIBLE COTYPE (FIDE M. R. BROWNING)" => "POSSIBLE COTYPE",
         "SYNTYPE OR PARALECTOTYPE" => "SYNTYPE? + PARALECTOTYPE?",
         "SYNTYPE OR LECTOTYPE" => "SYNTYPE? + LECTOTYPE?",
         "TOPOTYPE (STATED BY THE DONOR TO BE PARATYPE)" => "TOPOTYPE? + PARATYPE?",
         "HOLOTYPE/PARATYPE?" => "HOLOTYPE + PARATYPE?",
         "HOLOTYPE/SYNTYPE" => "HOLOTYPE + SYNTYPE",
         "SYNTYPE/HOLOTYPE" => "HOLOTYPE + SYNTYPE",
         "HOLOTYPE/LECTOTYPE" => "HOLOTYPE + LECTOTYPE",
         "NEOTYPE (POSSIBLE)" => "NEOTYPE?",
         "LECTOTYPE (POSSIBLE)" => "LECTOTYPE?",
         "ALLOTYPE (POSSIBLE)" => "ALLOTYPE?",
         "ORIGINAL MATERIAL." => "ORIGINALMATERIAL",
         "PART OF LECTOTYPE" => "LECTOTYPE FRAGMENT",
         "PART OF PARATYPE" => "PARATYPE FRAGMENT",
         "ISTOTYPE" => "ISOTYPE",
         "PARATYPE (ALLOTYPE)" => "ALLOTYPE",
         "PARATYPE #5" => "PARATYPE",
         "PARATYPE V" => "PARATYPE",
         "PARATYPE I" => "PARATYPE",
         "PARATYPE II" => "PARATYPE",
         "PARATYPE #2" => "PARATYPE",
         "PARATYPE #3" => "PARATYPE",
         "PARATYPE (NO.52)" => "PARATYPE",
         "PARATYPE #1" => "PARATYPE",
         "PARATYPE #9" => "PARATYPE",
         "PARATYPE III" => "PARATYPE",
         "PARATYPE II AND III" => "PARATYPE",
         "PARATYPE III AND IV" => "PARATYPE",
         "PARATYPE #10" => "PARATYPE",
         "PARATYPE #7" => "PARATYPE",
         "PARATYPE #4" => "PARATYPE",
         "PARATYPE #6" => "PARATYPE",
         "PARATYPE (NO.65)" => "PARATYPE",
         "PARATYPE #8" => "PARATYPE",
         "PARAYPE" => "PARATYPE",
         "PARATYPE)" => "PARATYPE",
     );
     if (isset($aliases[$value])) {
         $value = $aliases[$value];
     }

     return array("type_status" => $value, "measurement_remarks" => $measurement_remarks);
 }
 /**
  * Collects reference strings from a species page.
  *
  * When a "references:" ... "<hr" section exists, only that section is examined.
  * Entries under "scientific articles:" (the text of each <p>...</p>) are
  * gathered first and appended to whichever of the following strategies yields
  * a result, tried in this order:
  *   1. URL-decoded fragments after "html#" in hyperlinks;
  *   2. the first line of every "<br>&nbsp;"-separated chunk, de-duplicated;
  *   3. the cleaned text of every "<p>"-separated chunk, de-duplicated together
  *      with the scientific articles.
  *
  * @param string $html Page HTML.
  * @return array List of reference strings; empty when nothing was found.
  */
 private function assign_reference($html)
 {
     $section_labels = array("Dichotomous Keys:", "General References:", "Scientific Articles:", "Web sites:");

     if (preg_match("/references:(.*?)<hr/ims", $html, $found)) {
         $html = $found[1];
     }

     // process scientific articles
     $scientific_articles = array();
     // The appended "xxx" sentinel lets the lazy group run to the end of the input.
     if (preg_match("/scientific articles:(.*?)xxx/ims", $html . "xxx", $found)) {
         $block = str_replace(array("\n", "&nbsp;"), " ", $found[1]);
         $block = Functions::remove_whitespace(str_ireplace($section_labels, "", $block));
         if (preg_match_all("/<p>(.*?)<\\/p>/ims", $block, $paragraphs)) {
             $entries = array();
             foreach ($paragraphs[1] as $paragraph) {
                 $entry = trim(strip_tags($paragraph));
                 if ($entry) {
                     $entries[] = $entry;
                 }
             }
             if ($entries) {
                 $scientific_articles = $entries;
             }
         }
     }

     //1st option: the one with #, e.g. Cancer_gracilis.html
     if (preg_match_all("/html\\#(.*?)\"/ims", $html, $found)) {
         $linked = array_map('urldecode', $found[1]);
         if ($linked) {
             return array_merge($linked, $scientific_articles);
         }
     }

     //2nd option: the one without hyperlinks, e.g. Paranemertes_peregrina.html
     // Array keys are used to drop duplicates while keeping first-seen order.
     $first_lines = array();
     foreach (explode("<br>&nbsp;", $html) as $chunk) {
         $lines = explode("\n", strip_tags($chunk));
         $first_lines[trim($lines[0])] = '';
     }
     $unlinked = array_filter(array_keys($first_lines));
     if ($unlinked) {
         return array_merge($unlinked, $scientific_articles);
     }

     //3rd option: the one with actual reference body, e.g. Neognathophausia_ingens.html
     $bodies = array();
     foreach (explode("<p>", $html) as $chunk) {
         $text = self::clean_string(strip_tags($chunk));
         $text = trim(str_ireplace($section_labels, "", $text));
         if ($text) {
             $bodies[] = $text;
         }
     }
     if (!$bodies) {
         return array();
     }
     $combined = array_filter(array_merge($bodies, $scientific_articles));
     return array_values(array_unique($combined));
 }
 /**
  * Walks every LifeDesk taxon page and harvests its text sections.
  *
  * For each page returned by get_nodes_or_pages("taxa", ...):
  *  - every <h3 class="taxonpage"> section is stored as heading => body under
  *    the scientific name found in <h1 class="taxonpage">;
  *  - when $params['scratchpad_biblio'] is set, every bibliography title found
  *    in the page is recorded in $this->biblio_taxa[title][] (DATA-1552);
  *  - when $params['scratchpad_taxonomy'] is set, the ids found after
  *    "biblio/view/" are accumulated in $this->taxonomy_biblio[sciname] (DATA-1554).
  * The collected sections are finally passed to save_taxon_articles_to_text().
  *
  * @param array $params Reads 'name' (LifeDesk sub-domain) and the optional
  *                      'scratchpad_biblio' / 'scratchpad_taxonomy' entries.
  * @return void
  */
 private function get_taxon_descriptions_from_LD_taxon_pages($params)
 {
     $with_biblio = !empty($params['scratchpad_biblio']);
     $with_taxonomy = !empty($params['scratchpad_taxonomy']);

     // for biblio spreadsheet
     $biblios = self::get_row_from_spreadsheet(isset($params['scratchpad_biblio']) ? $params['scratchpad_biblio'] : null, "Title");
     $headers = $this->lifedesk_fields["text"];

     $options = $this->download_options;
     // lookup to LifeDesk page should not expire unless requested to have a fresh export to scratchpad
     $options['expire_seconds'] = $this->LD_nodes_pages_expire_seconds;
     $options['download_wait_time'] = 2000000;

     // True when $needle occurs in $haystack, comparing both with and without HTML tags.
     // Empty needles never match: on PHP 8 stripos() returns 0 for an empty needle,
     // which would otherwise attach a blank title to every page.
     $mentions = function ($haystack, $needle) {
         $haystacks = array($haystack, strip_tags($haystack));
         $needles = array((string) $needle, strip_tags((string) $needle));
         foreach ($needles as $candidate) {
             if ($candidate === '') {
                 continue;
             }
             foreach ($haystacks as $text) {
                 if (stripos($text, $candidate) !== false) {
                     return true;
                 }
             }
         }
         return false;
     };

     $records = array();
     //start accessing individual taxon page in LD
     $pages = self::get_nodes_or_pages("taxa", $params, $options);
     if ($pages) {
         $total = count($pages);
         $i = 0;
         foreach ($pages as $page) {
             $i++;
             echo "\n{$i} of {$total} --- page: [{$page}]";
             $html = Functions::lookup_with_cache("http://" . $params["name"] . ".lifedesks.org/pages/{$page}", $options);
             if (!$html) {
                 continue;
             }

             // Sections look like: <h3 class="taxonpage">Distribution</h3> ... <div class="sub-chapter">
             // The children block gets its own sub-chapter wrapper so the last section is terminated too.
             $rec = array();
             $html = str_ireplace('<div class="taxonpage-children">', '<div class="sub-chapter"><div class="taxonpage-children">', $html);
             if (preg_match_all("/<h3 class=\"taxonpage\">(.*?)<div class=\"sub-chapter\">/ims", $html, $arr)) {
                 foreach ($arr[1] as $section) {
                     $str = strip_tags($section, "<p><em><h3>");
                     $str = str_ireplace(array('Comment (0)', "\n"), '', $str);
                     $str = Functions::remove_whitespace($str);
                     $pieces = array_map('trim', explode("</h3>", $str));
                     // A section without a closing </h3> has no body; keep the heading with a null body
                     // instead of reading a missing array offset.
                     $rec[$pieces[0]] = isset($pieces[1]) ? $pieces[1] : null;
                 }
             }

             $sciname = false;
             if (preg_match("/<h1 class=\"taxonpage\">(.*?)<\\/h1>/ims", $html, $arr)) {
                 $sciname = trim(strip_tags($arr[1]));
                 $records[$sciname]["articles"] = $rec; // assigning objects to sciname
                 $records[$sciname]["page"] = $page;    // assigning page to sciname
             }

             // DATA-1552
             if ($with_biblio && $sciname) {
                 foreach ($biblios as $biblio) {
                     if ($mentions($html, $biblio)) {
                         $this->biblio_taxa[$biblio][] = $sciname;
                         echo "\nwith biblio taxa 1\n";
                         continue;
                     }
                     // Second attempt with newlines removed. The stripped HTML is kept for the
                     // remaining titles and for the DATA-1554 scan below, as before.
                     $html = str_ireplace(array("\n"), "", $html);
                     if ($mentions($html, $biblio)) {
                         $this->biblio_taxa[$biblio][] = $sciname;
                         echo "\nwith biblio taxa 2\n";
                     }
                 }
             }

             // DATA-1554
             if ($with_taxonomy && $sciname) {
                 if (preg_match_all("/biblio\\/view\\/(.*?)\"/ims", $html, $arr)) {
                     if (!empty($this->taxonomy_biblio[$sciname])) {
                         $this->taxonomy_biblio[$sciname] = array_merge($this->taxonomy_biblio[$sciname], $arr[1]);
                     } else {
                         $this->taxonomy_biblio[$sciname] = $arr[1];
                     }
                 }
             }
         }
     }

     if ($records) {
         self::save_taxon_articles_to_text($records, $headers);
     }
 }
예제 #7
0
 /**
  * Splits an anchor fragment into its link target and its label.
  *
  * Example input: &nbsp;<a href="taxa/scaritinae">Scaritinae Bonelli, 1810
  *
  * @param string $str Fragment containing an opening <a> tag and the text after it.
  * @return array May contain 'href' (the attribute value) and 'name' (the text
  *               after the first ">", passed through Functions::remove_whitespace());
  *               a key is absent when its pattern does not match.
  */
 private function parse_href_string($str)
 {
     $parsed = array();

     if (preg_match('/href="(.*?)"/ims', $str, $href_match)) {
         $parsed['href'] = $href_match[1];
     }

     // The appended "xxx" sentinel lets the lazy group run to the end of the input.
     $with_sentinel = $str . 'xxx';
     if (preg_match('/>(.*?)xxx/ims', $with_sentinel, $name_match)) {
         $parsed['name'] = Functions::remove_whitespace($name_match[1]);
     }

     return $parsed;
 }
예제 #8
0
 /**
  * Builds the taxon, image and text objects for every species page.
  *
  * For each taxon the method writes a Taxon row (once per taxon id), downloads
  * the species page, isolates the article body, and then
  *  - collects the paragraphs that carry no image as the text description;
  *  - collects the images with their captions and remembers which figure number
  *    belongs to which image URL;
  *  - hands images and the combined text to create_image_text_objects().
  *
  * @param array $taxa List of records with 'sciname', 'family' and 'href'.
  * @return void Halts the script (exit) when a record has no family.
  */
 private function prepare_texts($taxa)
 {
     // Tried in this order; the first pattern that matches supplies the figure number.
     $figure_patterns = array("/Figura (.*?)\\./ims", "/Fig.(.*?)\\./ims", "/Fig(.*?)\\./ims");

     $k = 0;
     foreach ($taxa as $taxon) {
         $k++;
         if ($k % 10 == 0) {
             echo "\n {$k} - ";
         }
         $taxon = array_map('trim', $taxon);

         // Checked before the field is used so a missing family is reported, not read.
         if (empty($taxon['family'])) {
             print_r($taxon);
             exit;
         }

         //create taxon
         $taxon['sciname'] = self::clean_name($taxon['sciname']);
         $taxon['taxon_id'] = strtolower(str_replace(" ", "_", $taxon['sciname']));
         $page_url = $this->acg_domain . $taxon['href'];

         $t = new \eol_schema\Taxon();
         $t->taxonID = $taxon['taxon_id'];
         $t->scientificName = $taxon['sciname'];
         $t->family = $taxon['family'];
         $t->furtherInformationURL = $page_url;
         if (!isset($this->taxa_ids[$t->taxonID])) {
             $this->taxa_ids[$t->taxonID] = '';
             $this->archive_builder->write_object_to_file($t);
         }

         // start prepare objects - image, text
         $html = Functions::lookup_with_cache($page_url, $this->download_options);
         if (!$html) {
             echo "\ninvestigate: page not found \n";
             continue;
         }

         // The first token is the UTF-8 non-breaking space, written as an escape so it cannot be
         // mistaken for (or saved as) an ordinary space, which would strip every space from the
         // page and stop the article pattern below from matching.
         // NOTE(review): assumes the fetched pages are UTF-8 — confirm.
         $html = str_ireplace(array("\xC2\xA0", "&nbsp;", ' colspan="2"', ' rowspan="1"', ' style="margin-bottom: 0in;"', ' lang="es-CR"', ' style="text-align: center;"', ' style="line-height: 1.3em;"'), "", $html);
         $html = str_ireplace("Fig..", "Fig.", $html);

         if (!preg_match("/<h1 class=\"titulo-articulo nombreCientifico\">(.*?)<a class=\"top\" href=\"#arriba\">/ims", $html, $match)) {
             echo "\ninvestigate: no articulo sphomep \n";
             continue;
         }

         $str = strip_tags($match[1], "<p><td><tr><table><i><img>");
         // Inline styling seen on individual pages; removed in this order.
         $str = str_ireplace(array(
             ' min-height: 14px;',
             ' style="margin: 0px; line-height: normal; font-family: Helvetica;"',
             ' class="p1"',
             ' style="text-align: justify;"', //e.g. Aellopos ceculus (Sphingidae)
             ' style="font: normal normal normal 12px/normal Helvetica; margin: 0px;"', //e.g. Calydna sturnula (Riodinidae)
             " 'Helvetica Neue'; margin: 0px;",
             ' style="font: normal normal normal 12px/normal"',
             ' style="font: normal normal normal 18px/normal"',
         ), '', $str);

         // Reset for every taxon: a page without paragraphs must not inherit the previous page's text.
         $total_txt = array();
         if (preg_match_all("/<p>(.*?)<\\/p>/ims", $str, $match2)) {
             $total_txt = $match2[1];
             // Paragraphs that hold an image are not part of the text description.
             foreach ($total_txt as $idx => $paragraph) {
                 if (stripos($paragraph, "src=") !== false) {
                     $total_txt[$idx] = "";
                 }
             }
             if (count($total_txt) < 4) {
                 echo "\ninvestigate txt: {$this->acg_domain}" . "{$taxon['href']} \n";
             }
             $total_txt = array_values(array_filter(array_map('trim', $total_txt)));
             //remove <p> in each array value
             foreach ($total_txt as $idx => $paragraph) {
                 $total_txt[$idx] = str_ireplace("<p>", "", $paragraph);
             }
         }

         $total_img = array();
         if (preg_match_all("/<td><img(.*?)<\\/td>/ims", $str, $match2)) {
             $total_img = $match2[1];
         }
         if (preg_match_all("/<p><img(.*?)<\\/p>/ims", $str, $match2)) {
             $total_img = array_merge($total_img, $match2[1]);
         }
         if (count($total_img) < 4) {
             echo "\ninvestigate img: {$this->acg_domain}" . "{$taxon['href']} \n";
         }

         $caption_src = array();
         $final_images = array();
         foreach ($total_img as $img) {
             $src = false;
             if (preg_match("/src=\"(.*?)\"/ims", $img, $match2)) {
                 $src = $this->acg_domain . trim(str_ireplace("miniaturas/peq_", "", $match2[1]));
             }
             // Reset for every image: an image without a caption must not reuse the previous one.
             $caption = '';
             if (preg_match("/\\/>(.*?)\\(Click en la imágen para expandir\\)/ims", $img, $match2)) {
                 $caption = Functions::remove_whitespace($match2[1]);
             }
             // manual adjustment
             $caption = self::adjust_caption($caption);

             // Skip images without a source and inline data-URI images.
             if (!$src || stripos($src, "http://www.acguanacaste.ac.crdata:image/png") !== false) {
                 continue;
             }

             $final_images[] = array("src" => $src, "caption" => $caption);
             // assign Figure # with src
             foreach ($figure_patterns as $pattern) {
                 if (preg_match($pattern, $caption, $match2)) {
                     $caption_src[self::get_number_only($match2[1])] = $src;
                     break;
                 }
             }
         }
         if ($final_images) {
             self::create_image_text_objects($final_images, $taxon, "image");
         }

         // start adding image links at the end of text objects
         foreach ($total_txt as $idx => $txt) {
             $total_txt[$idx] = self::add_image_links($txt, $caption_src);
         }
         if ($total_txt) {
             $total_txt = array_map('trim', $total_txt);
             $desc = Functions::remove_whitespace(implode("<p>", $total_txt));
             self::create_image_text_objects(array(0 => $desc), $taxon, "text");
         }
     }
 }
예제 #9
0
 /**
  * Removes line breaks and tabs from a string and normalises its whitespace.
  *
  * @param string $string Raw text.
  * @param mixed  $type   When it compares equal to "name", HTML tags are removed as well.
  * @return string Trimmed result of Functions::remove_whitespace().
  */
 private function clean_string($string, $type = false)
 {
     // NOTE(review): the empty entry looks like a control character that was lost
     // somewhere along the way (possibly "\0") — confirm against the original file.
     $unwanted = array("\r\n", "\n", "\r", "\t", "", "\v", "\t");
     $cleaned = str_ireplace($unwanted, '', $string);

     $is_name = ($type == "name");
     if ($is_name) {
         $cleaned = strip_tags($cleaned);
     }

     $cleaned = Functions::remove_whitespace($cleaned);
     return trim($cleaned);
 }
예제 #10
0
 /**
  * Normalises whitespace in a wiki fragment and removes newlines and tabs.
  *
  * @param string $substr Fragment of wiki text.
  * @return string Output of Functions::remove_whitespace() with every "\n" and "\t" deleted.
  */
 private function format_wiki_substr($substr)
 {
     $collapsed = Functions::remove_whitespace($substr);
     $line_noise = array("\n", "\t");
     return str_replace($line_noise, "", $collapsed);
 }
예제 #11
0
 /**
  * Downloads each species page and creates length data for species-level taxa.
  *
  * A page is processed only when it contains a ">Family: " marker and a title
  * in <h1>, <FONT FACE="Arial"> or <h2>. Titles containing one of the exclusion
  * markers, and names that reduce to a single word after
  * Functions::canonical_form(), are skipped. URLs that cannot be downloaded are
  * appended to the offline dump file via save_to_dump().
  *
  * @param array $urls Page addresses to process.
  * @return void
  */
 private function process_urls($urls)
 {
     $to_exclude = array("cf.", "sp.", "Unidentified Stonefly", "Family");
     $noise_attributes = array(' align="center"', ' class="style1"', ' class="style2"');

     $i = 0;
     $total = count($urls);
     foreach ($urls as $url) {
         $i++;
         echo "\n - {$i} of {$total} [{$url}]\n";
         if (isset($this->stored_offline_urls[$url])) {
             continue;
         }

         $html = Functions::lookup_with_cache($url, $this->download_options);
         if (!$html) {
             self::save_to_dump($url, $this->current_offline_urls_dump_file);
             continue;
         }
         $html = trim(str_ireplace($noise_attributes, "", $html));

         // Pages without a family line are not species pages.
         if (!preg_match("/>Family: (.*?)xxx/ims", $html . "xxx")) {
             continue;
         }

         $has_title = preg_match("/<h1>(.*?)<\\/h1>/ims", $html, $arr)
             || preg_match("/<FONT FACE=\"Arial\">(.*?)<\\/FONT>/ims", $html, $arr)
             || preg_match("/<h2>(.*?)<\\/h2>/ims", $html, $arr);
         if (!$has_title) {
             echo "\n investigate: no sciname [{$url}]";
             continue;
         }

         $sciname = Functions::remove_whitespace(strip_tags($arr[1]));
         $sciname = trim(str_replace(array(chr(13), chr(10)), " ", $sciname));

         //manual adjustments
         // NOTE(review): both adjusted names are still dropped further down — "Allocapnia sp."
         // by the "sp." exclusion and "Oligotomidae" by the species-level check (assuming
         // canonical_form() keeps it a single word). Confirm whether that is intended.
         if ($sciname == "Small Winter Stoneflies") {
             $sciname = "Allocapnia sp.";
         }
         if ($sciname == "Embioptera: Family Oligotomidae") {
             $sciname = "Oligotomidae";
         }

         $include = true;
         foreach ($to_exclude as $exclude) {
             if (stripos($sciname, $exclude) !== false) {
                 $include = false;
                 break;
             }
         }
         if (!$include) {
             continue;
         }

         $sciname = Functions::canonical_form($sciname);
         // only species-level
         if (stripos($sciname, " ") === false) {
             continue;
         }

         // $images = self::parse_images($html, $url); working... temporarily commented
         $images = array();
         $info = self::parse_texts($html, $url);
         $lengths = isset($info["lengths"]) ? $info["lengths"] : null;
         $wingspan = isset($info["wingspan"]) ? $info["wingspan"] : null;
         if (!$images && !$lengths) {
             continue;
         }

         $r = array();
         $r["sciname"] = $sciname;
         $r["taxon_id"] = str_replace(" ", "_", $r["sciname"]);
         $r["source"] = $url;
         // Called before the media fields are attached, so it receives only the three keys above.
         self::create_instances_from_taxon_object($r);
         $r["images"] = $images;
         $r["lengths"] = $lengths;
         $r["wingspan"] = $wingspan;
         if ($lengths) {
             self::prepare_length_structured_data($r);
         }
         if ($images) {
             self::prepare_image_objects($r);
         }
     }
     // print_r($this->debug);
     print "\n count:" . count($this->debug) . "\n";
 }
예제 #12
0
 /**
  * Creates a data object for every distribution map listed in a page section.
  *
  * Every "<li>" item that follows $str is inspected; only items mentioning
  * "MapGen" are treated as distribution maps. An item without an image source
  * is reported on standard output instead of being saved.
  *
  * @param string $str   Pattern fragment that precedes each list item; it is
  *                      inserted into the regular expression as-is.
  * @param string $html  Page HTML.
  * @param mixed  $t     Taxon record; its "ID" and "Rank" entries are read.
  * @param mixed  $group Value stored under the record's "group" key.
  * @return void
  */
 private function parse_distribution_map($str, $html, $t, $group)
 {
     if (!preg_match_all("/{$str}(.*?)<\\/li>/ims", $html, $items)) {
         return;
     }

     foreach ($items[1] as $line) {
         // only distribution maps will be processed
         if (strpos($line, "MapGen") === false) {
             continue;
         }

         $rec = array("taxon_id" => (string) $t["ID"], "media_url" => "");

         if (preg_match("/href=\"(.*?)\"/ims", $line, $hit)) {
             $rec["source_url"] = $hit[1];
         }
         if (preg_match("/src=\"(.*?)\"/ims", $line, $hit)) {
             // Request the large rendering instead of the thumbnail.
             $rec["media_url"] = str_ireplace("Width=120", "Width=960", $hit[1]);
         }
         // The appended "xxx" sentinel lets the lazy group run to the end of the item.
         if (preg_match("/<\\/a>(.*?)xxx/ims", $line . "xxx", $hit)) {
             $description = "Note: Distribution maps are often incomplete due to the workload of entering data.<br>" . trim($hit[1]) . "<br>" . self::map_legend();
             $description = trim(str_ireplace(array("\t", "\n", "\r"), "", $description));
             $rec["description"] = Functions::remove_whitespace($description);
         }

         $rec["media_id"] = "map_" . $rec["taxon_id"];
         $rec["title"] = "Distribution for " . $t["Rank"] . " " . self::get_scientific_name($t);
         $rec = array_map('trim', $rec);

         $rec["group"] = $group;
         $rec["subtype"] = "Map";
         $rec["citation"] = self::parse_citation($html);
         //distribution maps are BY-SA by default, until discovered otherwise
         $rec["CreativeCommons"] = "BY-SA";

         if ($rec["media_url"]) {
             self::create_data_object($rec);
             continue;
         }

         echo "\n investigate no map";
         echo "\n {$str}";
         echo "\n {$html}";
         echo "\n [" . $rec["taxon_id"] . "] [{$group}]\n";
     }
 }