/**
 * Fetches an index page, extracts every species link on it and harvests the
 * map images reachable from each link.
 *
 * Links are recognised by the pattern <i><A href="..."> (for example
 * <i><A href="362a.htm">Gryllotalpa cultriger</a></i>). In each href "a.htm"
 * is replaced by "m.htm" and the result is appended to $this->sina_domain.
 * URLs registered in $this->additional_maps under the numeric id of the link
 * are processed as well. Progress and problems are echoed; nothing is returned.
 *
 * @param string $html_path location handed to Functions::get_remote_file()
 */
private function process_html($html_path)
{
    $download_options = array('timeout' => 999999, 'download_attempts' => 2, 'delay_in_minutes' => 2);
    $html = Functions::get_remote_file($html_path, $download_options);
    if (!$html) {
        echo "\n investigate 02 [{$html_path}]";
        return;
    }
    if (!preg_match_all("/<i><A href=\"(.*?)\"/ims", $html, $matches)) {
        echo "\n investigate 01 [{$html_path}]";
        return;
    }

    foreach ($matches[1] as $href) {
        $map_page = str_ireplace("a.htm", "m.htm", $href);
        // intval() keeps only the leading digits, e.g. "071m.htm" => 71
        $id = intval($map_page);
        echo "\n id: [{$id}]";

        // the regular map page first, then any additional maps known for this id
        $map_urls = array($this->sina_domain . $map_page);
        if (isset($this->additional_maps[$id])) {
            foreach ($this->additional_maps[$id] as $extra_url) {
                $map_urls[] = $extra_url;
            }
        }

        foreach ($map_urls as $map_url) {
            $rec = self::get_map_data($map_url);
            if (!$rec) {
                continue;
            }

            // the taxon id is taken from the numeric file name of the map
            $map_parts = pathinfo(@$rec["map"]);
            $rec["taxon_id"] = intval($map_parts["filename"]);
            if (!$rec["taxon_id"]) {
                echo "\n investigate blank taxon_id [{$map_url}]\n";
                continue;
            }

            $rec["source_url"] = $this->sina_domain . Functions::format_number_with_leading_zeros($rec["taxon_id"], 3) . "a.htm";
            $this->create_instances_from_taxon_object($rec, array());

            $rec["caption"] = "Version of manually-generated dot map displayed above, showing U.S. and Canadian records, was harvested from SINA on " . date("M-d-Y") . ".<br><br>" . @$rec["caption"];

            if (@$rec["map"]) {
                $ref_ids = array();
                $agent_ids = array();
                self::get_images($rec["sciname"], @$rec["caption"], $rec["taxon_id"], $map_parts["filename"], $rec["map"], $rec["source_url"], $ref_ids, $agent_ids);
            }

            if (@$rec["computer_gen_map"]) {
                $generated_parts = pathinfo($rec["computer_gen_map"]);
                $ref_ids = array();
                $agent_ids = array();
                $caption = $rec["as_of"];
                if ($rec["link_back"]) {
                    // NOTE: the literal below contains an embedded line break, kept exactly as in the source
                    $caption .= "<br><br>" . 'See also this <a href="' . $rec["link_back"] . '">manually generated dot map</a> showing U.S. 
and Canadian records, with shaded area showing likely general distribution.';
                }
                self::get_images($rec["sciname"], $caption, $rec["taxon_id"], $generated_parts["filename"], $rec["computer_gen_map"], $rec["source_url"], $ref_ids, $agent_ids);
            }
        }
    }
}
/**
 * Appends the numbered add-on dump file(s) to the main MycoBank dump.
 *
 * A backup copy of mycobank_dump.txt is created first. If the copy fails the
 * function stops, so the main dump is never modified without a backup.
 * The loop bounds are fixed to a single add-on file (number 1).
 * Stops silently when an add-on file or the main dump cannot be opened
 * (Functions::file_open() returned a falsy value).
 */
function utility_append_text_loop()
{
    $dump_dir = DOC_ROOT . "/public/tmp/mycobank/";
    $main_dump = $dump_dir . "mycobank_dump.txt";

    echo "\n backing up first...";
    if (!copy($main_dump, $dump_dir . "mycobank_dump_backup.txt")) {
        // do not touch the dump when there is no backup to fall back on
        echo "\n backup failed, nothing appended. \n";
        return;
    }
    echo "\n backup done. \n";

    for ($x = 1; $x <= 1; $x++) {
        $str = Functions::format_number_with_leading_zeros($x, "2");
        $filename = $dump_dir . "mycobank_dump_add" . $str . ".txt";
        if (!($READ = Functions::file_open($filename, "r"))) {
            return;
        }
        // fread() rejects a length of 0, so an empty add-on file is read as ""
        $size = filesize($filename);
        $contents = ($size > 0) ? fread($READ, $size) : "";
        fclose($READ);

        echo "\n copying... {$filename}";
        echo "\n to... {$main_dump}\n";
        if (!($WRITE = Functions::file_open($main_dump, "a"))) {
            return;
        }
        fwrite($WRITE, (string) $contents);
        fclose($WRITE);
    }
}
/**
 * Downloads the record list of every phylum, stores the records in batch
 * files of 8000 entries and finally writes the batch names to the work list.
 *
 * Batch files are saved through self::save_to_json_file() as
 * $this->TEMP_FILE_PATH . "sl_batch_NNN.txt"; the work list
 * ($this->WORK_LIST) gets one "sl_batch_NNN" line per batch.
 * A phylum whose response cannot be parsed as XML is reported through
 * self::log_cannot_access_phylum(). The function pauses 10 seconds after
 * every phylum.
 *
 * @param array $arr_phylum phylum entries; the 'name' and 'id' keys are read
 */
private function count_taxa_per_phylum($arr_phylum)
{
    $total_phylum = sizeof($arr_phylum);
    $position = 0;
    $pending = array();
    $batches_written = 0;

    foreach ($arr_phylum as $phylum) {
        $position++;
        $phylum_path = PHYLUM_SERVICE_URL . $phylum['name'];
        echo "\n\nphylum service: " . $phylum_path . "\n";

        $response = Functions::lookup_with_cache($phylum_path, $this->download_options);
        $xml = simplexml_load_string($response);
        if ($xml) {
            echo "\n [{$position} of {$total_phylum}] {$phylum['name']} {$phylum['id']} -- [" . sizeof($xml->record) . "]";
            foreach ($xml->record as $rec) {
                $pending[] = $rec;
                if (sizeof($pending) < 8000) {
                    continue;
                }
                // batch is full: flush it and start a new one
                $batches_written++;
                self::save_to_json_file($pending, $this->TEMP_FILE_PATH . "sl_batch_" . Functions::format_number_with_leading_zeros($batches_written, 3) . ".txt");
                $pending = array();
            }
        } else {
            echo "\n\n Cannot access: " . $phylum_path;
            self::log_cannot_access_phylum($phylum_path);
        }
        sleep(10);
    }

    // records left over after the last full batch
    if ($pending) {
        $batches_written++;
        self::save_to_json_file($pending, $this->TEMP_FILE_PATH . "sl_batch_" . Functions::format_number_with_leading_zeros($batches_written, 3) . ".txt");
    }

    // work list: one batch name per line
    $work_list_lines = array();
    for ($batch = 1; $batch <= $batches_written; $batch++) {
        $work_list_lines[] = "sl_batch_" . Functions::format_number_with_leading_zeros($batch, 3) . "\n";
    }
    $fp = fopen($this->WORK_LIST, "w");
    if ($fp) {
        fwrite($fp, implode("", $work_list_lines));
        fclose($fp);
    }
}
/**
 * Downloads the species list and splits it into batch files of $divisor
 * lines each, then writes the list of batch names to work_list.txt.
 *
 * Batch files are written to $this->TEMP_FILE_PATH as batch_NNN.txt; empty
 * lines of the input are skipped. Every kept line is echoed with its position
 * inside the current batch. The function returns early (without a work list)
 * when the download fails or a batch file cannot be opened.
 *
 * @param int $divisor number of lines per batch file
 */
private function divide_text_file($divisor)
{
    $temp_filepath = Functions::save_remote_file_to_local(self::DL_MAP_SPECIES_LIST, array('timeout' => 4800, 'download_attempts' => 5));
    if (!$temp_filepath) {
        echo "\n\nExternal file not available. Program will terminate.\n";
        return;
    }

    $lines_in_batch = 0;
    $batch_no = 0;
    $buffer = "";
    print "\n";
    foreach (new FileIterator($temp_filepath, true) as $line_no => $line) {
        if (!$line) {
            continue;
        }
        // FileIterator hands out lines without the line terminator, so it is put back here
        $lines_in_batch++;
        $buffer .= $line . "\n";
        print "{$lines_in_batch}. {$line}\n\n";

        if ($lines_in_batch == $divisor) {
            print "\n";
            $batch_no++;
            $batch_path = $this->TEMP_FILE_PATH . "batch_" . Functions::format_number_with_leading_zeros($batch_no, 3) . ".txt";
            $OUT = Functions::file_open($batch_path, "w");
            if (!$OUT) {
                return;
            }
            fwrite($OUT, $buffer);
            fclose($OUT);
            $buffer = "";
            $lines_in_batch = 0;
        }
    }

    // lines left over after the last full batch
    if ($buffer) {
        $batch_no++;
        $batch_path = $this->TEMP_FILE_PATH . "batch_" . Functions::format_number_with_leading_zeros($batch_no, 3) . ".txt";
        $OUT = Functions::file_open($batch_path, "w");
        if (!$OUT) {
            return;
        }
        fwrite($OUT, $buffer);
        fclose($OUT);
    }

    // work list: one batch name per line
    $work_list = "";
    for ($n = 1; $n <= $batch_no; $n++) {
        $work_list .= "batch_" . Functions::format_number_with_leading_zeros($n, 3) . "\n";
    }
    $fp = Functions::file_open($this->TEMP_FILE_PATH . "work_list.txt", "w");
    if ($fp) {
        fwrite($fp, $work_list);
        fclose($fp);
    }
}
/**
 * Builds a list of date ranges, each converted by self::get_timestamp_range().
 *
 * Without $month: one range per calendar month (first day to last day of the
 * month) from January of $start_year up to the current month of the current
 * year.
 * With $month: one range per day of that month in $start_year (day N to day
 * N+1), the last one running from the last day of the month to the first day
 * of the following month.
 *
 * Only real calendar dates are generated. Dates such as "02-30" or "04-31"
 * would depend on how the consumer normalises impossible dates and lead to
 * overlapping, empty or inverted ranges for months shorter than 31 days.
 *
 * @param int|string      $start_year four-digit year
 * @param int|string|null $month      month number (1-12), or a falsy value for the per-month mode
 * @return array values returned by self::get_timestamp_range(), in chronological order
 */
private function get_date_ranges($start_year, $month = NULL)
{
    $range = array();

    if (!$month) {
        $current_year = date("Y");
        for ($year = $start_year; $year <= $current_year; $year++) {
            // the current year is only covered up to the current month
            $month_limit = ($year == $current_year) ? date("n") : 12;
            for ($m = 1; $m <= $month_limit; $m++) {
                $month_str = Functions::format_number_with_leading_zeros($m, 2);
                // "t" = number of days in the given month (28-31)
                $last_day = date("t", mktime(0, 0, 0, $m, 1, intval($year)));
                $start_date = $year . "-" . $month_str . "-01";
                $end_date = $year . "-" . $month_str . "-" . $last_day;
                $range[] = self::get_timestamp_range($start_date, $end_date);
            }
        }
        return $range;
    }

    $days_in_month = intval(date("t", mktime(0, 0, 0, intval($month), 1, intval($start_year))));
    $month = Functions::format_number_with_leading_zeros($month, 2);

    // day N -> day N+1, for every day except the last one of the month
    for ($day = 1; $day < $days_in_month; $day++) {
        $start_date = $start_year . "-" . $month . "-" . Functions::format_number_with_leading_zeros($day, 2);
        $end_date = $start_year . "-" . $month . "-" . Functions::format_number_with_leading_zeros($day + 1, 2);
        $range[] = self::get_timestamp_range($start_date, $end_date);
    }

    // last day of the month -> first day of the following month
    if ($month == "12") {
        $next_year = $start_year + 1;
        $next_month = "01";
    } else {
        $next_year = $start_year;
        $next_month = Functions::format_number_with_leading_zeros(intval($month) + 1, 2);
    }
    $start_date = $start_year . "-" . $month . "-" . $days_in_month;
    $end_date = $next_year . "-" . $next_month . "-01";
    $range[] = self::get_timestamp_range($start_date, $end_date);

    return $range;
}
/**
 * Splits TROPICOS_NAME_EXPORT_FILE into batch files of $divisor lines each
 * and writes the list of batch names to $this->WORK_LIST.
 *
 * Batch files are written to $this->TEMP_FILE_PATH as batch_NN.txt; empty
 * lines of the input are skipped. Every kept line is echoed with its position
 * inside the current batch. When a batch file cannot be opened the problem is
 * passed to debug() and the function returns without writing the work list.
 *
 * @param int $divisor number of lines per batch file
 */
function divide_text_file($divisor)
{
    $lines_in_batch = 0;
    $batch_no = 0;
    $buffer = "";

    foreach (new FileIterator(TROPICOS_NAME_EXPORT_FILE) as $line_no => $line) {
        if (!$line) {
            continue;
        }
        // FileIterator hands out lines without the line terminator, so it is put back here
        $line .= "\n";
        $lines_in_batch++;
        $buffer .= $line;
        echo "\n {$lines_in_batch}. {$line}";

        if ($lines_in_batch == $divisor) {
            $batch_no++;
            $batch_no_str = Functions::format_number_with_leading_zeros($batch_no, 2);
            $OUT = fopen($this->TEMP_FILE_PATH . "batch_" . $batch_no_str . ".txt", "w");
            if (!$OUT) {
                debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $this->TEMP_FILE_PATH . "batch_" . $batch_no_str . ".txt");
                return;
            }
            fwrite($OUT, $buffer);
            fclose($OUT);
            $buffer = "";
            $lines_in_batch = 0;
        }
    }

    // lines left over after the last full batch
    if ($buffer) {
        $batch_no++;
        $batch_no_str = Functions::format_number_with_leading_zeros($batch_no, 2);
        $OUT = fopen($this->TEMP_FILE_PATH . "batch_" . $batch_no_str . ".txt", "w");
        if (!$OUT) {
            debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $this->TEMP_FILE_PATH . "batch_" . $batch_no_str . ".txt");
            return;
        }
        fwrite($OUT, $buffer);
        fclose($OUT);
    }

    // work list: one batch name per line
    $work_list = "";
    for ($n = 1; $n <= $batch_no; $n++) {
        $work_list .= "batch_" . Functions::format_number_with_leading_zeros($n, 2) . "\n";
    }
    $OUT = fopen($this->WORK_LIST, "w+");
    if ($OUT) {
        fwrite($OUT, $work_list);
        fclose($OUT);
    }
}
/**
 * Concatenates the numbered partial XML files into one resource file.
 *
 * The output CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml" receives an
 * XML declaration and an opening <response> element, then the contents of
 * $this->TEMP_FILE_PATH . "temp_worms_batch_NNN.xml" for NNN = 001, 002, ...
 * until the first missing number, and finally the closing </response> tag.
 *
 * If the output cannot be opened, or a partial file exists but cannot be
 * read, the problem is passed to debug() and the function returns; in the
 * latter case the output handle is closed first and the file is left without
 * its closing tag.
 *
 * @param int|string $resource_id identifier used as the output file name
 */
function combine_all_xmls($resource_id)
{
    debug("\n\n Start compiling all XML...");
    $old_resource_path = CONTENT_RESOURCE_LOCAL_PATH . $resource_id . ".xml";
    if (!($OUT = fopen($old_resource_path, "w"))) {
        debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $old_resource_path);
        return;
    }

    $str = "<?xml version='1.0' encoding='utf-8' ?>\n";
    $str .= "<response\n";
    $str .= " xmlns='http://www.eol.org/transfer/content/0.3'\n";
    $str .= " xmlns:xsd='http://www.w3.org/2001/XMLSchema'\n";
    $str .= " xmlns:dc='http://purl.org/dc/elements/1.1/'\n";
    $str .= " xmlns:dcterms='http://purl.org/dc/terms/'\n";
    $str .= " xmlns:geo='http://www.w3.org/2003/01/geo/wgs84_pos#'\n";
    $str .= " xmlns:dwc='http://rs.tdwg.org/dwc/dwcore/'\n";
    $str .= " xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'\n";
    $str .= " xsi:schemaLocation='http://www.eol.org/transfer/content/0.3 http://services.eol.org/schema/content_0_3.xsd'>\n";
    fwrite($OUT, $str);

    $i = 0;
    while (true) {
        $i++;
        $i_str = Functions::format_number_with_leading_zeros($i, 3);
        $filename = $this->TEMP_FILE_PATH . "temp_worms_" . "batch_" . $i_str . ".xml";
        if (!is_file($filename)) {
            echo " -end compiling XML's- ";
            break;
        }
        echo " {$i} ";

        // file_get_contents() copes with empty files, unlike fread() with a zero length
        $contents = file_get_contents($filename);
        if ($contents === false) {
            debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $filename);
            fclose($OUT); // do not leak the output handle on the error path
            return;
        }
        if ($contents !== "") {
            fwrite($OUT, $contents);
        } else {
            echo "\n no contents {$i}";
        }
    }

    fwrite($OUT, "</response>");
    fclose($OUT);
    echo "\n All XML compiled\n\n";
}
/**
 * Splits a master text file into batch files of $divisor lines each and
 * writes the list of batch names to $work_list.
 *
 * Batch files are named $destination_folder . $filename_prefix . NNN . ".txt".
 * The work list gets one "$filename_prefix . NNN" line per batch. Every line
 * read is echoed with its position inside the current batch.
 *
 * @param string $master_file        path of the file to split
 * @param int    $divisor            number of lines per batch file
 * @param string $destination_folder folder (with trailing separator) for the batch files
 * @param string $filename_prefix    prefix of every batch file name
 * @param string $work_list          path of the work list file to write
 * @return int|false|null number of batch files written; false when the master
 *                        file cannot be opened; null when a batch file cannot be opened
 */
public function create_work_list_from_master_file($master_file, $divisor, $destination_folder, $filename_prefix, $work_list)
{
    if (!($FILE = Functions::file_open($master_file, "r"))) {
        echo "\n File not found: \n {$master_file} \n Program will terminate.\n\n";
        return false;
    }

    $i = 0;
    $file_ctr = 0;
    $str = "";
    // compare against false explicitly: a final line "0" is data, not end-of-file
    while (($line = fgets($FILE)) !== false) {
        $i++;
        $str .= $line;
        print "\n{$i}. {$line}";
        if ($i == $divisor) {
            $file_ctr++;
            $file_ctr_str = Functions::format_number_with_leading_zeros($file_ctr, 3);
            if (!($OUT = Functions::file_open($destination_folder . $filename_prefix . $file_ctr_str . ".txt", "w"))) {
                fclose($FILE);
                return;
            }
            fwrite($OUT, $str);
            fclose($OUT);
            $str = "";
            $i = 0;
        }
    }
    // the master file is no longer needed; release the handle before writing the rest
    fclose($FILE);

    // lines left over after the last full batch
    if ($str) {
        $file_ctr++;
        $file_ctr_str = Functions::format_number_with_leading_zeros($file_ctr, 3);
        if (!($OUT = Functions::file_open($destination_folder . $filename_prefix . $file_ctr_str . ".txt", "w"))) {
            return;
        }
        fwrite($OUT, $str);
        fclose($OUT);
    }

    // work list: one batch name per line
    $str = "";
    for ($i = 1; $i <= $file_ctr; $i++) {
        $str .= $filename_prefix . Functions::format_number_with_leading_zeros($i, 3) . "\n";
    }
    if ($fp = Functions::file_open($work_list, "w")) {
        fwrite($fp, $str);
        fclose($fp);
    }
    return $file_ctr; // total number of work tasks
}
/**
 * Deletes consecutively numbered temporary files.
 *
 * Starting at number 1, the file $file_path . NNN . "." . $file_extension is
 * removed for every consecutive number; the function stops at the first
 * number for which no file exists. Each removal is announced on output.
 *
 * @param string $file_path      path and name prefix placed before the number
 * @param string $file_extension extension without the leading dot
 */
private function delete_temp_files($file_path, $file_extension)
{
    for ($number = 1; ; $number++) {
        $candidate = $file_path . Functions::format_number_with_leading_zeros($number, 3) . "." . $file_extension;
        if (!file_exists($candidate)) {
            return;
        }
        print "\n unlink: {$candidate}";
        unlink($candidate);
    }
}
/**
 * Builds the taxa of the resource from the species, image, common-name and
 * synonym spreadsheets and compiles them into the resource XML.
 *
 * Taxa are collected in memory and flushed to numbered XML files
 * (001.xml, 002.xml, ...) in the FishWisePro files folder each time the
 * running counter reaches a multiple of 10000, plus once more for the rest.
 * The partial files are then combined through
 * Functions::combine_all_eol_resource_xmls() and removed with
 * self::delete_files(). When a partial file cannot be opened the problem is
 * passed to debug() and the function returns before combining anything.
 *
 * @param int|string $resource_id identifier handed to the combine step
 */
public function get_all_taxa($resource_id)
{
    $output_dir = DOC_ROOT . "/update_resources/connectors/files/FishWisePro/";
    $collected = array();
    $this->used_collection_ids = array();

    // more sources of the species.xls may be listed here
    $taxa_arr = self::compile_taxa(array(FWP_SPECIES_DOC_PATH));

    require_library('XLSParser');
    $parser = new XLSParser();
    $images = self::prepare_table($parser->convert_sheet_to_array(FWP_IMAGES_DOC_PATH), "multiple", "SId", "SId", "PictureId", "dbo_Picture_PictureNote", "PictureType", "IsLegal", "Location", "PicComments", "IsAvailable", "LifeStage", "CollectionName", "CollectionAcronym", "PictureSource", "Surname", "Firstname", "DisplayName", "FileName");
    $comnames = self::prepare_table($parser->convert_sheet_to_array(FWP_COMNAMES_DOC_PATH), "multiple", "SId", "CommonName", "Language");
    $synonyms = self::prepare_table($parser->convert_sheet_to_array(FWP_SYNONYMS_DOC_PATH), "multiple", "SId", "SynGenusSpecies", "SynStatus");

    $counter = 1;
    $total = sizeof($taxa_arr);
    $part_no = 0;
    foreach ($taxa_arr as $taxon_arr) {
        $taxon_id = $taxon_arr['SId'];
        echo "\n {$counter} of {$total} -- " . $taxon_id;
        $counter++;

        $page_taxa = self::get_fishwise_taxa($taxon_arr, @$images[$taxon_id], @$comnames[$taxon_id], @$synonyms[$taxon_id]);
        $collected = array_merge($collected, $page_taxa);

        if ($counter % 10000 != 0) {
            continue;
        }
        // flush what has been collected so far into the next numbered part file
        $part_no++;
        $xml = \SchemaDocument::get_taxon_xml($collected);
        $resource_path = $output_dir . Functions::format_number_with_leading_zeros($part_no, 3) . ".xml";
        $OUT = fopen($resource_path, "w+");
        if (!$OUT) {
            debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
            return;
        }
        fwrite($OUT, $xml);
        fclose($OUT);
        $collected = array();
    }

    // taxa left over after the last flush
    if ($collected) {
        $part_no++;
        $xml = \SchemaDocument::get_taxon_xml($collected);
        $resource_path = $output_dir . Functions::format_number_with_leading_zeros($part_no, 3) . ".xml";
        $OUT = fopen($resource_path, "w+");
        if (!$OUT) {
            debug(__CLASS__ . ":" . __LINE__ . ": Couldn't open file: " . $resource_path);
            return;
        }
        fwrite($OUT, $xml);
        fclose($OUT);
    }

    Functions::combine_all_eol_resource_xmls($resource_id, $output_dir . "*.xml");
    self::delete_files($output_dir . "*.xml");
    return;
}
/**
 * Runs one web-service search per parameter and appends the results to the
 * current partial dump file.
 *
 * The partial dump path is derived from $this->dump_file ("mycobank_dump.txt"
 * replaced by "partial") plus "_NNN.txt", where NNN is $this->dump_no. The
 * dump number is incremented on entry and again every $searches_per_dump
 * parameters. Parameters listed in $this->dont_search_more_than_5h or
 * $this->dont_search_these_strings_as_well are skipped. Parameters whose
 * lookup fails or whose response carries an ErrorMessage are written to
 * $this->names_with_error_dump_file. Result counts of 500-899 and of 900 or
 * more are additionally logged to $this->more_than_5h and
 * $this->more_than_1k respectively.
 *
 * @param array|false  $params            search strings; when falsy they come from self::get_params_for_webservice()
 * @param string|false $search_service    service URL prefix; when falsy $this->service_search["startswith"] is used
 * @param int          $searches_per_dump number of searches after which a new partial dump file is started
 */
private function save_data_to_text($params = false, $search_service = false, $searches_per_dump = 1000)
{
    // builds the path of the partial dump file for a given dump number
    $partial_dump_path = function ($dump_file, $dump_no) {
        return str_replace("mycobank_dump.txt", "partial", $dump_file) . "_" . Functions::format_number_with_leading_zeros($dump_no, 3) . ".txt";
    };

    $this->dump_no++;
    $partial_dump = $partial_dump_path($this->dump_file, $this->dump_no);

    if (!$params) {
        $params = self::get_params_for_webservice();
    }
    $total_params = count($params);
    $position = 0;

    foreach ($params as $param) {
        $param = trim(ucfirst($param));
        print "\n searching:[{$param}]";
        $position++;

        // start a new partial dump file every $searches_per_dump searches
        if ($position % $searches_per_dump == 0) {
            $this->dump_no++;
            $partial_dump = $partial_dump_path($this->dump_file, $this->dump_no);
        }

        if (in_array($param, $this->dont_search_more_than_5h) || in_array($param, $this->dont_search_these_strings_as_well)) {
            print "\n [{$param}] must not be searched... \n";
            continue;
        }

        $no_of_results = 0;
        $service = $search_service ? $search_service : $this->service_search["startswith"];
        $url = $service . '"' . $param . '"';
        echo "\n[{$param}] {$position} of {$total_params} \n";

        $contents = Functions::lookup_with_cache($url, $this->download_options);
        if (!$contents) {
            echo "\n access failed [{$param}] ... \n";
            self::save_to_dump($param, $this->names_with_error_dump_file);
        } elseif ($response = simplexml_load_string($contents)) {
            if (isset($response->ErrorMessage)) {
                echo "\n investigate error [{$param}]: " . $response->ErrorMessage . "\n";
                sleep(120); // 2mins
                echo "\n access failed [{$param}] ... \n";
                self::save_to_dump($param, $this->names_with_error_dump_file);
                continue;
            }

            $no_of_results = count($response);
            if ($no_of_results > 0) {
                echo " - count: {$no_of_results}";
                if ($no_of_results >= 900) {
                    self::save_to_dump($param . "\t" . $no_of_results, $this->more_than_1k);
                } elseif ($no_of_results >= 500) {
                    self::save_to_dump($param . "\t" . $no_of_results, $this->more_than_5h);
                }

                $records = array();
                foreach ($response as $rec) {
                    $classification = $rec->Classification_;

                    $hierarchy = "";
                    $parent = "";
                    if (preg_match("/title\\='(.*?)'/ims", $classification, $found)) {
                        $hierarchy = $found[1];
                        $parent = self::get_parent_from_hierarchy($hierarchy);
                    }

                    $rec_id = "";
                    if (preg_match("/;Rec\\=(.*?)\\&/ims", $classification, $found)) {
                        $rec_id = $found[1];
                    }

                    $source_url = "";
                    if (preg_match("/href\\='(.*?)'/ims", $classification, $found)) {
                        // NOTE(review): as written this replaces "&" with "&" (no change);
                        // presumably an HTML-entity decode was intended -- confirm before altering
                        $source_url = str_ireplace("&", "&", $found[1]);
                    }

                    $records[] = array(
                        "n" => (string) $rec->Name,
                        "cn" => (string) $rec->CurrentName_Pt_,
                        "r" => (string) $rec->Rank_Pt_,
                        "nt" => (string) $rec->NameType_,
                        "ns" => (string) $rec->NameStatus_,
                        "a" => (string) $rec->Authors_,
                        "p" => $parent,
                        "h" => $hierarchy,
                        "s" => $source_url,
                        "t" => (string) $rec->MycoBankNr_,
                        "d" => (string) $rec_id,
                        "y" => (string) $rec->NameYear_,
                        "e3" => (string) $rec->E3787,
                        "e4" => (string) $rec->E4060,
                        "so" => (string) $rec->ObligateSynonyms_Pt_,
                        "sf" => (string) $rec->FacultativeSynonyms_Pt_
                    );
                }
                self::save_to_dump(array($param => $records), $partial_dump);
            } else {
                // params with zero records are deliberately not saved anymore (14Jul2014)
                echo "\n no result for: [{$param}]\n";
            }
        }

        self::sleep_now($no_of_results);
    }
}