*/ $to = have_url($link, $crawl_tag); /** * If the link is not in the table, add it */ if (!$to) { /** * Output that we're adding a URL if we're in verbose mode */ if (isset($_GET['debug'])) { echo "<li>Adding url " . urldecode($link) . " to list</li>"; } /** * Add URL to table, grab link ID # */ $to = add_url($link, $clicks, $crawl_tag); } /** * If debug mode, indicate that we're adding a link */ if (isset($_GET['debug'])) { echo "<li>Adding link from here to " . urldecode($link) . "</li>"; } /** * Add the link to the links table */ add_link($id, $to); } /** * If the server did not report a size (in which case cURL returns '-1'), * use the size of the cURL as the file size, otherwise, trust the server
/**
 * Handle the upload form: store the uploaded file, copy the template .ktr
 * file and fill it in from the submitted form values.
 *
 * Order of operations (kept exactly, since the helpers may emit output or
 * append to the generated file):
 *   1. move the uploaded file into the configured upload directory;
 *   2. copy 'excel-to-target_schema.ktr' to '0.ktr';
 *   3. read the form fields (a missing or non-string field becomes '');
 *   4. register the source URL, sheet, Spd/Drd constants, input fields and
 *      sample target;
 *   5. pass every header that is not one of the Start/End/Location/AggrType
 *      column names to add_normalizer();
 *   6. bind each of Start/End/Location/AggrType either to the spreadsheet
 *      column named in the form or, when that is empty, to a constant built
 *      from the matching "...2" form field;
 *   7. echo the submitted values as a diagnostic dump.
 *
 * All form values are HTML-escaped before being echoed; they come straight
 * from $_POST and were previously written into the page verbatim.
 *
 * Side effects: overwrites the global $upload_dir, writes files, prints HTML.
 *
 * @return void
 */
function generate() {
    global $upload_dir, $db, $current_user, $main_smarty, $the_template;

    // Store the upload under a fixed name inside the configured directory.
    $unique_file_name = "tingtest1.ktr";
    $upload_dir = get_misc_data('upload_directory');
    $upload_path = mnmpath . $upload_dir . $unique_file_name;
    if (isset($_FILES['upload_file']['tmp_name'])) {
        // NOTE(review): the result is still ignored, as before; nothing below
        // reads the uploaded file, so a failed move does not stop generation.
        move_uploaded_file($_FILES['upload_file']['tmp_name'], $upload_path);
    }

    /* create new ktr file from the template */
    copy('excel-to-target_schema.ktr', '0.ktr');

    // Collect the form fields once. $input holds the raw values handed to the
    // helpers; $safe holds HTML-escaped copies used only for echoing.
    $field_names = array(
        'spd', 'drd',
        'start', 'end', 'start2', 'end2',
        'location', 'aggrtype', 'location2', 'aggrtype2',
    );
    $input = array();
    $safe = array();
    foreach ($field_names as $field) {
        $value = (isset($_POST[$field]) && is_string($_POST[$field])) ? $_POST[$field] : '';
        $input[$field] = $value;
        $safe[$field] = htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
    }

    $spd = $input['spd'];
    $drd = $input['drd'];
    $start = $input['start'];
    $end = $input['end'];
    $start2 = $input['start2'];
    $end2 = $input['end2'];
    $location = $input['location'];
    $aggrtype = $input['aggrtype'];
    $location2 = $input['location2'];
    $aggrtype2 = $input['aggrtype2'];

    $process = new Process_excel();
    // NOTE(review): the sheet names were never used; the call is kept in case
    // getSheetName() has side effects inside Process_excel - confirm and drop.
    $process->getSheetName('census.xls');
    $arr_Header = $process->getHeader('dataverse_census.xls', 0, 11, 'A');

    echo $safe['start'];

    /* adding url */
    add_url(0, 'http://colfusion.exp.sis.pitt.edu/colfusion/upload_raw_data/irule_dataverse_census.xls');
    echo "hello";

    addSheets(0, "Table HH-1", 10, 0);
    addConstants('Spd', $spd, 'Date', 'yyyyMMdd');
    addConstants('Drd', $drd, 'Date', 'yyyyMMdd');
    add_excel_input_fields($arr_Header);
    add_sample_target();

    // Headers that are NOT used as Start/End/Location/AggrType columns are the
    // ones handed to the normalizer.
    $no_need_Array = array($start, $end, $location, $aggrtype);
    $result = array_diff($arr_Header, $no_need_Array);

    // The second argument is meant to come from the user eventually (the first
    // list would be presented to the user via AJAX); for now both are the same.
    add_normalizer($result, $result);

    // Start: spreadsheet column if one was named, otherwise a typed-in date.
    if ($start !== '') {
        update_target('Start', $start);
    } else {
        addConstants('Start_from_input', $start2, 'Date', 'yyyyMMdd');
        update_target('Start', 'Start_from_input');
    }

    // End: spreadsheet column if one was named, otherwise a typed-in date.
    if ($end !== '') {
        update_target('End', $end);
    } else {
        addConstants('End_from_input', $end2, 'Date', 'yyyyMMdd');
        update_target('End', 'End_from_input');
    }

    // Location: spreadsheet column if one was named, otherwise a typed-in string.
    if ($location !== '') {
        echo $safe['location'];
        update_target('Location', $location);
    } else {
        echo $safe['location2'];
        addConstants('Location_from_input', $location2, 'String', '');
        update_target('Location', 'Location_from_input');
    }

    // AggrType: spreadsheet column if one was named, otherwise a typed-in string.
    if ($aggrtype !== '') {
        update_target('AggrType', $aggrtype);
    } else {
        addConstants('AggrType_from_input', $aggrtype2, 'String', '');
        update_target('AggrType', 'AggrType_from_input');
    }

    // Diagnostic dump: each submitted value followed by its numbered marker.
    $dump = array(
        array('start', "........1<br/>"),
        array('start2', ".......2<br/>"),
        array('end', "..........3<br/>"),
        array('end2', "...........4<br/>"),
        array('location', "............5<br/>"),
        array('location2', "..........6<br/>"),
        array('aggrtype', "..........7<br/>"),
        array('aggrtype2', "............8<br/>"),
        array('spd', ".........9<br/>"),
        array('drd', "..........10<br/>"),
    );
    foreach ($dump as $entry) {
        echo $safe[$entry[0]];
        echo $entry[1];
    }
}
print '<?xml version="1.0" encoding="UTF-8"?>'; ?> <urlset xmlns="http://www.google.com/schemas/sitemap/0.84"> <?php include "config.php"; # we don't include posts.php and links.php here because they're disallowed by robots.txt anyway. add_url("index.php", "daily", "0.8"); add_url("blogs.php", "weekly", "0.1"); add_url("stats.php", "weekly", "0.3"); add_url("blog_search.php", "daily", "0.9"); add_url("search.php", "monthly", "0.5"); add_url("post.php", "yearly", "0.1"); add_url("paper.php", "monthly", "0.6"); add_url("papers.php", "daily", "0.1"); if ($config['do_wiki']) { add_url("wiki/doku.php", "weekly", "0.8"); } function add_url($url, $update = "daily", $priority = "0.5") { global $config; $url = htmlentities($config["base_url"] . $url); ?> <url> <loc><?php print $url; ?> </loc> <changefreq><?php print $update; ?> </changefreq>
/**
 * Crawl one wiki-style page: store its sentences and queue its internal links.
 *
 * Sentences are taken from the <p> elements inside div#bodyContent. Tags,
 * parenthesised text and bracketed text (e.g. "[1]") are removed, the text is
 * split into sentences, and every sentence that is longer than 10 bytes,
 * contains no ':' and starts with an upper-case letter or digit is passed to
 * add_sentence().
 *
 * Links are taken from every <a> whose href starts with '/wiki/' and contains
 * no ':'; any '#fragment' is cut off and the absolute URL is passed to
 * add_url().
 *
 * A two-line summary is echoed at the end.
 *
 * @param string $url     Page to fetch.
 * @param mixed  $path    Unused here; kept for caller compatibility.
 * @param mixed  $website Forwarded unchanged to add_url().
 * @param mixed  $rec     Forwarded unchanged to add_url().
 * @param mixed  $lang    Forwarded unchanged to add_sentence().
 * @return void
 */
function crawl($url, $path, $website, $rec, $lang) {
    $safe_url = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');

    $html = file_get_html($url);
    if (!$html) {
        // Presumably simple_html_dom's file_get_html(), which yields a falsy
        // value when the page cannot be loaded; calling ->find() on that
        // would be a fatal error, so report and stop instead.
        echo "Could not load URL : " . $safe_url . "<br/>";
        return;
    }

    $p = parse_url($url);
    $host = (is_array($p) && isset($p['host'])) ? $p['host'] : '';

    // Sentence splitter. Built once: it does not depend on the paragraph.
    // The pattern relies on the x flag, so every "#" comment must end at a
    // newline inside the string.
    $re = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.             # Skip either "Mr."
        | Mrs\\.            # or "Mrs.",
        | Ms\\.             # or "Ms.",
        | Jr\\.             # or "Jr.",
        | Dr\\.             # or "Dr.",
        | Prof\\.           # or "Prof.",
        | Sr\\.             # or "Sr.",
        | T\\.V\\.A\\.      # or "T.V.A.",
        | St\\.             # or "T.V.A.",
        | \\s[A-Z]\\.       # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                # Split on whitespace between sentences.
        /ix';

    $count = 0;
    foreach ($html->find('div[id=bodyContent] p') as $paragraph) {
        $text = strip_tags($paragraph->innertext);
        $text = preg_replace('/\([^)]+\)/', '', $text);
        // Exclude "]" (not ")") inside the brackets; otherwise the match runs
        // greedily from the first "[" to the last "]" of the paragraph and
        // deletes all text between two citation markers.
        $text = preg_replace('/\[[^\]]+\]/', '', $text);

        $sentences = preg_split($re, $text, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($sentences as $sentence) {
            if (strlen($sentence) > 10
                && strpos($sentence, ':') === false
                && (ctype_upper($sentence[0]) || ctype_digit($sentence[0]))
            ) {
                add_sentence($sentence, $lang);
                $count++;
            }
        }
    }

    $count2 = 0;
    foreach ($html->find('a') as $anchor) {
        $href = $anchor->href;
        if (startsWith($href, '/wiki/') && strpos($href, ':') === false) {
            $hash = strpos($href, '#');
            if ($hash > 0) {
                $href = substr($href, 0, $hash);
            }
            add_url("http://" . $host . $href, $website, 3, $rec);
            $count2++;
        }
    }

    // $url is no longer blanked before this point, so the summary names the
    // page that was crawled. It is escaped because it is written into HTML.
    echo $count . " sentences have been added from URL : " . $safe_url . "<br/>";
    echo $count2 . " links have been added from URL : " . $safe_url;
}
/**
 * Crawl one page described by $data_main: store its sentences and words, and
 * queue the links that stay on the same host.
 *
 * For every selector returned by find_website_patterns() the matching
 * elements are stripped of tags, parenthesised text and bracketed text, then
 * split into sentences. A sentence is kept when it is longer than 10 bytes
 * and count(get_num_of_words()) is greater than 2; it is whitespace-normalised
 * and passed to add_words_into_dic() and add_sentence().
 *
 * Every non-empty <a href> is then examined: an href that contains neither
 * the page's host nor an http(s):// prefix gets 'http://<host>' prepended,
 * and the result is passed to add_url() when its host equals the page's host.
 *
 * @param array $data_main Reads the 'url' and 'language' keys; the whole array
 *                         is forwarded to add_url().
 * @return void
 */
function crawl($data_main) {
    $patterns = find_website_patterns($data_main['url']);
    $html = file_get_html($data_main['url']);
    if (!$html) {
        // Presumably simple_html_dom's file_get_html(), which yields a falsy
        // value when the page cannot be loaded; ->find() on it would be fatal.
        return;
    }

    // Sentence splitter. Built once: it does not depend on the element.
    // The pattern relies on the x flag, so every "#" comment must end at a
    // newline inside the string.
    $re = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.             # Skip either "Mr."
        | Mrs\\.            # or "Mrs.",
        | Ms\\.             # or "Ms.",
        | Jr\\.             # or "Jr.",
        | Dr\\.             # or "Dr.",
        | Prof\\.           # or "Prof.",
        | Sr\\.             # or "Sr.",
        | T\\.V\\.A\\.      # or "T.V.A.",
        | St\\.             # or "T.V.A.",
        | \\s[A-Z]\\.       # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                # Split on whitespace between sentences.
        /ix';

    $count = 0; // Number of sentences stored; not used, kept for debugging.
    foreach ($patterns as $pattern) {
        foreach ($html->find($pattern) as $element) {
            $text = strip_tags($element->innertext);
            $text = preg_replace('/\([^)]+\)/', '', $text);
            // Exclude "]" (not ")") inside the brackets; otherwise the match
            // runs greedily from the first "[" to the last "]" and deletes
            // all text between two bracketed markers.
            $text = preg_replace('/\[[^\]]+\]/', '', $text);

            $sentences = preg_split($re, $text, -1, PREG_SPLIT_NO_EMPTY);
            foreach ($sentences as $sentence) {
                if (strlen($sentence) > 10 && count(get_num_of_words($sentence)) > 2) {
                    // preg_replace() returns the new string; the result used
                    // to be thrown away, so whitespace was never collapsed.
                    $sentence = preg_replace('/\s+/', ' ', $sentence);
                    // Remove literal trailing <br> / <br/> tags. rtrim() with
                    // "<br>" strips any trailing '<', 'b', 'r', '>' characters
                    // and so cut letters off words such as "bar".
                    $sentence = preg_replace('/(?:\s*<br\s*\/?>)+\s*$/i', '', $sentence);
                    add_words_into_dic($sentence, $data_main['language']);
                    add_sentence($sentence, $data_main['language']);
                    $count++;
                }
            }
        }
    }

    $p = parse_url($data_main['url']);
    $host = (is_array($p) && isset($p['host'])) ? $p['host'] : null;

    foreach ($html->find('a') as $got_url) {
        $href = $got_url->href;
        if ($href != "") {
            if (strpos($href, $host) === false
                && strpos($href, 'http://') === false
                && strpos($href, 'https://') === false
            ) {
                $href = 'http://' . $host . $href;
            }

            // parse_url() returns false for seriously malformed URLs and omits
            // 'host' for things like "mailto:"; treat both as "no host".
            $target = parse_url($href);
            $target_host = (is_array($target) && isset($target['host'])) ? $target['host'] : null;

            if ($host == $target_host) {
                add_url($href, $data_main);
            }
        }
    }
}