Beispiel #1
0
      */
     $to = have_url($link, $crawl_tag);
     /**
      * If the link is not in the table, add it
      */
     if (!$to) {
         /**
          * Output that we're adding a URL if we're in verbose mode
          */
         if (isset($_GET['debug'])) {
             echo "<li>Adding url " . urldecode($link) . " to list</li>";
         }
         /**
          * Add URL to table, grab link ID #
          */
         $to = add_url($link, $clicks, $crawl_tag);
     }
     /**
      * If debug mode, indicate that we're adding a link
      */
     if (isset($_GET['debug'])) {
         echo "<li>Adding link from here to " . urldecode($link) . "</li>";
     }
     /**
      * Add the link to the links table
      */
     add_link($id, $to);
 }
 /**
  * If the server did not report a size (in which case cURL returns '-1'),
  * use the size of the cURL download as the file size; otherwise, trust the server.
  */
/**
 * Processes the spreadsheet-upload form and drives the helper functions that
 * register the pieces of the transformation.
 *
 * Steps, in order:
 *  1. move the uploaded file to mnmpath . <upload directory> . "tingtest1.ktr";
 *  2. copy the template 'excel-to-target_schema.ktr' to '0.ktr';
 *  3. read the form fields from $_POST;
 *  4. read sheet names / headers through Process_excel;
 *  5. call add_url(), addSheets(), addConstants(), add_excel_input_fields(),
 *     add_sample_target(), add_normalizer() and update_target();
 *  6. echo the collected form values as a debug dump.
 *
 * For Start, End, Location and AggrType the target is mapped to the chosen
 * spreadsheet column when one was submitted; otherwise a constant is created
 * from the manually entered value and the target is mapped to that constant.
 *
 * Overwrites the global $upload_dir. Returns nothing.
 */
function generate()
{
    global $upload_dir, $db, $current_user, $main_smarty, $the_template;

    // --- Store the upload under a fixed file name -------------------------
    $uploadedTmpPath = $_FILES['upload_file']['tmp_name'];
    $uploadedOriginalName = $_FILES['upload_file']['name'];
    $storedFileName = "tingtest1.ktr";
    $upload_dir = get_misc_data('upload_directory');
    $storedFilePath = mnmpath . $upload_dir . $storedFileName;
    // NOTE(review): the outcome of the move is captured but never inspected.
    $moveSucceeded = move_uploaded_file($uploadedTmpPath, $storedFilePath);

    // --- Create the working .ktr file from the template -------------------
    $templateKtr = 'excel-to-target_schema.ktr';
    $workingKtr = '0.ktr';
    copy($templateKtr, $workingKtr);

    // --- Collect the form fields ------------------------------------------
    $sheetField = $_POST["sheet"];
    $rowField = $_POST["row"];
    $colField = $_POST["col"];
    // NOTE(review): this structure is built but not passed to any helper here.
    $sheetDefinitions = array(
        array($sheetField, "", ""),
        array($rowField, "", ""),
        array($colField, "", ""),
    );
    $spd = $_POST["spd"];
    $drd = $_POST["drd"];
    $startColumn = $_POST["start"];
    $endColumn = $_POST["end"];
    $startInput = $_POST["start2"];
    $endInput = $_POST["end2"];
    $locationColumn = $_POST["location"];
    $aggrTypeColumn = $_POST["aggrtype"];
    $locationInput = $_POST["location2"];
    $aggrTypeInput = $_POST["aggrtype2"];

    // --- Inspect the spreadsheet ------------------------------------------
    $excel = new Process_excel();
    $sheetNames = $excel->getSheetName('census.xls');
    $headerColumns = $excel->getHeader('dataverse_census.xls', 0, 11, 'A');

    echo $startColumn;

    // --- Register source URL, sheet, constants and fields -----------------
    add_url(0, 'http://colfusion.exp.sis.pitt.edu/colfusion/upload_raw_data/irule_dataverse_census.xls');
    echo "hello";
    addSheets(0, "Table HH-1", 10, 0);
    addConstants('Spd', $spd, 'Date', 'yyyyMMdd');
    addConstants('Drd', $drd, 'Date', 'yyyyMMdd');
    add_excel_input_fields($headerColumns);
    add_sample_target();

    // Header columns that were picked as Start/End/Location/AggrType are
    // excluded; the remaining ones are handed to the normalizer.
    $mappedColumns = array($startColumn, $endColumn, $locationColumn, $aggrTypeColumn);
    $columnsToNormalize = array_diff($headerColumns, $mappedColumns);
    // The same list is passed for both arguments of add_normalizer().
    add_normalizer($columnsToNormalize, $columnsToNormalize);

    // --- Start: spreadsheet column, or constant from manual input ---------
    if ($startColumn == "") {
        addConstants('Start_from_input', $startInput, 'Date', 'yyyyMMdd');
        update_target('Start', 'Start_from_input');
    } else {
        update_target('Start', $startColumn);
    }

    // --- End: spreadsheet column, or constant from manual input -----------
    if ($endColumn == "") {
        addConstants('End_from_input', $endInput, 'Date', 'yyyyMMdd');
        update_target('End', 'End_from_input');
    } else {
        update_target('End', $endColumn);
    }

    // --- Location: spreadsheet column, or constant from manual input ------
    if ($locationColumn == "") {
        echo $locationInput;
        addConstants('Location_from_input', $locationInput, 'String', '');
        update_target('Location', 'Location_from_input');
    } else {
        echo $locationColumn;
        update_target('Location', $locationColumn);
    }

    // --- AggrType: spreadsheet column, or constant from manual input ------
    if ($aggrTypeColumn == "") {
        addConstants('AggrType_from_input', $aggrTypeInput, 'String', '');
        update_target('AggrType', 'AggrType_from_input');
    } else {
        update_target('AggrType', $aggrTypeColumn);
    }

    // --- Debug dump: each value followed by its numbered marker -----------
    $debugDump = array(
        array($startColumn, "........1<br/>"),
        array($startInput, ".......2<br/>"),
        array($endColumn, "..........3<br/>"),
        array($endInput, "...........4<br/>"),
        array($locationColumn, "............5<br/>"),
        array($locationInput, "..........6<br/>"),
        array($aggrTypeColumn, "..........7<br/>"),
        array($aggrTypeInput, "............8<br/>"),
        array($spd, ".........9<br/>"),
        array($drd, "..........10<br/>"),
    );
    foreach ($debugDump as $debugEntry) {
        echo $debugEntry[0];
        echo $debugEntry[1];
    }
}
// Emit the XML declaration and open the <urlset> root element of the sitemap.
echo '<?xml version="1.0" encoding="UTF-8"?>';
echo '<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">' . "\n";

include "config.php";

// Pages listed in the sitemap as (path, change frequency, priority).
// posts.php and links.php are left out on purpose: robots.txt disallows them anyway.
$sitemapPages = array(
    array("index.php", "daily", "0.8"),
    array("blogs.php", "weekly", "0.1"),
    array("stats.php", "weekly", "0.3"),
    array("blog_search.php", "daily", "0.9"),
    array("search.php", "monthly", "0.5"),
    array("post.php", "yearly", "0.1"),
    array("paper.php", "monthly", "0.6"),
    array("papers.php", "daily", "0.1"),
);
foreach ($sitemapPages as $sitemapPage) {
    add_url($sitemapPage[0], $sitemapPage[1], $sitemapPage[2]);
}

// The wiki entry is only listed when the wiki is enabled in the configuration.
if ($config['do_wiki']) {
    add_url("wiki/doku.php", "weekly", "0.8");
}
function add_url($url, $update = "daily", $priority = "0.5")
{
    global $config;
    $url = htmlentities($config["base_url"] . $url);
    ?>
<url>
   <loc><?php 
    print $url;
    ?>
</loc>
   <changefreq><?php 
    print $update;
    ?>
</changefreq>
/**
 * Crawls one page: stores the sentences found in the paragraphs of its
 * "bodyContent" div and queues every internal "/wiki/" link.
 *
 * Each paragraph is stripped of tags, parenthesised "(...)" and bracketed
 * "[...]" spans are removed, and the rest is split into sentences. A sentence
 * is stored via add_sentence() when it is longer than 10 bytes, contains no
 * colon, and starts with an upper-case letter or a digit.
 *
 * Links whose href starts with "/wiki/" and contains no colon are passed to
 * add_url() with any "#fragment" removed. Two summary lines are echoed at the
 * end.
 *
 * @param string $url     Address of the page to fetch.
 * @param mixed  $path    Not used by this function.
 * @param mixed  $website Forwarded to add_url() as its second argument.
 * @param mixed  $rec     Forwarded to add_url() as its fourth argument.
 * @param mixed  $lang    Forwarded to add_sentence() with every sentence.
 *
 * @return void
 */
function crawl($url, $path, $website, $rec, $lang)
{
    $html = file_get_html($url);
    if (!$html) {
        // The page could not be loaded; report it instead of failing on ->find().
        echo "Could not load URL : " . $url;
        return;
    }

    $p = parse_url($url);

    // Sentence splitter. Built once because it does not depend on the paragraph.
    $re = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.              # Skip either "Mr."
        | Mrs\\.             # or "Mrs.",
        | Ms\\.              # or "Ms.",
        | Jr\\.              # or "Jr.",
        | Dr\\.              # or "Dr.",
        | Prof\\.            # or "Prof.",
        | Sr\\.              # or "Sr.",
        | T\\.V\\.A\\.         # or "T.V.A.",
        | St\\.              # or "St.",
        | \\s[A-Z]\\.         # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                 # Split on whitespace between sentences.
        /ix';

    $count = 0;
    foreach ($html->find('div[id=bodyContent] p') as $paragraph) {
        $para = strip_tags($paragraph->innertext);
        $no_bracket = preg_replace('/\([^)]+\)/', '', $para);
        // Stop at the first closing bracket so that "[1] text [2]" loses only
        // the two markers and not the text between them.
        $no_bracket = preg_replace('/\[[^\]]+\]/', '', $no_bracket);

        $sentences = preg_split($re, $no_bracket, -1, PREG_SPLIT_NO_EMPTY);
        foreach ($sentences as $sentence) {
            if (
                strlen($sentence) > 10
                && strpos($sentence, ':') === false
                && (ctype_upper($sentence[0]) || ctype_digit($sentence[0]))
            ) {
                add_sentence($sentence, $lang);
                $count++;
            }
        }
    }

    $count2 = 0;
    foreach ($html->find('a') as $anchor) {
        $href = $anchor->href;
        if (startsWith($href, '/wiki/') && strpos($href, ':') === false) {
            // Drop the fragment so the same article is not queued once per anchor.
            $hashPos = strpos($href, '#');
            if ($hashPos !== false) {
                $href = substr($href, 0, $hashPos);
            }
            add_url("http://" . $p["host"] . $href, $website, 3, $rec);
            $count2++;
        }
    }

    // $url is deliberately left untouched above so the summary names the page.
    echo $count . " sentences have been added from URL : " . $url . "<br/>";
    echo $count2 . " links have been added from URL : " . $url;
}
/**
 * Crawls the page described by $data_main: stores its sentences and their
 * words, then queues every link that points to the same host.
 *
 * The elements to read are selected with the patterns returned by
 * find_website_patterns(). Each element is stripped of tags, parenthesised
 * "(...)" and bracketed "[...]" spans are removed, and the rest is split into
 * sentences. A sentence is kept when it is longer than 10 bytes and
 * count(get_num_of_words()) reports more than 2; its whitespace is then
 * collapsed and trailing literal "<br>" / "<br/>" tags are removed before it
 * is passed to add_words_into_dic() and add_sentence().
 *
 * A link that contains neither the page's host nor an http(s) scheme gets
 * "http://<host>" prepended; links whose host equals the page's host are
 * passed to add_url().
 *
 * @param array $data_main Reads the keys 'url' and 'language'; the whole
 *                         array is also forwarded to add_url().
 *
 * @return void
 */
function crawl($data_main)
{
    $patterns = find_website_patterns($data_main['url']);
    $html = file_get_html($data_main['url']);
    if (!$html) {
        // Nothing to parse; warn instead of failing on ->find().
        trigger_error("crawl(): could not load " . $data_main['url'], E_USER_WARNING);
        return;
    }

    // Sentence splitter. Built once because it does not depend on the element.
    $re = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.              # Skip either "Mr."
        | Mrs\\.             # or "Mrs.",
        | Ms\\.              # or "Ms.",
        | Jr\\.              # or "Jr.",
        | Dr\\.              # or "Dr.",
        | Prof\\.            # or "Prof.",
        | Sr\\.              # or "Sr.",
        | T\\.V\\.A\\.         # or "T.V.A.",
        | St\\.              # or "St.",
        | \\s[A-Z]\\.         # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                 # Split on whitespace between sentences.
        /ix';

    foreach ($patterns as $pattern) {
        foreach ($html->find($pattern) as $element) {
            $para = strip_tags($element->innertext);
            $no_bracket = preg_replace('/\([^)]+\)/', '', $para);
            // Stop at the first closing bracket so that "[1] text [2]" loses
            // only the two markers and not the text between them.
            $no_bracket = preg_replace('/\[[^\]]+\]/', '', $no_bracket);

            $sentences = preg_split($re, $no_bracket, -1, PREG_SPLIT_NO_EMPTY);
            foreach ($sentences as $sentence) {
                if (strlen($sentence) > 10 && count(get_num_of_words($sentence)) > 2) {
                    // Collapse whitespace runs; the result must be assigned,
                    // preg_replace() does not modify its subject in place.
                    $sentence = preg_replace('/\s+/', ' ', $sentence);
                    // Remove trailing literal <br> / <br/> tags. rtrim() is not
                    // usable here: its second argument is a character mask, so
                    // it would also eat trailing "b", "r", "<", ">" and "/".
                    $sentence = preg_replace('/(?:<br\s*\/?>)+$/i', '', $sentence);
                    add_words_into_dic($sentence, $data_main['language']);
                    add_sentence($sentence, $data_main['language']);
                }
            }
        }
    }

    $p = parse_url($data_main['url']);
    $pageHost = isset($p["host"]) ? $p["host"] : null;

    foreach ($html->find('a') as $got_url) {
        $href = $got_url->href;
        if ($href != "") {
            if (
                strpos($href, $pageHost) === false
                && strpos($href, 'http://') === false
                && strpos($href, 'https://') === false
            ) {
                $href = 'http://' . $pageHost . $href;
            }
            // parse_url() returns false for malformed URLs and omits 'host'
            // for scheme-only links; treat both as "no host".
            $parts = parse_url($href);
            $linkHost = (is_array($parts) && isset($parts["host"])) ? $parts["host"] : null;
            if ($pageHost == $linkHost) {
                add_url($href, $data_main);
            }
        }
    }
}