/**
 * Harvest sentences and follow-up links from a single page.
 *
 * Sentences are taken from every `div[id=bodyContent] p` element of the page
 * and handed to add_sentence(); every anchor whose href starts with `/wiki/`
 * and contains no colon is handed to add_url() as an absolute URL on the same
 * host. A two-line HTML summary is echoed at the end.
 *
 * NOTE(review): the selector and the `/wiki/` prefix look MediaWiki-specific;
 * confirm against the callers before reusing this for other sites.
 *
 * @param string $url     Address of the page to fetch and parse.
 * @param mixed  $path    Not used by this function; kept for caller compatibility.
 * @param mixed  $website Passed through unchanged to add_url().
 * @param mixed  $rec     Passed through unchanged to add_url().
 * @param mixed  $lang    Passed through unchanged to add_sentence().
 *
 * @return void
 */
function crawl($url, $path, $website, $rec, $lang)
{
    $html = file_get_html($url);
    if (!$html) {
        // Without this guard a failed fetch ends in a fatal "member function
        // find() on bool" error and takes the whole crawl run down with it.
        trigger_error('crawl(): unable to load ' . $url, E_USER_WARNING);
        return;
    }

    // Loop-invariant, so it is built once instead of once per paragraph.
    // Extended (x) mode: the "#" comments below run to the end of each line,
    // which is why the pattern must stay on multiple lines.
    $sentenceSplitter = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.             # Skip either "Mr."
        | Mrs\\.            # or "Mrs.",
        | Ms\\.             # or "Ms.",
        | Jr\\.             # or "Jr.",
        | Dr\\.             # or "Dr.",
        | Prof\\.           # or "Prof.",
        | Sr\\.             # or "Sr.",
        | T\\.V\\.A\\.      # or "T.V.A.",
        | St\\.             # or "St.",
        | \\s[A-Z]\\.       # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                # Split on whitespace between sentences.
        /ix';

    $sentenceCount = 0;
    foreach ($html->find('div[id=bodyContent] p') as $paragraph) {
        $text = strip_tags($paragraph->innertext);

        // Drop parenthesised asides and bracketed markers such as "[1]".
        // The bracket pattern used to exclude ")" instead of "]", so it could
        // swallow everything between the first "[" and the last "]".
        $text = preg_replace('/\([^)]+\)/', '', $text);
        $text = preg_replace('/\[[^\]]+\]/', '', $text);

        $sentences = preg_split($sentenceSplitter, $text, -1, PREG_SPLIT_NO_EMPTY);
        if ($sentences === false) {
            continue; // PCRE failure on this paragraph; nothing usable to add.
        }

        foreach ($sentences as $sentence) {
            // Keep only reasonably long, colon-free sentences that start with
            // an upper-case letter or a digit.
            if (strlen($sentence) <= 10 || strpos($sentence, ':') !== false) {
                continue;
            }
            if (!ctype_upper($sentence[0]) && !ctype_digit($sentence[0])) {
                continue;
            }

            add_sentence($sentence, $lang);
            $sentenceCount++;
        }
    }

    $linkCount = 0;
    $pageParts = parse_url($url);
    if (is_array($pageParts) && isset($pageParts['host'])) {
        $origin = 'http://' . $pageParts['host'];

        foreach ($html->find('a') as $anchor) {
            $href = (string) $anchor->href;

            // Only colon-free "/wiki/" links are followed.
            if (!startsWith($href, '/wiki/') || strpos($href, ':') !== false) {
                continue;
            }

            // Strip the fragment so "#section" variants map to one URL. A local
            // copy is trimmed; the parsed DOM node is left untouched.
            $hashAt = strpos($href, '#');
            if ($hashAt !== false) {
                $href = substr($href, 0, $hashAt);
            }

            // NOTE(review): the meaning of the literal 3 is defined by
            // add_url(), which is not visible here.
            add_url($origin . $href, $website, 3, $rec);
            $linkCount++;
        }
    }

    // $url used to be reset to "" before this point, so the summary always
    // printed an empty address. It is escaped because it ends up in HTML.
    $shownUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    echo $sentenceCount . " sentences have been added from URL : " . $shownUrl . "<br/>";
    echo $linkCount . " links have been added from URL : " . $shownUrl;
}
/**
 * Harvest sentences and same-host links from the page described by $data_main.
 *
 * For every selector returned by find_website_patterns() the matching elements
 * are stripped of tags, split into sentences, and each sentence that passes
 * the length/word-count filter is passed to add_words_into_dic() and
 * add_sentence(). Afterwards every anchor on the page that resolves to the
 * same host as the page itself is passed to add_url().
 *
 * @param array $data_main Crawl job description. This function reads the keys
 *                         'url' (page to fetch) and 'language' (forwarded with
 *                         each sentence); the whole array is forwarded to
 *                         add_url() unchanged.
 *
 * @return void
 */
function crawl($data_main)
{
    $pageUrl = $data_main['url'];

    $patterns = find_website_patterns($pageUrl);
    $html = file_get_html($pageUrl);
    if (!$html) {
        // Without this guard a failed fetch ends in a fatal "member function
        // find() on bool" error.
        trigger_error('crawl(): unable to load ' . $pageUrl, E_USER_WARNING);
        return;
    }

    // Loop-invariant, so it is built once. Extended (x) mode: the "#" comments
    // run to the end of each line, so the pattern must stay multi-line.
    $sentenceSplitter = '/# Split sentences on whitespace between them.
        (?<=                # Begin positive lookbehind.
          [.!?]             # Either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # Begin negative lookbehind.
          Mr\\.             # Skip either "Mr."
        | Mrs\\.            # or "Mrs.",
        | Ms\\.             # or "Ms.",
        | Jr\\.             # or "Jr.",
        | Dr\\.             # or "Dr.",
        | Prof\\.           # or "Prof.",
        | Sr\\.             # or "Sr.",
        | T\\.V\\.A\\.      # or "T.V.A.",
        | St\\.             # or "St.",
        | \\s[A-Z]\\.       # or initials ex: "George W. Bush",
                            # or... (you get the idea).
        )                   # End negative lookbehind.
        \\s+                # Split on whitespace between sentences.
        /ix';

    foreach ($patterns as $pattern) {
        foreach ($html->find($pattern) as $element) {
            $text = strip_tags($element->innertext);

            // Drop parenthesised asides and bracketed markers such as "[1]".
            // The bracket pattern used to exclude ")" instead of "]", so it
            // could swallow everything between the first "[" and last "]".
            $text = preg_replace('/\([^)]+\)/', '', $text);
            $text = preg_replace('/\[[^\]]+\]/', '', $text);

            $sentences = preg_split($sentenceSplitter, $text, -1, PREG_SPLIT_NO_EMPTY);
            if ($sentences === false) {
                continue; // PCRE failure on this element; nothing usable.
            }

            foreach ($sentences as $sentence) {
                if (strlen($sentence) <= 10 || count(get_num_of_words($sentence)) <= 2) {
                    continue;
                }

                // Collapse whitespace runs. The result of this call used to be
                // thrown away, so the normalisation never took effect.
                $sentence = preg_replace('/\s+/', ' ', $sentence);

                // Remove a literal trailing <br> / <br/>. rtrim() takes a
                // character mask, not a suffix, so rtrim($s, "<br>") chopped
                // any trailing "b", "r", "<", ">" or "/" off real words.
                $sentence = preg_replace('/(?:<br\s*\/?>)+$/i', '', $sentence);

                add_words_into_dic($sentence, $data_main['language']);
                add_sentence($sentence, $data_main['language']);
            }
        }
    }

    $base = parse_url($pageUrl);
    if (!is_array($base) || !isset($base['host'])) {
        return; // No host to compare against, so no link can qualify.
    }

    $host = $base['host'];
    $origin = 'http://' . $host;
    $basePath = isset($base['path']) ? $base['path'] : '/';
    $lastSlash = strrpos($basePath, '/');
    // Directory part of the current page, used for "page.html"-style links.
    $baseDir = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

    foreach ($html->find('a') as $anchor) {
        $href = (string) $anchor->href;
        if ($href === '') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            $absolute = $href;
        } elseif (strpos($href, '//') === 0) {
            // Protocol-relative link: it already names its own host.
            $absolute = 'http:' . $href;
        } elseif ($href[0] === '#' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
            // Same-page fragment or a non-HTTP scheme (mailto:, javascript:, ...).
            continue;
        } elseif ($href[0] === '/') {
            $absolute = $origin . $href;
        } elseif ($href[0] === '?') {
            $absolute = $origin . $basePath . $href;
        } else {
            // Relative path. It used to be glued straight onto the host
            // ("http://example.compage.html") and was then silently dropped
            // by the host comparison below.
            $absolute = $origin . $baseDir . $href;
        }

        // parse_url() may fail or return no host; check before comparing.
        $target = parse_url($absolute);
        if (is_array($target) && isset($target['host']) && $target['host'] === $host) {
            add_url($absolute, $data_main);
        }
    }
}