$tool_content .= " <tr> <td class='smaller'><a href='show.php/$course_code/$ebook_id/$display_id/' target='_blank'>" . q($files[$id_map[$file_id]]) . "</a></td> <td><input type='text' name='title[$file_id]' size='30' value='" . q($r->subsection_title) . "'></td> <td>" . selection($sections, "sid[$file_id]", $r->sid, 'class="form-control"') . "</td> <td class='center' style='width: 50px;'> <input type='hidden' name='oldssid[$file_id]' value='$r->ssid'> <input type='text' class='form-control' name='ssid[$file_id]' value='" . q($r->pssid) . "'> </td> </tr>"; unset($files[$id_map[$file_id]]); } foreach ($files as $key => $file) { $path = $paths[$key]; $file_id = $file_ids[$key]; $title = get_html_title($basedir . $path); $tool_content .= " <tr class='not_visible'> <td class='smaller'><a href='show.php/$course_code/$ebook_id/_" . q($file) . "' target='_blank'>" . q($file) . "</a></td> <td><input type='text' name='title[$file_id]' size='30' value='" . q($title) . "' /></td> <td>" . selection($sections, "sid[$file_id]", ' ', 'class="form-control"') . "</td> <td class='center' style='width: 50px;'> <input class='form-control' type='text' name='ssid[$file_id]'> </td> </tr>"; } $tool_content .= " <tr> <td colspan='3'> </td> <td><input class='btn btn-primary' type='submit' name='submit' value='$langSubmit'></td> </table>
/**
 * Fetches a few other pages linked from $this->html and concatenates their HTML.
 *
 * Tries to find alternate pages so that "common images" (those that also
 * appear on other pages) can be checked for and ignored.
 *
 * Outline:
 *  1. Extract <a href> targets from $this->html that are absolute URLs on the
 *     page's host (or on $this->site / the redirected host) or relative links,
 *     skipping media files, nofollow links, anchors, feeds, searches, etc.
 *  2. Rank every candidate by a "level" (path segments shared with the current
 *     page, with penalties) and, inside a level, by a normalized Levenshtein
 *     distance via the 'sort_url_distance_items' comparator.
 *  3. Only when more than two candidates exist, fetch them in ranked order
 *     with get_url() and append the shortened HTML of the accepted ones to
 *     $this->other_html.
 *
 * Side effects: sets $this->path_query and $this->other_html, performs HTTP
 * requests through get_url(), and echoes HTML comments when $this->debug is set.
 *
 * @return string|false The accumulated HTML, or false when nothing was collected.
 */
function get_other_html() {
    $selection = array();
    $levels = array();

    // Tries to find an alternate page to check for "common images" and ignore them
    $this->other_html = false;
    $this->path_query = unify_path_query(
        isset($this->parsed_url['path']) ? $this->parsed_url['path'] : null,
        isset($this->parsed_url['query']) ? $this->parsed_url['query'] : null
    );
    $my_path_len = path_count($this->path_query);

    if (!$this->html) {
        return $this->other_html;
    }

    if ($this->debug) {
        echo "<!-- Analyzing html: " . strlen($this->html) . " bytes -->\n";
    }

    $my_host = isset($this->parsed_url['host']) ? $this->parsed_url['host'] : '';
    $redirected_host = isset($this->parsed_redirected['host']) ? $this->parsed_redirected['host'] : null;

    // Alternatives accepted inside href="...": absolute URLs on known hosts, then relative links.
    $regexp = '[a-z]+?:\\/\\/' . preg_quote($my_host, '/') . '\\/[^\\"\'>]+?';
    if ($this->site) {
        $parsed = parse_url($this->site);
        $site_host = isset($parsed['host']) ? $parsed['host'] : null;
        if ($site_host != $my_host) {
            $regexp .= '|' . preg_quote($this->site, '/') . '\\/[^\\"\'>]+?';
        }
    }
    if ($this->redirected && $redirected_host != $my_host) {
        $regexp .= '|[a-z]+?:\\/\\/' . preg_quote((string) $redirected_host, '/') . '\\/[^\\"\'>]+?';
    }
    $regexp .= '|[\\/\\.][^\\"\']+?|\\w[^\\"\':]+?';

    // Date-like path prefixes (/yyyy/mm/dd/nn/, /yyyy/mm/dd/, /yyyy/mm/), most
    // specific first. The key is the level penalty applied when both paths
    // reduce to the same value under that pattern.
    $date_patterns = array(
        4 => '#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/*\\d{2,}/).*#',
        3 => '#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/).*#',
        2 => '#.*?(/\\d{4,}/*\\d{2,}/).*#',
    );

    $seen = array();
    $visited = array();
    if (preg_match_all("/<a[^>]*\\shref *= *[\"\\']({$regexp})[\"\\']/is", $this->html, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            if (preg_match('/\\.(gif|jpg|zip|png|jpeg|rar|mp[1-4]|mov|mpeg|mpg|pdf|ps|gz|tar|tif)($|\\s)/i', $match[1])
                || preg_match('/nofollow/i', $match[0])
                || preg_match('/^#/', $match[1])
                || preg_match('/\\?cat=\\d+$/i', $match[1])
                || preg_match('/(feed|rss|atom|trackback|search|download|gravatar)\\W/i', $match[1])) {
                continue;
            }

            // href values are HTML-escaped in the markup: decode "&amp;" back to "&".
            // (The previous pattern replaced "&" with "&", which did nothing.)
            $url = preg_replace('/&amp;/i', '&', $match[1]);
            $url = preg_replace('/#.+/i', '', $url);
            $url = preg_replace('/[\\?&]\\s*$/i', '', $url); // Some urls with void &'s at the end
            $url = build_full_url(trim($url), $this->url);
            if (!$url) {
                continue;
            }
            if (isset($seen[$url])) {
                continue;
            }
            $seen[$url] = true;
            if ($this->debug) {
                echo "<!-- Adding before analyzing: {$url} -->\n";
            }

            $parsed_match = parse_url($url);
            $path_query_match = unify_path_query(
                isset($parsed_match['path']) ? $parsed_match['path'] : null,
                isset($parsed_match['query']) ? $parsed_match['query'] : null
            );
            $other_path_len = path_count($path_query_match);
            if (isset($visited[$path_query_match]) || $this->path_query == $path_query_match) {
                continue;
            }
            $visited[$path_query_match] = true;

            if ($my_path_len > 2 && $my_path_len < $other_path_len
                && strncmp($this->path_query, $path_query_match, strlen($this->path_query)) == 0) {
                if ($this->debug) {
                    echo "<!-- Skipped because it is a subpage-->\n";
                }
                continue;
            }

            $equals = min(path_equals($path_query_match, $this->path_query), $other_path_len - 1);
            if ($equals > 0 && $other_path_len != $my_path_len) {
                // Reset the penalty for every link: it used to leak from the
                // previous iteration (or be undefined) when no pattern matched.
                $c = 0;
                foreach ($date_patterns as $penalty => $pattern) {
                    if (preg_replace($pattern, '$1', $path_query_match) == preg_replace($pattern, '$1', $this->path_query)) {
                        $c = $penalty;
                        break;
                    }
                }
                $equals = max(0, $equals - $c);
            }

            // Penalize with a level if one has query and the other does not
            if (empty($parsed_match['query']) != empty($this->parsed_url['query'])) {
                $equals = $equals - 2;
            }

            // Levenshtein distance scaled by the shorter/longer length ratio.
            // The longer length cannot be 0 here: equal (hence both-empty) paths were skipped above.
            $len_match = strlen($path_query_match);
            $len_mine = strlen($this->path_query);
            $distance = levenshtein($path_query_match, $this->path_query)
                * min($len_match, $len_mine) / max($len_match, $len_mine);

            $levels[$equals][] = array($url, $distance);
            if ($this->debug) {
                echo "<!-- Adding ({$equals}, {$distance}): " . $match[1] . " ({$path_query_match}) -->\n";
            }
        }

        // Insert in selection ordered by level and the distance
        krsort($levels);
        foreach ($levels as $level => $items) {
            usort($items, 'sort_url_distance_items');
            foreach ($items as $item) {
                $selection[] = $item[0];
            }
        }
    }

    if (count($selection) > 2) { // we avoid those simple pages with few links to other pages
        $max_to_check = max(2, min(4, count($selection) / 5));
        $n = $checked = $same_title = $other_title = $images_total = 0;
        $paths = array();
        $paths_visited = array();
        $paths[path_sub_path($this->path_query, 2)] = $my_path_len;

        foreach ($selection as $url) {
            if ($checked > 10) {
                break;
            }

            $parsed = parse_url($url);
            $unified = unify_path_query(
                isset($parsed['path']) ? $parsed['path'] : null,
                isset($parsed['query']) ? $parsed['query'] : null
            );
            $first_paths = path_sub_path($unified, 2);

            // Allow at most two candidates sharing the same leading path.
            if (!isset($paths_visited[$first_paths])) {
                $paths_visited[$first_paths] = 0;
            }
            $paths_visited[$first_paths]++;
            if ($paths_visited[$first_paths] > 2) {
                if ($this->debug) {
                    echo "<!-- Ignoring {$url} by equal path: " . $first_paths . " -->\n";
                }
                continue;
            }

            $paths_len = path_count($unified);
            if (!empty($paths[$first_paths]) && $paths_len < $paths[$first_paths]) {
                // Don't get twice a page with similar but shorter paths
                if ($this->debug) {
                    echo "<!-- Ignoring {$url} by previous path: {$first_paths} and lenght: {$paths_len}] -->\n";
                }
                continue;
            }

            if ($this->debug) {
                echo "<!-- Checking: {$url} -->\n";
            }
            $checked++;
            $res = get_url($url, $this->url, null, false);
            if (!$res || !preg_match('/text\\/html/i', $res['content_type'])) {
                continue;
            }

            if ($res['location'] != $url) {
                // The request was redirected: decide whether the final page is still useful.
                $location_parsed = parse_url($res['location']);
                $location_unified = unify_path_query(
                    isset($location_parsed['path']) ? $location_parsed['path'] : null,
                    isset($location_parsed['query']) ? $location_parsed['query'] : null
                );
                $location_host = isset($location_parsed['host']) ? $location_parsed['host'] : null;
                $url_host = isset($parsed['host']) ? $parsed['host'] : null;

                if ($location_host != $url_host && $location_host != $redirected_host) {
                    if ($this->debug) {
                        echo "<!-- Redirected to another host: " . $res['location'] . ", skipping -->\n";
                    }
                    continue;
                } elseif ($location_unified == $this->path_query) {
                    if ($this->debug) {
                        echo "<!-- Redirected to same address: " . $res['location'] . ", skipping -->\n";
                    }
                    continue;
                } elseif (path_count($location_unified) < $my_path_len && path_count($location_unified) < path_count($unified)) {
                    if ($this->debug) {
                        echo "<!-- Redirected to a shorter path: {$url} -> " . $res['location'] . ", skipping -->\n";
                    }
                    continue;
                }
            }

            // Pages without any <img> tag cannot help.
            $images_count = preg_match_all('/<img .+?>/is', $res['content'], $dummy);
            if (!$images_count) {
                continue;
            }
            $images_total += $images_count;

            // Check if it has the same title
            if (empty($this->title) || $this->title == get_html_title($res['content'])) {
                $same_title++;
                // Next iff we found less that 3 pages, otherwise we asume all pages have same title
                if ($same_title < 3 || $other_title) {
                    if ($this->debug) {
                        echo "<!-- Skipping: same title {$url} -->\n";
                    }
                    continue;
                }
            } else {
                $other_title++;
            }

            if ($this->debug) {
                echo "<!-- Other: read {$url} -->\n";
            }
            $paths[$first_paths] = isset($paths[$first_paths]) ? max($paths_len, $paths[$first_paths]) : $paths_len;
            $n++;
            $this->other_html .= $this->shorten_html($res['content'], 100000) . "<!-- END part {$n} -->\n";
            if ($n > $max_to_check || $images_total > $this->images_count * 2) {
                break;
            }
        }
    }

    return $this->other_html;
}
/**
 * Requests $url over a raw socket and summarises the HTTP response.
 *
 * Sends a GET request, reads up to 8 KiB of the response and fills the array
 * produced by get_empty_http_meta(). 301/302 responses and HTML meta
 * redirects are followed recursively; the deepest result wins and gets
 * 'redirect' => 1. Progress/error messages are echoed.
 *
 * 'state' values assigned here:
 *   0 redirect without a usable Location, 1 OK (HTTP 200),
 *   3 connection error (including an unparsable URL), 4 timed out,
 *   5 early EOF, 6 HTTP 404, 7 other 4xx, 8 5xx.
 *
 * NOTE(review): the connection always uses plain TCP with port 80 as the
 * default, regardless of the URL scheme; confirm whether https URLs are expected.
 *
 * @param string $url          URL to inspect.
 * @param int    $depth        Current redirect depth; recursion stops above 3.
 * @param string $status_chain Not used inside this function.
 * @param string $url_chain    Not used inside this function.
 * @return array|null The meta array, or null when $depth exceeds 3 (a recursive
 *                    caller then ends up with an array holding only 'redirect').
 */
function get_http_meta($url, $depth = 0, $status_chain = '', $url_chain = '') {
    // how deep to follow http redirects
    if ($depth > 3) {
        return;
    }
    $depth++;

    $connect_timeout = 3;
    $read_timeout = 6;
    ini_set('default_socket_timeout', $connect_timeout);

    $http_meta = get_empty_http_meta();

    // parse_url() returns false on seriously malformed input and omits 'host'
    // for relative URLs; neither can be connected to.
    $purl = parse_url($url);
    if (!is_array($purl) || empty($purl['host'])) {
        echo "(connection error - invalid url) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }

    // A name-resolution pre-check used to live here; it was disabled because
    // it seemed to fail for all addresses.

    // connect and get http return code
    $port = array_key_exists('port', $purl) ? $purl['port'] : 80;
    $sock = fsockopen($purl['host'], $port, $errno, $errstr, $connect_timeout);
    if (!$sock) {
        echo "(connection error - {$errstr} ({$errno})) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }
    stream_set_blocking($sock, FALSE);
    stream_set_timeout($sock, $read_timeout);

    $url_path = '/';
    if (array_key_exists('path', $purl)) {
        $url_path = $purl['path'];
        if (array_key_exists('query', $purl)) {
            $url_path .= '?' . $purl['query'];
        }
    }

    // we set 'accept-language' as some sites will have different <title> depending on your geoip
    // e.g. flickr
    // some sites seem to ignore you if you don't send a user-agent
    // e.g. digg (but maybe that's for the better)
    fputs($sock, "GET {$url_path} HTTP/1.1\r\n");
    fputs($sock, "Host: " . $purl['host'] . "\r\n");
    fputs($sock, "Connection: close\r\n");
    fputs($sock, "Accept-Language: en-gb, en;q=0.8\r\n");
    fputs($sock, "User-Agent: URL catcher\r\n");
    fputs($sock, "\r\n");

    // Every exit below closes the socket; the early returns used to leak it.
    $status = stream_get_meta_data($sock);
    if ($status['timed_out']) {
        fclose($sock);
        echo "(status timed_out) ";
        $http_meta['state'] = 4;
        return $http_meta;
    }
    if (feof($sock)) {
        fclose($sock);
        echo "(socket early EOF) ";
        $http_meta['state'] = 5;
        return $http_meta;
    }

    $contents = stream_get_contents($sock, 1024 * 8);
    fclose($sock);
    if ($contents === FALSE) {
        echo "stream_get_contents returned FALSE ";
        return $http_meta;
    }

    $http_status = get_http_status($contents);
    if (!is_numeric($http_status)) {
        return $http_meta;
    }
    $http_meta['http_status'] = $http_status;
    $http_meta['location'] = $url;

    // handle different status codes
    if ($http_status == "404") {
        echo "(http 404) ";
        $http_meta['state'] = 6;
    } elseif (substr($http_status, 0, 1) == 4) {
        echo "(http other 40x) ";
        $http_meta['state'] = 7;
    } elseif (substr($http_status, 0, 1) == 5) {
        echo "(http other 50x) ";
        $http_meta['state'] = 8;
    } elseif ($http_status == "301" or $http_status == "302") {
        // 301 - Moved Permanently
        // 302 - Found / Moved
        $new_url = get_http_location($contents);
        if (!$new_url) {
            $http_meta['state'] = 0;
            return $http_meta;
        }
        // recurse: discard our meta in favour of the deepest url
        $http_meta = get_http_meta($new_url, $depth);
        // set any meta we want to persist here
        $http_meta['redirect'] = 1;
    } elseif ($http_status == "200") {
        $http_meta['state'] = 1;
        $http_meta['content_length'] = get_http_content_length($contents);
        $http_meta['content_type'] = get_http_content_type($contents);

        // Only look for a title in HTML. Without a content type, guess from the URL suffix.
        if ($http_meta['content_type']) {
            $check_html = preg_match("/^text\\/html/", $http_meta['content_type']);
        } else {
            $check_html = !preg_match("/(jpg|jpeg|gif|png|wav|mp3|avi|wmv|mpg)\$/", $url);
        }
        if ($check_html) {
            $http_meta['html_title'] = get_html_title($contents);
            if (!$http_meta['html_title']) {
                $http_meta['html_title'] = get_html_h1($contents);
            }
        }

        // html meta redirect?
        // this could be placed better to avoid doing all the work above
        $html_meta_redirect = get_html_meta_redirect($contents);
        if ($html_meta_redirect) {
            $http_meta = get_http_meta($html_meta_redirect, $depth);
            $http_meta['redirect'] = 1;
        }
    }

    return $http_meta;
}