Exemple #1
0
     $tool_content .= "
          <tr>              
              <td class='smaller'><a href='show.php/$course_code/$ebook_id/$display_id/' target='_blank'>" . q($files[$id_map[$file_id]]) . "</a></td>
              <td><input type='text' name='title[$file_id]' size='30' value='" . q($r->subsection_title) . "'></td>
              <td>" . selection($sections, "sid[$file_id]", $r->sid, 'class="form-control"') . "</td>
              <td class='center' style='width: 50px;'>
                  <input type='hidden' name='oldssid[$file_id]' value='$r->ssid'>
                  <input type='text' class='form-control' name='ssid[$file_id]' value='" . q($r->pssid) . "'>
              </td>
          </tr>";
     unset($files[$id_map[$file_id]]);        
 }
 foreach ($files as $key => $file) {        
     $path = $paths[$key];
     $file_id = $file_ids[$key];
     $title = get_html_title($basedir . $path);
     $tool_content .= "
      <tr class='not_visible'>          
          <td class='smaller'><a href='show.php/$course_code/$ebook_id/_" . q($file) . "' target='_blank'>" . q($file) . "</a></td>
          <td><input type='text' name='title[$file_id]' size='30' value='" . q($title) . "' /></td>
          <td>" . selection($sections, "sid[$file_id]", ' ', 'class="form-control"') . "</td>
          <td class='center' style='width: 50px;'>
             <input class='form-control' type='text' name='ssid[$file_id]'>
         </td>
      </tr>";        
 }
 $tool_content .= "
  <tr>
    <td colspan='3'>&nbsp;</td>
    <td><input class='btn btn-primary' type='submit' name='submit' value='$langSubmit'></td>
  </table>
Exemple #2
0
 /**
  * Collects the HTML of a few "sibling" pages linked from the current page.
  *
  * The links found in $this->html are filtered (binary files, nofollow,
  * anchors, feeds...), ranked by how similar their path/query is to the
  * current page, and then a handful of them are downloaded with get_url().
  * The shortened HTML of every accepted page is appended to
  * $this->other_html so callers can detect images that are common to
  * several pages.
  *
  * Reads:  $this->html, $this->parsed_url, $this->url, $this->site,
  *         $this->redirected, $this->parsed_redirected, $this->title,
  *         $this->images_count, $this->debug.
  * Writes: $this->other_html, $this->path_query.
  *
  * @return string|false Concatenated HTML of the selected pages, or false
  *                      when no alternate page was accepted.
  */
 function get_other_html()
 {
     $selection = array();
     $levels = array();
     // Tries to find an alternate page to check for "common images" and ignore them
     $this->other_html = false;
     $this->path_query = unify_path_query($this->parsed_url['path'], $this->parsed_url['query']);
     $my_path_len = path_count($this->path_query);
     if ($this->html) {
         if ($this->debug) {
             echo "<!-- Analyzing html: " . strlen($this->html) . " bytes -->\n";
         }
         // Build the alternation of accepted href forms: absolute urls in the
         // same host, in the configured site, in the redirected host, and
         // relative urls.
         $regexp = '[a-z]+?:\\/\\/' . preg_quote($this->parsed_url['host']) . '\\/[^\\"\'>]+?';
         if ($this->site) {
             $parsed = parse_url($this->site);
             $site_host = isset($parsed['host']) ? $parsed['host'] : null;
             if ($site_host != $this->parsed_url['host']) {
                 $regexp .= '|' . preg_quote($this->site, '/') . '\\/[^\\"\'>]+?';
             }
         }
         if ($this->redirected && $this->parsed_redirected['host'] != $this->parsed_url['host']) {
             $regexp .= '|[a-z]+?:\\/\\/' . preg_quote($this->parsed_redirected['host']) . '\\/[^\\"\'>]+?';
         }
         $regexp .= '|[\\/\\.][^\\"\']+?|\\w[^\\"\':]+?';
         $seen = array();
         $visited = array();
         if (preg_match_all("/<a[^>]*\\shref *= *[\"\\']({$regexp})[\"\\']/is", $this->html, $matches, PREG_SET_ORDER)) {
             foreach ($matches as $match) {
                 // Skip binary/media files, nofollow links, pure anchors,
                 // category listings and feed/search-like urls.
                 if (preg_match('/\\.(gif|jpg|zip|png|jpeg|rar|mp[1-4]|mov|mpeg|mpg|pdf|ps|gz|tar|tif)($|\\s)/i', $match[1]) || preg_match('/nofollow/i', $match[0]) || preg_match('/^#/', $match[1]) || preg_match('/\\?cat=\\d+$/i', $match[1]) || preg_match('/(feed|rss|atom|trackback|search|download|gravatar)\\W/i', $match[1])) {
                     continue;
                 }
                 $url = preg_replace('/&amp;/i', '&', $match[1]);
                 $url = preg_replace('/#.+/i', '', $url);
                 $url = preg_replace('/[\\?&]\\s*$/i', '', $url);
                 // Some urls with void &'s at the end
                 $url = build_full_url(trim($url), $this->url);
                 if (!$url) {
                     continue;
                 }
                 if (isset($seen[$url])) {
                     continue;
                 }
                 $seen[$url] = true;
                 if ($this->debug) {
                     echo "<!-- Adding before analyzing: {$url} -->\n";
                 }
                 // parse_url() may return false or omit keys; missing parts
                 // are passed on as null.
                 $parsed_match = parse_url($url);
                 $match_path = isset($parsed_match['path']) ? $parsed_match['path'] : null;
                 $match_query = isset($parsed_match['query']) ? $parsed_match['query'] : null;
                 $path_query_match = unify_path_query($match_path, $match_query);
                 $other_path_len = path_count($path_query_match);
                 if (isset($visited[$path_query_match]) || $this->path_query == $path_query_match) {
                     continue;
                 }
                 $visited[$path_query_match] = true;
                 if ($my_path_len > 2 && $my_path_len < $other_path_len && strncmp($this->path_query, $path_query_match, strlen($this->path_query)) == 0) {
                     if ($this->debug) {
                         echo "<!-- Skipped because it is a subpage-->\n";
                     }
                     continue;
                 }
                 $equals = min(path_equals($path_query_match, $this->path_query), $other_path_len - 1);
                 if ($equals > 0 && $other_path_len != $my_path_len) {
                     // Number of leading date-like path components shared by
                     // both urls. It must be reset for every link: leaving
                     // it unset would reuse the penalty computed for a
                     // previous link (or read an undefined variable).
                     $c = 0;
                     // TODO: convert these checks in one iteration
                     if (preg_replace('#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/*\\d{2,}/).*#', '$1', $path_query_match) == preg_replace('#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/*\\d{2,}/).*#', '$1', $this->path_query)) {
                         $c = 4;
                     } elseif (preg_replace('#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/).*#', '$1', $path_query_match) == preg_replace('#.*?(/\\d{4,}/*\\d{2,}/*\\d{2,}/).*#', '$1', $this->path_query)) {
                         $c = 3;
                     } elseif (preg_replace('#.*?(/\\d{4,}/*\\d{2,}/).*#', '$1', $path_query_match) == preg_replace('#.*?(/\\d{4,}/*\\d{2,}/).*#', '$1', $this->path_query)) {
                         $c = 2;
                     }
                     $equals = max(0, $equals - $c);
                 }
                 // Penalize with a level if one has query and the other does not
                 if (empty($match_query) != empty($this->parsed_url['query'])) {
                     $equals = $equals - 2;
                 }
                 // Edit distance scaled by the ratio between the shorter and
                 // the longer string.
                 $distance = levenshtein($path_query_match, $this->path_query) * min(strlen($path_query_match), strlen($this->path_query)) / max(strlen($path_query_match), strlen($this->path_query));
                 $item = array($url, $distance);
                 $levels[$equals][] = $item;
                 if ($this->debug) {
                     echo "<!-- Adding ({$equals}, {$distance}): " . $match[1] . " ({$path_query_match}) -->\n";
                 }
             }
             // Insert in selection ordered by level and the distance
             krsort($levels);
             foreach ($levels as $level => $items) {
                 usort($items, 'sort_url_distance_items');
                 foreach ($items as $item) {
                     $selection[] = $item[0];
                 }
             }
         }
         if (count($selection) > 2) {
             // we avoid those simple pages with few links to other pages
             $max_to_check = max(2, min(4, count($selection) / 5));
             $n = $checked = $same_title = $other_title = $images_total = 0;
             $paths = array();
             $paths_visited = array();
             $paths[path_sub_path($this->path_query, 2)] = $my_path_len;
             foreach ($selection as $url) {
                 if ($checked > 10) {
                     break;
                 }
                 $parsed = parse_url($url);
                 $parsed_path = isset($parsed['path']) ? $parsed['path'] : null;
                 $parsed_query = isset($parsed['query']) ? $parsed['query'] : null;
                 $parsed_host = isset($parsed['host']) ? $parsed['host'] : null;
                 $unified = unify_path_query($parsed_path, $parsed_query);
                 $first_paths = path_sub_path($unified, 2);
                 if (!isset($paths_visited[$first_paths])) {
                     $paths_visited[$first_paths] = 0;
                 }
                 $paths_visited[$first_paths] += 1;
                 if ($paths_visited[$first_paths] > 2) {
                     if ($this->debug) {
                         echo "<!-- Ignoring {$url} by equal path: " . $first_paths . " -->\n";
                     }
                     continue;
                 }
                 $paths_len = path_count($unified);
                 if (!empty($paths[$first_paths]) && $paths_len < $paths[$first_paths]) {
                     // Don't get twice a page with similar but shorter paths
                     if ($this->debug) {
                         echo "<!-- Ignoring {$url} by previous path: {$first_paths} and lenght: {$paths_len}] -->\n";
                     }
                     continue;
                 }
                 if ($this->debug) {
                     echo "<!-- Checking: {$url} -->\n";
                 }
                 $checked++;
                 $res = get_url($url, $this->url, null, false);
                 if (!$res || !preg_match('/text\\/html/i', $res['content_type'])) {
                     continue;
                 }
                 if ($res['location'] != $url) {
                     // The request was redirected: discard it when it left
                     // the host, came back to the current page, or went to
                     // a shorter path.
                     $location_parsed = parse_url($res['location']);
                     $location_path = isset($location_parsed['path']) ? $location_parsed['path'] : null;
                     $location_query = isset($location_parsed['query']) ? $location_parsed['query'] : null;
                     $location_host = isset($location_parsed['host']) ? $location_parsed['host'] : null;
                     $redirected_host = isset($this->parsed_redirected['host']) ? $this->parsed_redirected['host'] : null;
                     $location_unified = unify_path_query($location_path, $location_query);
                     if ($location_host != $parsed_host && $location_host != $redirected_host) {
                         if ($this->debug) {
                             echo "<!-- Redirected to another host: " . $res['location'] . ", skipping -->\n";
                         }
                         continue;
                     } elseif ($location_unified == $this->path_query) {
                         if ($this->debug) {
                             echo "<!-- Redirected to same address: " . $res['location'] . ", skipping -->\n";
                         }
                         continue;
                     } elseif (path_count($location_unified) < $my_path_len && path_count($location_unified) < path_count($unified)) {
                         if ($this->debug) {
                             echo "<!-- Redirected to a shorter path: {$url} -> " . $res['location'] . ", skipping -->\n";
                         }
                         continue;
                     }
                 }
                 $images_count = preg_match_all('/<img .+?>/is', $res['content'], $dummy);
                 if (!$images_count) {
                     continue;
                 }
                 $images_total += $images_count;
                 // Check if it has the same title
                 if (empty($this->title) || $this->title == get_html_title($res['content'])) {
                     $same_title++;
                     // Next iff we found less that 3 pages, otherwise we asume all pages have same title
                     if ($same_title < 3 || $other_title) {
                         if ($this->debug) {
                             echo "<!-- Skipping: same title {$url} -->\n";
                         }
                         continue;
                     }
                 } else {
                     $other_title++;
                 }
                 if ($this->debug) {
                     echo "<!-- Other: read {$url} -->\n";
                 }
                 // Remember the longest path length accepted for this prefix
                 $paths[$first_paths] = isset($paths[$first_paths]) ? max($paths_len, $paths[$first_paths]) : $paths_len;
                 $n++;
                 $this->other_html .= $this->shorten_html($res['content'], 100000) . "<!-- END part {$n} -->\n";
                 if ($n > $max_to_check || $images_total > $this->images_count * 2) {
                     break;
                 }
             }
         }
     }
     return $this->other_html;
 }
/**
 * Requests a URL over a raw socket and collects metadata about the response.
 *
 * Sends a plain HTTP/1.1 GET request (default port 80), reads up to 8 KiB of
 * the response and fills the array returned by get_empty_http_meta() with
 * the HTTP status, location, content type/length and, for HTML documents,
 * the title (falling back to the first h1). 301/302 responses and HTML meta
 * redirects are followed recursively; in that case the metadata of the
 * deepest url is returned with 'redirect' set to 1.
 *
 * Diagnostic messages are echoed for every failure. The 'state' values set
 * here are: 0 redirect without location, 1 ok, 3 connection error,
 * 4 timed out, 5 early EOF, 6 http 404, 7 other 4xx, 8 5xx.
 *
 * NOTE(review): this function also changes the 'default_socket_timeout' ini
 * setting for the rest of the request.
 *
 * @param string $url          Absolute url to fetch.
 * @param int    $depth        Current redirect depth; recursion stops above 3.
 * @param string $status_chain Unused, kept for interface compatibility.
 * @param string $url_chain    Unused, kept for interface compatibility.
 *
 * @return array|null The metadata array, or null when $depth is above 3
 *                    (a recursive caller then ends up with an array that
 *                    only contains 'redirect').
 */
function get_http_meta($url, $depth = 0, $status_chain = '', $url_chain = '')
{
    //echo "get_http_meta('$url', $depth)\n";
    // how deep to follow http redirects
    if ($depth > 3) {
        return;
    }
    $depth++;
    $connect_timeout = 3;
    $read_timeout = 6;
    ini_set('default_socket_timeout', $connect_timeout);
    $http_meta = get_empty_http_meta();
    $purl = parse_url($url);
    // parse_url() returns false for seriously malformed urls and omits the
    // host for relative ones; there is nothing to connect to in both cases.
    if (!is_array($purl) || empty($purl['host'])) {
        echo "(connection error - no host in url (0)) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }
    // note: this seems to fail for all addresses
    // if a domain name, is it valid?
    /*
        if (!preg_match("/^[0-9]{3}\.[0-9]{3}\.[0-9]{3}\.[0-9]{3}$/", $purl['host'])) {
       $ip = gethostbyname($purl['host']);
       if ($ip == $purl['host']) {
           echo "(name resolution failed) "; 
           $http_meta['state'] = 2;
           return $http_meta;
       }
        }
    */
    // connect and get http return code
    if (array_key_exists('port', $purl)) {
        $port = $purl['port'];
    } else {
        $port = 80;
    }
    $sock = fsockopen($purl['host'], $port, $errno, $errstr, $connect_timeout);
    if (!$sock) {
        echo "(connection error - {$errstr} ({$errno})) ";
        $http_meta['state'] = 3;
        return $http_meta;
    }
    // NOTE(review): the socket is switched to non-blocking mode before the
    // response is read, so the read below may return before any data has
    // arrived and the read timeout may not apply - confirm this is intended.
    stream_set_blocking($sock, FALSE);
    stream_set_timeout($sock, $read_timeout);
    // Request target: path (or "/") plus the query string. The query is
    // appended even when the url has no path, e.g. "http://host?x=1".
    $url_path = array_key_exists('path', $purl) ? $purl['path'] : '/';
    if (array_key_exists('query', $purl)) {
        $url_path .= '?' . $purl['query'];
    }
    // we set 'accept-language' as some sites will have different <title> depending on your geoip
    // e.g. flickr
    // some sites seem to ignore you if you don't send a user-agent
    // e.g. digg (but maybe that's for the better)
    fputs($sock, "GET {$url_path} HTTP/1.1\r\n");
    fputs($sock, "Host: " . $purl['host'] . "\r\n");
    fputs($sock, "Connection: close\r\n");
    fputs($sock, "Accept-Language: en-gb, en;q=0.8\r\n");
    fputs($sock, "User-Agent: URL catcher\r\n");
    fputs($sock, "\r\n");
    // Every early return below closes the socket first so the connection
    // is not leaked.
    $status = socket_get_status($sock);
    if ($status['timed_out']) {
        fclose($sock);
        echo "(status timed_out) ";
        $http_meta['state'] = 4;
        return $http_meta;
    }
    if (feof($sock)) {
        fclose($sock);
        echo "(socket early EOF) ";
        $http_meta['state'] = 5;
        return $http_meta;
    }
    $contents = stream_get_contents($sock, 1024 * 8);
    fclose($sock);
    if ($contents === FALSE) {
        echo "stream_get_contents returned FALSE ";
        return $http_meta;
    }
    $http_status = get_http_status($contents);
    //echo "http_status=$http_status\n";
    if (!is_numeric($http_status)) {
        return $http_meta;
    }
    $http_meta['http_status'] = $http_status;
    $http_meta['location'] = $url;
    // handle different status codes
    if ($http_status == "404") {
        echo "(http 404) ";
        $http_meta['state'] = 6;
    } elseif (substr($http_status, 0, 1) == 4) {
        echo "(http other 40x) ";
        $http_meta['state'] = 7;
    } elseif (substr($http_status, 0, 1) == 5) {
        echo "(http other 50x) ";
        $http_meta['state'] = 8;
    } elseif ($http_status == "301" or $http_status == "302") {
        // 301 - Moved Permanently
        // 302 - Found / Moved
        $new_url = get_http_location($contents);
        if (!$new_url) {
            $http_meta['state'] = 0;
            return $http_meta;
        }
        // recurse recurse recurse
        // discard our meta in favour of the deepest url
        $http_meta = get_http_meta($new_url, $depth);
        // set any meta we want to persist here
        $http_meta['redirect'] = 1;
    } elseif ($http_status == "200") {
        $http_meta['state'] = 1;
        $http_meta['content_length'] = get_http_content_length($contents);
        $http_meta['content_type'] = get_http_content_type($contents);
        // Only look for a title in html documents; without a content type
        // the decision is based on the url extension.
        if ($http_meta['content_type']) {
            $check_html = preg_match("/^text\\/html/", $http_meta['content_type']);
        } else {
            $check_html = !preg_match("/(jpg|jpeg|gif|png|wav|mp3|avi|wmv|mpg)\$/", $url);
        }
        if ($check_html) {
            $http_meta['html_title'] = get_html_title($contents);
            if (!$http_meta['html_title']) {
                $http_meta['html_title'] = get_html_h1($contents);
            }
        } else {
            //echo "(non text/html type) ";
        }
        // html meta redirect?
        // this could be placed better to avoid doing all the work above
        $html_meta_redirect = get_html_meta_redirect($contents);
        if ($html_meta_redirect) {
            $http_meta = get_http_meta($html_meta_redirect, $depth);
            $http_meta['redirect'] = 1;
        }
    }
    return $http_meta;
}