/**
 * Fetch a URL over a raw socket connection, optionally following redirects.
 *
 * The response body is accumulated in $this->body, every header line is
 * handed to $this->ReadResponseHeader(), and download statistics are stored
 * in $this->request_info. On failure $this->errstr is filled from the global
 * $HTTP_ERROR table and FALSE is returned.
 *
 * @param string $url         URL to request.
 * @param bool   $redirection Whether a "location" response header should be followed.
 * @return bool TRUE on success, FALSE on any error (malformed URL, unresolvable
 *              host, connection failure, too many redirects, status code >= 300).
 */
function GetStandard($url, $redirection)
{
    global $HTTP_ERROR;

    if ($this->username) {
        $this->request_headers['Authorization'] = "Basic " . base64_encode($this->username . ":" . $this->password);
    }

    $parsed_url = parse_url($url);
    if ($parsed_url === FALSE) {
        $this->errstr = $HTTP_ERROR[CURLE_URL_MALFORMAT];
        return FALSE;
    }

    $this->end_url = $url;
    $this->body = '';

    // A URL without a host component cannot be requested; report it as
    // malformed instead of reading an undefined array key.
    if (!isset($parsed_url['host']) || $parsed_url['host'] === '') {
        $this->errstr = $HTTP_ERROR[CURLE_URL_MALFORMAT];
        return FALSE;
    }

    $host = $parsed_url['host'];
    $this->request_headers['Host'] = $host;

    // gethostbyname() returns its argument unchanged both on failure and when
    // it is already an IPv4 address, so literal addresses must bypass the
    // "result equals input" failure test.
    if (filter_var($host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== FALSE) {
        $ipaddr = $host;
    } else {
        $ipaddr = gethostbyname($host);
        if ($ipaddr == $host) {
            $this->errstr = $HTTP_ERROR[CURLE_COULDNT_RESOLVE_HOST];
            return FALSE;
        }
    }

    // The port defaults to 80 whenever the URL does not carry one; the scheme
    // is not consulted.
    if (empty($parsed_url['port'])) {
        $parsed_url['port'] = 80;
    }

    $socket = @fsockopen($ipaddr, $parsed_url['port'], $errno, $errstr, $this->connect_timeout);
    if ($socket === FALSE) {
        $this->errstr = sprintf($HTTP_ERROR[CURLE_COULDNT_CONNECT], $errstr);
        return FALSE;
    }

    stream_set_timeout($socket, $this->read_timeout);
    fputs($socket, $this->PrepareHeaders($parsed_url));

    $headers_done = FALSE;
    $start_time = $this->microtime_float();

    // Compare against FALSE explicitly: a line consisting of just "0" is falsy
    // and would otherwise end the loop early and truncate the body. fgets()
    // returns FALSE once nothing more can be read from the stream.
    while (($line = fgets($socket, 8192)) !== FALSE) {
        // The first whitespace-only line separates the headers from the body.
        if (!$headers_done && preg_match('|^\\s+$|', $line)) {
            $headers_done = TRUE;
            continue;
        }

        if ($headers_done) {
            $this->body .= $line;
        } else {
            $this->ReadResponseHeader(null, $line);
        }
    }

    $end_time = $this->microtime_float();
    fclose($socket);

    $this->request_info['size_download'] = strlen($this->body);

    // Guard against a zero elapsed time (possible for tiny responses), which
    // would otherwise be a division by zero.
    $elapsed = $end_time - $start_time;
    $speed = ($elapsed > 0) ? $this->request_info['size_download'] / $elapsed / 1024 : 0;
    $this->request_info['speed_download'] = sprintf('%.2f', $speed);

    if ($redirection && !empty($this->response_headers['location'])) {
        // Get the new URL to access
        $new_url = RelativeToAbsolute($url, $this->response_headers['location']);

        // Clear the previous response headers
        unset($this->response_headers);
        unset($this->raw_response_headers);

        $this->redirects++;
        $this->end_url = $new_url;

        if ($this->redirects > $this->max_redirects) {
            $this->errstr = $HTTP_ERROR[CURLE_TOO_MANY_REDIRECTS];
            return FALSE;
        }

        return $this->GetStandard($new_url, $redirection);
    }

    if (isset($this->response_headers['status_code']) && $this->response_headers['status_code'] >= 300) {
        $this->errstr = sprintf($HTTP_ERROR[CURLE_HTTP_RETURNED_ERROR], $this->response_headers['status']);
        return FALSE;
    }

    return TRUE;
}
/**
 * Start-tag handler: collects content links, thumbnails, the document base
 * URL and external script URLs from the tags it is called with.
 *
 * - <a>:      links whose path ends in a picture or movie extension are queued
 *             in $this->in_links; all others increment $this->num_links.
 * - <img>:    images inside the configured size limits whose path matches
 *             $this->link_exts are recorded in $this->images and, if a content
 *             link is queued, paired with it in $this->thumbs.
 * - <base>:   an absolute http(s) href replaces $this->base_url.
 * - <script>: the src attribute is stored (made absolute) in $this->scripts.
 *
 * @param mixed  $parser Parser invoking the handler (unused here).
 * @param string $name   Tag name; only the lower-case names above are handled.
 * @param array  $attrs  Attribute name => value map for the tag.
 * @return void
 */
function tagOpen(&$parser, $name, $attrs)
{
    global $C;

    foreach ($attrs as $key => $val) {
        $attrs[$key] = trim($val);
    }

    switch ($name) {
        case 'a':
            // Anchors without an href (e.g. named anchors) are treated as having
            // an empty target instead of reading an undefined array key.
            $href = isset($attrs['href']) ? $attrs['href'] : '';
            $href_no_query = preg_replace('~\\?.*$~', '', $href);
            $is_picture_link = preg_match("~\\.({$this->picture_exts})\$~i", $href_no_query);
            $is_movie_link = preg_match("~\\.({$this->movie_exts})\$~i", $href_no_query);

            if ($is_picture_link || $is_movie_link) {
                $this->in_links[] = RelativeToAbsolute($this->base_url, $href);
            } else {
                $this->num_links++;
            }
            break;

        case 'img':
            // Images with a small width or height generally aren't the thumbnail
            if (isset($attrs['height']) && isset($attrs['width']) &&
                ($attrs['height'] < $C['min_thumb_height'] ||
                 $attrs['width'] < $C['min_thumb_width'] ||
                 $attrs['height'] > $C['max_thumb_height'] ||
                 $attrs['width'] > $C['max_thumb_width'])) {
                break;
            }

            // An <img> without src can never be a thumbnail; use an empty
            // string so the extension test simply fails.
            $src = isset($attrs['src']) ? $attrs['src'] : '';
            $src_no_query = preg_replace('~\\?.*$~', '', $src);
            $is_thumbnail = preg_match("~\\.({$this->link_exts})\$~i", $src_no_query);

            $imgsrc = null;
            if ($is_thumbnail) {
                $imgsrc = RelativeToAbsolute($this->base_url, $src);
                $this->images[] = array('preview' => $imgsrc, 'full' => $imgsrc, 'content' => $imgsrc);
            }

            if (count($this->in_links)) {
                if ($is_thumbnail) {
                    // Pair the thumbnail with the most recently queued content link.
                    $link = array_pop($this->in_links);
                    $link_no_query = preg_replace('~\\?.*$~', '', $link);
                    $is_picture_link = preg_match("~\\.({$this->picture_exts})\$~i", $link_no_query);
                    $is_movie_link = preg_match("~\\.({$this->movie_exts})\$~i", $link_no_query);
                    $format = $is_picture_link ? FMT_PICTURES : FMT_MOVIES;

                    // Count each distinct content link only once.
                    if (!isset($this->thumbs[$format][$link_no_query])) {
                        if ($is_picture_link) {
                            $this->num_picture_links++;
                        } else {
                            $this->num_movie_links++;
                        }
                        $this->num_content_links++;
                    }

                    // For movies the "full" image is the thumbnail itself,
                    // for pictures it is the linked file.
                    $this->thumbs[$format][$link_no_query] = array(
                        'preview' => $imgsrc,
                        'full' => $is_movie_link ? $imgsrc : $link,
                        'content' => $link
                    );
                } else {
                    $this->num_links++;
                }
            }
            break;

        case 'base':
            if (isset($attrs['href']) && preg_match('~^https?://~i', $attrs['href'])) {
                $this->base_url = $attrs['href'];
            }
            break;

        case 'script':
            if (isset($attrs['src'])) {
                $this->scripts[] = RelativeToAbsolute($this->base_url, $attrs['src']);
            }
            break;
    }
}
/**
 * Extract video URLs and their candidate thumbnail URLs from an HTML document.
 *
 * Video URLs are taken from <a href> and <map>/<area href> values whose path
 * ends in one of VIDEO_EXTENSIONS (an optional query string is allowed).
 * Thumbnails are <img> elements with a JPG_EXTENSION source that are either
 * nested in a matching <a>, or reference the matching <map> through their
 * usemap attribute; the latter are prefixed with "[coords]" of the area.
 * All URLs are made absolute against $base_url, or against the first
 * absolute http(s) <base href> found in the document.
 *
 * @param string $base_url URL relative links are resolved against.
 * @param string $html     HTML markup to scan.
 * @return array Two-element array: array(thumbnail URLs, video URLs), each
 *               de-duplicated with array_unique() (original keys preserved).
 */
public static function ExtractUrls($base_url, $html)
{
    $video_urls = array();
    $thumb_urls = array();

    // DOMDocument::loadHTML() throws a ValueError for an empty source on
    // PHP 8, which "@" cannot suppress; an empty document has no URLs anyway.
    if ((string) $html === '') {
        return array($thumb_urls, $video_urls);
    }

    $video_pattern = '~\\.(' . VIDEO_EXTENSIONS . ')(\\?.*)?$~U';
    $thumb_pattern = '~\\.' . JPG_EXTENSION . '(\\?.*)?$~U';

    $dom = new DOMDocument();
    // Real-world markup is rarely valid; parser warnings are deliberately muted.
    @$dom->loadHTML($html);

    // See if a <base> tag is defined, and if so set $base_url
    foreach ($dom->getElementsByTagName('base') as $base) {
        $href = $base->getAttribute('href');

        if (!empty($href) && preg_match('~^https?://~i', $href)) {
            $base_url = $href;
            break;
        }
    }

    // Check <a> tags
    foreach ($dom->getElementsByTagName('a') as $a) {
        $href = $a->getAttribute('href');

        if (empty($href) || !preg_match($video_pattern, $href)) {
            continue;
        }

        $video_urls[] = RelativeToAbsolute($base_url, $href);

        // Images nested inside the link are its thumbnail candidates.
        foreach ($a->getElementsByTagName('img') as $img) {
            $src = $img->getAttribute('src');

            if (preg_match($thumb_pattern, $src)) {
                $thumb_urls[] = RelativeToAbsolute($base_url, $src);
            }
        }
    }

    // Check <map> tags
    $maps = $dom->getElementsByTagName('map');
    $imgs = $dom->getElementsByTagName('img');

    foreach ($maps as $map) {
        // usemap attributes reference a map as "#name"; compare case-insensitively.
        $map_name = strtolower('#' . $map->getAttribute('name'));

        foreach ($map->getElementsByTagName('area') as $area) {
            $href = $area->getAttribute('href');
            $coords = $area->getAttribute('coords');

            if (empty($href) || !preg_match($video_pattern, $href)) {
                continue;
            }

            $video_urls[] = RelativeToAbsolute($base_url, $href);

            // Every image that uses this map is a thumbnail for the area; the
            // area coordinates are prepended so the region can be identified.
            foreach ($imgs as $img) {
                $src = $img->getAttribute('src');
                $usemap = strtolower($img->getAttribute('usemap'));

                if (empty($usemap) || $usemap != $map_name) {
                    continue;
                }

                if (preg_match($thumb_pattern, $src)) {
                    $thumb_urls[] = "[{$coords}]" . RelativeToAbsolute($base_url, $src);
                }
            }
        }
    }

    return array(array_unique($thumb_urls), array_unique($video_urls));
}