makeDir($indexSavePath); $mapFile = $indexSavePath . "/paper_url_mapping.log"; delFile($mapFile); $icount = 1; while ($line = readLine($fp)) { $arr = explode("\t", $line); $u = $arr[6]; $paperName = $arr[0]; $paperName = win_dir_format($paperName); //echo $paperName . "\n"; $htmlFileName = $indexSavePath . "/" . $paperName . ".html"; $tmpFile = iconv("utf-8", "gb2312//IGNORE", $htmlFileName); $dbCode = get_db_code($u); $fileName = get_file_name($u); $tableName = get_table_name($u); $realUrl = get_real_url($dbCode, $fileName, $tableName); if (file_exists($tmpFile)) { if (filesize($tmpFile) < 100) { delFile($htmlFileName); } else { echo "Cache hit! continue -> {$htmlFileName}\n"; $mapContent = "{$paperName}\t{$realUrl}\n"; save($mapFile, $mapContent, "a+"); continue; } } $indexContent = @file_get_contents($realUrl); if (strlen($indexContent) == 0) { die("{$realUrl}"); continue; }
/**
 * Resolve the address a URL finally points to by following HTTP redirects.
 *
 * Sends an HTTP/1.0 HEAD request over a raw socket, looks for a Location
 * header in the response and recurses on the redirect target. Recursion
 * stops once $level exceeds 5, when the connection fails, or when the
 * response carries no Location header.
 *
 * Two protocols are recognised: http and https (the latter is spoken over
 * the ssl:// stream transport, which needs the OpenSSL extension).
 *
 * @param string $url   Address to probe; it is decomposed by url_split().
 * @param int    $echo  When truthy, the raw response headers of every hop are printed.
 * @param int    $level Current redirect depth; external callers leave it at 0.
 *
 * @return string The last address reached, or $url itself when nothing could be resolved.
 */
function get_real_url($url, $echo = 0, $level = 0)
{
    // Give up after more than five hops so redirect loops terminate.
    if ($level > 5) {
        return $url;
    }

    list($host, $path, $filename, $ext, $proto, $port) = url_split($url);

    $isHttps = ($proto == 'https');
    $scheme = $isHttps ? 'https' : 'http';

    // Remember an explicitly given port so relative redirects stay on it.
    $authority = $port ? "{$host}:{$port}" : $host;
    if (!$port) {
        $port = $isHttps ? 443 : 80;
    }

    // A plain TCP socket cannot talk to an https endpoint; use the TLS transport.
    $transport = $isHttps ? "ssl://{$host}" : $host;
    $fp = fsockopen($transport, $port, $errno, $errstr, 10);
    if (!$fp) {
        echo "{$errstr} ({$errno})\n";
        return $url;
    }

    // Bound every read so a stalled peer cannot hang the loop below.
    stream_set_timeout($fp, 10);

    // HEAD is enough: only the response headers are needed (GET, OPTIONS, TRACE would also work).
    $req = $ext ? "/{$path}/{$filename}.{$ext}" : "/{$path}/{$filename}";
    fputs($fp, "HEAD {$req} HTTP/1.0\r\nHost: {$host}\r\n\r\n");

    $head = '';
    while (!feof($fp)) {
        $chunk = fgets($fp, 128);
        if ($chunk === false) {
            // Read error or timeout: feof() may never become true, so stop here.
            break;
        }
        $head .= $chunk;
    }
    fclose($fp);

    if ($echo) {
        echo "{$level}:\n{$head}";
    }

    // Header names are case-insensitive; an empty Location value is treated as "no redirect".
    if (!preg_match('/^Location:[ \t]*(\S[^\r\n]*)/mi', $head, $matches)) {
        return $url;
    }
    $target = rtrim($matches[1]);

    if (preg_match('#^https?://#i', $target)) {
        // Absolute URL (http or https): follow it as given.
        $next = $target;
    } elseif (strncmp($target, '//', 2) === 0) {
        // Scheme-relative reference: reuse the current scheme.
        $next = "{$scheme}:{$target}";
    } elseif ($target[0] === '/') {
        // Root-relative reference: resolve against the host, not the current directory.
        $next = "{$scheme}://{$authority}{$target}";
    } else {
        // Directory-relative reference: resolve against the current path.
        $next = "{$scheme}://{$authority}/{$path}/{$target}";
    }

    return get_real_url($next, $echo, $level + 1);
}
} // if } // if ////////////////////////////////////////////////// } else { echo 'пуст!'; } break; ////////////////////////////////////////////////// ////////////////////////////////////////////////// case 'head': // Получить header из $params ////////////////////////////////////////////////// echo "\n"; $str = get_real_url($params, 1); if ($str != $params) { echo "Адрес: \"{$str}\""; } break; ////////////////////////////////////////////////// ////////////////////////////////////////////////// default: // Если не команда - объявляем переменную ////////////////////////////////////////////////// switch ($cmd) { case 'compress': case 'level': case 'subject': echo ' - объявляем переменную'; ${$cmd} = $params;