makeDir("./data/index/{$key}"); foreach ($files as $file) { $fp = fopen($file, "r"); $file = iconv("gb2312", "utf-8", $file); $subdir = basename($file, ".log"); //$subdir = win_dir_format($subdir); $indexSavePath = "./data/index/{$key}/" . $subdir; makeDir($indexSavePath); $mapFile = $indexSavePath . "/paper_url_mapping.log"; delFile($mapFile); $icount = 1; while ($line = readLine($fp)) { $arr = explode("\t", $line); $u = $arr[6]; $paperName = $arr[0]; $paperName = win_dir_format($paperName); //echo $paperName . "\n"; $htmlFileName = $indexSavePath . "/" . $paperName . ".html"; $tmpFile = iconv("utf-8", "gb2312//IGNORE", $htmlFileName); $dbCode = get_db_code($u); $fileName = get_file_name($u); $tableName = get_table_name($u); $realUrl = get_real_url($dbCode, $fileName, $tableName); if (file_exists($tmpFile)) { if (filesize($tmpFile) < 100) { delFile($htmlFileName); } else { echo "Cache hit! continue -> {$htmlFileName}\n"; $mapContent = "{$paperName}\t{$realUrl}\n"; save($mapFile, $mapContent, "a+"); continue;
/**
 * Extract the article records from one fetched page and append them to a log file.
 *
 * One line is written per article, with tab-separated columns in this order:
 * article name (passed through win_dir_format()), author, degree-granting
 * institution, source database, degree year, preview URL, abstract URL, $code.
 * The download count is not extracted at the moment.
 *
 * The column order is positional; presumably the readers of these logs index
 * the columns by number, so do not reorder them without checking the consumers.
 *
 * Progress is echoed to standard output. Nothing is written when no article
 * name was found in the page.
 *
 * @param mixed $content  Page content, handed unchanged to the parse*() helpers.
 * @param mixed $fileName Target passed to save(); also shown in the progress output.
 * @param mixed $code     Value written verbatim as the last column of every line.
 *
 * @return void
 */
function parseContent($content, $fileName, $code)
{
    echo "parseContent : {$fileName} >> ";

    $articleName = parseArticleName($content);
    $authors = parseAuthor($content);
    $schools = parseSchool($content);
    $origin = parseOrigin($content);
    $years = parseYear($content);
    $previewPage = parsePreviewURL($content);
    $abstractUrl = parseAbstractUrl($content);

    $len = count($articleName);
    if ($len == 0) {
        echo "Done... but get nothing form {$fileName}\n";
        return;
    }

    // Columns that follow the article name, in output order.
    $columns = [$authors, $schools, $origin, $years, $previewPage, $abstractUrl];

    $saveContent = "";
    for ($i = 0; $i < $len; $i++) {
        $fields = [win_dir_format($articleName[$i])];
        foreach ($columns as $column) {
            // The parsers run independently, so a column may hold fewer entries
            // than there are article names. Write an empty cell in that case
            // instead of reading an undefined offset. The isset() ternary is used
            // rather than ?? so the script still runs on PHP 5.
            $fields[] = isset($column[$i]) ? $column[$i] : '';
        }
        $fields[] = $code;

        $saveContent .= implode("\t", $fields) . "\n";
    }

    save($fileName, $saveContent, "a+");
    echo "Done!\n";
}
$key = $argv[1]; if (!$key) { echo "usage \$php abstract.php 'A', 'B' ...\n"; exit; } $files = get_all_log_file("./data/{$key}/"); makeDir("./data/abstract/"); //存放论文摘要,不会重复创建 makeDir("./data/abstract/{$key}"); //’A' , 'B'... $httpClient = new HttpClient("epub.cnki.net"); foreach ($files as $file) { $fp = fopen($file, "r"); $file = iconv("gb2312", "utf-8", $file); $subdir = basename($file, ".log"); $subdir = win_dir_format($subdir); $dataSavePath = "./data/abstract/{$key}/" . $subdir; makeDir($dataSavePath); makeDir($dataSavePath . "/tmp"); $mapFile = $dataSavePath . "/paper_abstract_url.log"; $icount = 1; while ($line = readLine($fp)) { $sleep = true; $arr = explode("\t", $line); $u = $arr[6]; $paperName = $arr[0]; $code = $arr[7]; /*获取Referer头*/ $dbCode = get_db_code($u); //CDFD $refUrl = get_ref($dbCode);