/**
 * Download (or load from the local cache) every result page of one class and
 * hand each valid page to parseContent().
 *
 * The index page and each numbered page are stored under
 * ./html/{$subDir}/{$class}/. A page that already exists on disk is read from
 * there instead of being requested again. A page that fails
 * validatePageContent() is deleted and, after a 60 second pause, requested
 * again with a fresh HttpClient and fresh cookies.
 *
 * Note: the retry has no upper bound; a page that never validates is retried
 * forever.
 *
 * @param string $subDir     Directory name used below ./html/ and ./data/.
 * @param string $class      Class name; used for the directory, the log file name and progress output.
 * @param string $cookieURL  URL requested in order to obtain the cookies.
 * @param string $indexURL   URL of the index page; also given to getPageI() to build each page URL.
 * @param mixed  $totalClass Only printed in the progress output ("... of {$totalClass}").
 * @param mixed  $curClass   Only printed in the progress output.
 * @param mixed  $code       Passed through unchanged to parseContent().
 *
 * @return void
 */
function main($subDir, $class, $cookieURL, $indexURL, $totalClass, $curClass, $code)
{
    makeDir("./html/{$subDir}/{$class}/");
    $logName = "./data/{$subDir}/{$class}.log";
    $httpClient = new HttpClient("epub.cnki.net");
    $cookies = "";

    $indexFname = "./html/{$subDir}/{$class}/index.html";
    // The existence check and the read use the GB2312-encoded form of the UTF-8 path.
    $localIndexFname = iconv("utf-8", "gb2312", $indexFname);

    if (file_exists($localIndexFname)) {
        $isSleep = false;
        $content = file_get_contents($localIndexFname);
        echo "From cache get index.....\n";
    } else {
        // Obtain the cookies and install them on the client.
        $httpClient->get($cookieURL);
        $cookies = $httpClient->getCookies();
        $httpClient->setCookies($cookies);
        if (!$cookies) {
            die("cookie error");
        }
        $isSleep = true;
        $httpClient->get($indexURL);
        $content = $httpClient->getContent();
        save($indexFname, $content);
        echo "save index file...\n";
    }

    // Work out how many result pages exist.
    $pageCount = parsePageCount($content);
    echo "Page is {$pageCount} ****\n";
    // Upper bound of the article count (>= the real number); only used for the message.
    $articleCount = ARTICLE_PRE_PAGE * $pageCount;
    echo "total article is {$articleCount}\n";
    // Round up so that no page is missed; cast because ceil() returns a float.
    $pageCount = (int) ceil($articleCount / ARTICLE_PRE_PAGE);
    if ($pageCount == 0) {
        $pageCount = 1;
    }
    if ($pageCount > 50) {
        echo "page count is big than 50\n";
    }
    echo "total page of {$class} is : {$pageCount}...............{$curClass} of {$totalClass}\n";

    if ($isSleep) {
        fakeSleep();
    }

    // Fetch, store and parse every page.
    for ($i = 1; $i <= $pageCount; $i++) {
        $pageI = getPageI($indexURL, $i); // address of page $i
        $htmlI = "./html/{$subDir}/{$class}/{$i}.html";
        $localHtmlI = iconv("utf-8", "gb2312", $htmlI);
        $fromCache = file_exists($localHtmlI);

        if ($fromCache) {
            $content = file_get_contents($localHtmlI);
        } else {
            if (!$cookies) {
                // No usable cookies yet (e.g. the index was served from the
                // cache), so get them before the first network page request.
                $httpClient->get($cookieURL);
                $cookies = $httpClient->getCookies();
            }
            $isSleep = true;
            $httpClient->setCookies($cookies);
            $httpClient->get($pageI);
            $content = $httpClient->getContent();
            save($htmlI, $content);
            echo "From newwork & save {$i}.html..........[{$i} of {$pageCount}]\n";
        }

        // Single validation point for cached and downloaded pages
        // (the original comment indicates a failure means a captcha page was served).
        if (!validatePageContent($content)) {
            delFile($htmlI);
            // Step back exactly once: the loop's $i++ after `continue` then
            // lands on this same page again.
            $i--;
            dosleep(60);
            $httpClient = new HttpClient("epub.cnki.net");
            $httpClient->get($cookieURL);
            $cookies = $httpClient->getCookies();
            $httpClient->setCookies($cookies);
            continue;
        }

        if ($fromCache) {
            // Nothing was requested for this page, so no pause is needed after it.
            $isSleep = false;
            echo "Find local file \n{$htmlI} & skip\n";
        }

        parseContent($content, $logName, $code);

        if ($i === $pageCount) {
            // Last page handled: print the completion banner once.
            echo "+\n";
            echo "+\n";
            echo "+ {$class} done\n";
            echo "+\n";
            echo "+\n";
        } elseif ($isSleep) {
            fakeSleep();
        }
    }
}
$httpClient->get($contentUrl); $content = $httpClient->getContent(); //302页面 /*解析地址*/ $contentUrl = get_content_url($content); echo $contentUrl . "\n"; $saveContent = $paperName . "\t" . $contentUrl . "\n"; save($mapFile, $saveContent, "a+"); //echo "save $saveContent\n"; /*抓取论文摘要内容*/ $content = $httpClient->quickGet($contentUrl); $contentSize = strlen($content); if ($contentSize > 300) { save($cachedHtml, $content); } else { fakeSleep(); } } while ($contentSize < 300); } else { $sleep = false; echo "Hit\n"; $content = file_get_contents($localedCachedHtml); if (strlen($content) < 300) { delFile($cachedHtml); echo "Empty abstract file \n"; } continue; } $keyWords = get_key_words($content); $keyWords .= "#" . get_mentor($content) . "#" . get_major($content); $abs = get_paper_abs($content);