コード例 #1
0
/**
 * Download (or load from the local cache) every result page of one class and
 * parse each page.
 *
 * The index page and the numbered result pages are stored below
 * ./html/{$subDir}/{$class}/. A page that already exists on disk is read from
 * there instead of being requested again. Every page that passes
 * validatePageContent() is handed to parseContent() together with the log
 * file ./data/{$subDir}/{$class}.log. A page that fails validation is deleted
 * and retried after a pause with a fresh HTTP client and fresh cookies.
 *
 * NOTE(review): a page that never validates is retried without an upper
 * bound; confirm that this is intended.
 *
 * @param mixed $subDir     Sub-directory name used below ./html/ and ./data/.
 * @param mixed $class      Class name; used as directory name, log-file name
 *                          and in the progress output.
 * @param mixed $cookieURL  URL requested in order to obtain the cookies.
 * @param mixed $indexURL   URL of the index page; also passed to getPageI()
 *                          to build the URL of page $i.
 * @param mixed $totalClass Only used in the progress output.
 * @param mixed $curClass   Only used in the progress output.
 * @param mixed $code       Passed through unchanged to parseContent().
 *
 * @return void Terminates the script with "cookie error" when no cookies can
 *              be obtained before the index page is downloaded.
 */
function main($subDir, $class, $cookieURL, $indexURL, $totalClass, $curClass, $code)
{
    $host = "epub.cnki.net";
    $htmlDir = "./html/{$subDir}/{$class}/";
    // Loop-invariant: every page of this class is parsed into the same log file.
    $logName = "./data/{$subDir}/{$class}.log";

    makeDir($htmlDir);
    $httpClient = new HttpClient($host);
    $cookies = "";
    $isSleep = true;

    $indexFname = "{$htmlDir}index.html";
    // The existence check and the read use the gb2312-encoded form of the path.
    $localIndex = iconv("utf-8", "gb2312", $indexFname);
    if (file_exists($localIndex)) {
        // Nothing was requested from the network, so no pause is needed.
        $isSleep = false;
        $content = file_get_contents($localIndex);
        echo "From cache get index.....\n";
    } else {
        // Fetch the cookies and install them on the client.
        $httpClient->get($cookieURL);
        $cookies = $httpClient->getCookies();
        if (!$cookies) {
            die("cookie error");
        }
        $httpClient->setCookies($cookies);
        $isSleep = true;
        $httpClient->get($indexURL);
        $content = $httpClient->getContent();
        save($indexFname, $content);
        echo "save index file...\n";
    }

    // Parse how many pages there are in total.
    $pageCount = parsePageCount($content);
    echo "Page is {$pageCount} ****\n";
    // Upper bound of the article count; it may exceed the real number, which
    // does not affect the result.
    $articleCount = ARTICLE_PRE_PAGE * $pageCount;
    echo "total article is {$articleCount}\n";
    // Round up so that no data is missed.
    $pageCount = (int) ceil($articleCount / ARTICLE_PRE_PAGE);
    if ($pageCount === 0) {
        $pageCount = 1;
    }
    if ($pageCount > 50) {
        echo "page count is big than 50\n";
    }
    echo "total page of {$class} is : {$pageCount}...............{$curClass} of {$totalClass}\n";
    if ($isSleep) {
        fakeSleep();
    }

    // Fetch every page, store it, and parse it right away.
    for ($i = 1; $i <= $pageCount; $i++) {
        // Address of page $i.
        $pageI = getPageI($indexURL, $i);
        $htmlI = "{$htmlDir}{$i}.html";
        $localHtml = iconv("utf-8", "gb2312", $htmlI);

        $fromCache = file_exists($localHtml);
        if ($fromCache) {
            $content = file_get_contents($localHtml);
        } else {
            if (!$cookies) {
                // The index came from the cache, so no cookies have been
                // requested yet; get them before the first page download.
                $httpClient->get($cookieURL);
                $cookies = $httpClient->getCookies();
            }
            $isSleep = true;
            $httpClient->setCookies($cookies);
            $httpClient->get($pageI);
            $content = $httpClient->getContent();
            save($htmlI, $content);
            echo "From newwork & save {$i}.html..........[{$i} of {$pageCount}]\n";
        }

        // Validation failed (the original comment says: a captcha showed up).
        // Drop the stored page, wait, start over with a new client and new
        // cookies, and retry the SAME page. $i is decremented exactly once so
        // that the loop's $i++ lands on the current page again.
        if (!validatePageContent($content)) {
            delFile($htmlI);
            dosleep(60);
            $httpClient = new HttpClient($host);
            $httpClient->get($cookieURL);
            $cookies = $httpClient->getCookies();
            $httpClient->setCookies($cookies);
            $i--;
            continue;
        }

        if ($fromCache) {
            // A valid cached page needed no request, so skip the pause.
            $isSleep = false;
            echo "Find local file {$htmlI} & skip\n";
        }

        parseContent($content, $logName, $code);

        // Pause between downloaded pages, but not after the last one.
        if ($i != $pageCount && $isSleep) {
            fakeSleep();
        }
    }

    // Printed once, after the last page has been processed.
    echo "+\n";
    echo "+\n";
    echo "+ {$class} done\n";
    echo "+\n";
    echo "+\n";
}
コード例 #2
0
         $httpClient->get($contentUrl);
         $content = $httpClient->getContent();
         //302页面
         /*解析地址*/
         $contentUrl = get_content_url($content);
         echo $contentUrl . "\n";
         $saveContent = $paperName . "\t" . $contentUrl . "\n";
         save($mapFile, $saveContent, "a+");
         //echo "save $saveContent\n";
         /*抓取论文摘要内容*/
         $content = $httpClient->quickGet($contentUrl);
         $contentSize = strlen($content);
         if ($contentSize > 300) {
             save($cachedHtml, $content);
         } else {
             fakeSleep();
         }
     } while ($contentSize < 300);
 } else {
     $sleep = false;
     echo "Hit\n";
     $content = file_get_contents($localedCachedHtml);
     if (strlen($content) < 300) {
         delFile($cachedHtml);
         echo "Empty abstract file \n";
     }
     continue;
 }
 $keyWords = get_key_words($content);
 $keyWords .= "#" . get_mentor($content) . "#" . get_major($content);
 $abs = get_paper_abs($content);