/**
 * Imports the articles listed in $info['newList'] into the "news_snatch" table.
 *
 * Processing starts at the list position stored in the "ns_idx" cookie (0 when
 * the cookie is empty) and stops after $info['para']['pre_max'] entries when
 * that limit is set. After every entry - including skipped ones - the cookie is
 * advanced to the NEXT unprocessed position, so a later call resumes without
 * repeating or stalling on an entry. The cookie is cleared once the end of the
 * list has been reached.
 *
 * @param array $record Base row. A fresh copy is used for every entry so that
 *                      fields found for one article never leak into the next.
 * @param array $info   Scraper state (by reference). Reads 'newList' (each entry
 *                      uses index 1 = subject, 2 = URL, 3 = date suffix appended
 *                      to the current year, 4 = value stored as item_2),
 *                      'header' (forwarded to GetRemoteContent) and the optional
 *                      'para'/'pre_max' limit; 'counter' is incremented for
 *                      every log line written.
 *
 * @return bool Always true.
 */
function snatchGetList($record, &$info)
{
    global $req;

    // The cookie is client-controlled: normalise it to a non-negative integer.
    $idx = max(0, (int) $req->getCookie("ns_idx"));
    $m = count($info['newList']);

    for ($i = $idx; $i < $m; $i++) {
        if (isset($info['para']['pre_max']) && $i - $idx >= $info['para']['pre_max']) {
            break;
        }
        // $record is passed by value, so each entry starts from the caller's base row.
        snatchSohuProcessEntry($record, $info['newList'][$i], $info);
        // Remember the next position to process, not the one just finished.
        $req->setCookie("ns_idx", $i + 1, 86400);
    }
    if ($i >= $m) {
        // Whole list handled: drop the resume marker.
        $req->setCookie("ns_idx");
    }
    return true;
}

/**
 * Fetches, cleans and stores a single list entry, logging the outcome.
 *
 * Entries whose URL does not contain ".sohu.com" are ignored silently. Entries
 * whose URL is already stored are reported as existing without downloading
 * the article again.
 *
 * @param array $record Base row to fill in (local copy).
 * @param array $entry  One element of $info['newList'].
 * @param array $info   Scraper state (by reference); see snatchGetList().
 *
 * @return void
 */
function snatchSohuProcessEntry($record, $entry, &$info)
{
    global $db, $setting;

    $record['subject'] = $entry[1];
    $record['original'] = "搜狐网";
    $record['url'] = $entry[2];
    $record['add_date'] = date("Y") . "/" . $entry[3];
    $record['item_2'] = $entry[4];

    if (strpos($record['url'], ".sohu.com") === false) {
        return;
    }

    $table = $setting['db']['pre'] . "news_snatch";

    // Look for a duplicate before any network traffic happens.
    if ($db->record($table, "id", array("url", "=", $record['url'])) !== false) {
        snatchSohuLogEntry($info, $record, '<span class="duplicate" style="color:black;">已存在!</span>');
        return;
    }

    $content = GetRemoteContent($record['url'], $info['header']);
    if (!$content) {
        snatchSohuLogEntry($info, $record, '获取<span class="failed" style="color:red;">失败!</span>');
        return;
    }

    // Optional metadata: the source label, keywords and description.
    if (preg_match("/来源:<span.+?>(.+?)<\\/span>/i", $content, $matches)) {
        $record['original'] = strip_tags($matches[1]);
    }
    if (preg_match("/<meta name=\"keywords\" content=\"(.+?)\">/i", $content, $matches)) {
        $record['item_3'] = str_replace(" ", ",", $matches[1]);
    }
    if (preg_match("/<meta name=\"description\" content=\"(.+?)\">/i", $content, $matches)) {
        $record['item_4'] = substrPro(str_replace(" ", ",", $matches[1]), 0, 230);
    }

    $body = snatchSohuExtractBody($content, $record['url'], $info['header']);
    if ($body === false) {
        snatchSohuLogEntry($info, $record, '获取<span class="failed" style="color:red;">失败!</span>');
        return;
    }
    $record['content'] = snatchSohuSanitizeBody($body);

    // item_5 holds the first absolute image URL of the article, unless it is
    // one of the known placeholder images.
    $record['item_5'] = "";
    if (preg_match("/<img.+?src=(.?)(http.+?)\\1.+?>/is", $record['content'], $matches)) {
        $record['item_5'] = $matches[2];
    }
    $placeholders = array(
        "http://images.sohu.com/ccc.gif",
        "http://photo.sohu.com/20040809/Img221437781.gif",
        "http://photocdn.sohu.com/20090828/dot.gif",
    );
    if (in_array($record['item_5'], $placeholders, true)) {
        $record['item_5'] = "";
    }

    snatchSohuLogEntry($info, $record, '获取<span class="succeed" style="color:green;">成功!</span>');
    $db->insert($table, $record);
}

/**
 * Writes one numbered log line for an entry and increments $info['counter'].
 *
 * NOTE(review): subject and URL come from the remote list and are written into
 * the log markup unescaped, exactly as before - confirm that is acceptable.
 *
 * @param array  $info   Scraper state (by reference).
 * @param array  $record Row providing 'url' and 'subject'.
 * @param string $status Markup placed after the link.
 *
 * @return void
 */
function snatchSohuLogEntry(&$info, $record, $status)
{
    snatch_log('<div class="item">' . $info['counter']++ . ' - <a href="' . $record['url'] . '" target="_blank">' . $record['subject'] . '</a> ' . $status . '</div>');
}

/**
 * Extracts the article body from a page, trying each known layout in turn.
 *
 * @param string $content Markup of the article's first page.
 * @param string $url     Article URL, used to derive follow-up page URLs.
 * @param mixed  $header  Forwarded unchanged to GetRemoteContent.
 *
 * @return string|false|null The body markup, or false when no layout matched
 *                           (null only if a cleanup regex itself fails).
 */
function snatchSohuExtractBody($content, $url, $header)
{
    // Layout 1: body wrapped in "正文 st" / "正文 end" marker comments.
    if (preg_match("/<\\!\\-\\- 正文 st \\-\\->[\r\n\\s]+<div.+?>(.+?)<\\/div>[\r\n\\s]+<\\!\\-\\- 正文 end \\-\\->/is", $content, $matches)) {
        return preg_replace(
            array(
                "/<div class\\=\"tagIntg.+?<\\/div>/is",
                "/<div class\\=\"tagHotg.+?<\\/div>/is",
                "/<div class\\=\"editer.+?<\\/div>/is",
            ),
            "",
            $matches[1]
        );
    }

    // Layout 2: "textcont" pages, possibly split over several URLs.
    if (preg_match("/<div class\\=\"textcont\" id\\=\"textcont\">(.+?)<\\/div>/is", $content, $matches)) {
        $pages = array(snatchSohuExtractSlidePage($content));
        if (preg_match("/<span id\\=\"pageNum\">1\\/(\\d+)<\\/span>/is", $content, $matches)) {
            $total = $matches[1];
            for ($n = 1; $n < $total; $n++) {
                // Page n lives at the same URL with "_n" inserted before the extension.
                $pageUrl = preg_replace("/(\\.\\w+)\$/i", "_" . $n . "\\1", $url);
                $pageContent = GetRemoteContent($pageUrl, $header);
                if (!$pageContent) {
                    continue;
                }
                $page = snatchSohuExtractSlidePage($pageContent);
                if ($page !== null) {
                    $pages[$n] = $page;
                }
            }
        }
        return implode("<!-- pagebreak -->", $pages);
    }

    // Layout 3: single "contentText" container.
    if (preg_match("/<div.+?id\\=\"contentText\">(.+?)<\\/div>/is", $content, $matches)) {
        $text = preg_replace("/<div class\\=\"editer.+?<\\/div>/is", "", $matches[1]);
        return snatchSohuTidyParagraphs($text);
    }

    // Layout 4: everything between the "news_c" and "news_s" containers.
    if (preg_match("/<div id\\=\"news_c\".+?>(.+?)<div id\\=\"news_s\"/is", $content, $matches)) {
        return $matches[1];
    }

    return false;
}

/**
 * Returns the cleaned "textcont" text of one page with its "slide_pic" image
 * (when present) prepended in its own paragraph.
 *
 * @param string $html Markup of a single page.
 *
 * @return string|null Null when the page has neither a text block nor an image.
 */
function snatchSohuExtractSlidePage($html)
{
    $text = null;
    if (preg_match("/<div class\\=\"textcont\" id\\=\"textcont\">(.+?)<\\/div>/is", $html, $matches)) {
        $text = preg_replace("/<p class\\=\"editUsr.+?<\\/p>/is", "", $matches[1]);
        $text = snatchSohuTidyParagraphs($text);
    }
    if (preg_match("/<img id\\=\"slide_pic\" src\\=\"(.+?)\" alt\\=\"(.+?)\".*?>/is", $html, $matches)) {
        // A page may carry an image but no text block; treat the text as empty then.
        $text = "<p>" . $matches[0] . "</p>\n" . ($text === null ? "" : $text);
    }
    return $text;
}

/**
 * Collapses "<p>...<p>" and "</p> </p>" runs and trims surrounding whitespace.
 *
 * @param string $html Markup fragment.
 *
 * @return string|null Cleaned fragment (null if a regex fails).
 */
function snatchSohuTidyParagraphs($html)
{
    // Patterns are applied in order, each on the result of the previous one.
    return preg_replace(
        array(
            "/<p>.+?<p>/is",
            "/<\\/p>[\\s\r\n]+?<\\/p>/is",
            "/^[\r\n\\s]+/is",
            "/[\r\n\\s]+\$/is",
        ),
        array("<p>", "</p>", "", ""),
        $html
    );
}

/**
 * Removes scripts, styles, forms, iframes and known page widgets from an
 * article body before it is stored.
 *
 * @param string $html Extracted article body.
 *
 * @return string|null Sanitised body (null if a regex fails).
 */
function snatchSohuSanitizeBody($html)
{
    // The order of the steps is significant and mirrors the original sequence.
    $html = preg_replace(
        array(
            "/<script.+?<\\/script>/is",
            "/<style.+?<\\/style>/is",
            "/<form.+?<\\/form>/is",
            "/<iframe.+?<\\/iframe>/is",
            "/^[\r\n\\s]+/is",
            "/[\r\n\\s]+\$/is",
            // No "s" modifier on the next two: they only match within the last line.
            "/延伸阅读.+\$/",
            "/<DIV class\\=\"tvsubject.+\$/",
        ),
        "",
        $html
    );
    $html = str_replace(array("微博推荐", "我来纠错", '<div class="line"></div>'), "", $html);
    return preg_replace(
        array(
            "/<div class\\=\"stockTrends.+?<\\/div>/s",
            "/<div class\\=\"shareIn.+?<\\/div>/s",
            "/[\r\n]+<div class\\=\"muLink.+?<\\/div>[\r\n]+/",
            "/<DIV class\\=\"tvsubject.+\$/s",
        ),
        "",
        $html
    );
}
Example #2
0
             if ($info["page"] > $info["page_max"]) {
                 break;
             }
             snatch_log('<div class="page" style="font-size:16px;font-weight:bold;">' . sprintf($setting['language']['plugin_news_snatch_info_snatching'], $info["page"]) . '</div>');
             if (snatchGetList($record, $info)) {
                 snatch_log('<div class="succeed" style="color:green;">' . sprintf($setting['language']['plugin_news_snatch_info_snatch_list'], $info["page"], $info['page_count']) . '</div>');
             } else {
                 snatch_log('<div class="failed" style="color:red;">' . $setting['language']['plugin_news_snatch_info_snatch_failed'] . '</div>');
             }
             snatch_log('<div class="split">-------------------------------</div>');
         }
         snatch_log('<div class="page">' . sprintf($setting['language']['plugin_news_snatch_info_snatch_page'], $info["page"] - 1) . '</div>');
     } else {
         snatch_log('<div class="page">' . $setting['language']['plugin_news_snatch_info_snatch_error'] . '</div>');
     }
     snatch_log('<div class="page">' . date("Y-m-d H:i:s") . '</div>');
     $goto_url = $setting['info']['self'];
     break;
 case "news_import":
     ignore_user_abort("on");
     set_time_limit(0);
     //$log_info = $setting['language']['plugin_news_snatch_import'];
     $news_show = array();
     $news_show['news_id'] = 0;
     $news_show['cat_id'] = 1;
     $news_show['web_id'] = 1;
     $news_show['subject'] = "";
     $news_show['style'] = "";
     $news_show['views'] = 0;
     $news_show['describe'] = "";
     $news_show['original'] = "";