예제 #1
0
파일: cnki.php 프로젝트: zyt01/Spider-CNKI
/**
 * Scrape one article page, store what was found in the `articles` table and,
 * when possible, follow the article's reference list.
 *
 * Three page shapes are tried in order:
 *   1. the layout identified by $htmldom["titleid"] (stored as type 1); if a
 *      filename can be parsed from its download link and $times > 0, the
 *      reference list is crawled via RunListApp() with $times - 1;
 *   2. the layout identified by $htmldom["entitleid"] (stored as type 2);
 *   3. no recognisable page at all: title/sid/author are taken from the URL's
 *      own query string (stored as type 3).
 *
 * A failed insert is only echoed for type 1, but terminates the script via
 * die() for types 2 and 3 (behaviour kept from the original implementation).
 *
 * @param string $urllist URL of the article page; also stored in `href`.
 * @param string $from    Value stored in the `toname` column.
 * @param int    $times   Remaining crawl depth. The type-1 path is only taken
 *                        when this is >= 0, and it recurses only when > 0.
 *
 * @return int Always 0.
 */
function RunArticleApp($urllist, $from, $times)
{
    global $htmldom;

    $htmlcode = CreateHtml($urllist);
    $article_information = array('title' => '', 'author' => '', 'abstract' => '', 'filename' => '', 'dbcode' => '', 'keywords' => '', 'sid' => '');

    if (($article_information['title'] = GetArticleText($htmlcode, $htmldom["titleid"])) !== null && $times >= 0) {
        $article_information['author'] = GetArticleText($htmlcode, $htmldom["authorclass"]);
        $article_information['abstract'] = html_encode(GetArticleText($htmlcode, $htmldom["abstractid"]));
        $article_information['keywords'] = merge_spaces(html_encode(GetArticleText($htmlcode, $htmldom["keywordid"])));

        // Nothing is stored for this layout unless the download-link
        // container is present on the page.
        $listrullocation = GetArticleElement($htmlcode, $htmldom["zwjdown"]);
        if ($listrullocation !== null) {
            $temprul = htmlspecialchars_decode(GetArticleHref($listrullocation, 'a'));
            // $times >= 0 is already guaranteed by the enclosing condition.
            if ($temprul !== null) {
                $match = array();
                $matched = preg_match('/filename=([^&]+)&dbcode=([^&]+)&/', $temprul, $match) === 1;
                $article_information['filename'] = $matched ? $match[1] : '';
                $article_information['dbcode'] = $matched ? $match[2] : '';

                $stored = RunArticleApp_InsertArticle(array(
                    'title' => $article_information['title'],
                    'author' => $article_information['author'],
                    'abstract' => $article_information['abstract'],
                    'keywords' => $article_information['keywords'],
                    'filename' => $article_information['filename'],
                    'dbcode' => $article_information['dbcode'],
                    'type' => '1',
                    'href' => html_encode($urllist),
                    'toname' => $from,
                ));
                if ($stored) {
                    RunArticleApp_PrintLink($urllist, $article_information['title'], $times);
                } else {
                    // Best effort: report the failure and keep crawling.
                    echo mysql_error();
                }

                if ($article_information['filename'] != '' && $times > 0) {
                    global $url, $reftype;
                    $listv = GetArticleValue($htmlcode, $htmldom["listv"]);
                    $list_url = GetListUrl($url["domain"], $url["kcms"], $url["detail"], $url["frame"], $url["listfile"], $article_information['filename'], $article_information['dbcode'], $reftype["reference"], $listv);
                    // Release what is no longer needed before descending, so
                    // it is not held for the whole depth of the recursion.
                    unset($listv, $temprul, $listrullocation, $match);
                    RunListApp($list_url, $article_information['filename'], $times - 1);
                }
            }
        }
    } elseif (($article_information['title'] = GetArticleText($htmlcode, $htmldom["entitleid"])) !== null) {
        $article_information['filename'] = GetArticleValue($htmlcode, $htmldom["filenameid"]);
        $article_information['dbcode'] = GetArticleValue($htmlcode, $htmldom["tablenameid"]);
        $article_information['author'] = merge_spaces(merge_ques(html_encode(GetArticleIndexText($htmlcode, $htmldom["strContext"], 0))));

        // The positions of abstract/keywords depend on how many "strContext"
        // elements the page carries: 7 => keywords at 5, abstract at 6;
        // 6 => abstract at 5 and no keywords.
        $context_count = GetArticleElementsNum($htmlcode, $htmldom["strContext"]);
        if ($context_count == 7) {
            $article_information['abstract'] = merge_spaces(merge_ques(html_encode(GetArticleIndexText($htmlcode, $htmldom["strContext"], 6))));
            $article_information['keywords'] = merge_spaces(merge_ques(html_encode(GetArticleIndexText($htmlcode, $htmldom["strContext"], 5))));
        } elseif ($context_count == 6) {
            $article_information['abstract'] = merge_spaces(merge_ques(html_encode(GetArticleIndexText($htmlcode, $htmldom["strContext"], 5))));
        }

        $stored = RunArticleApp_InsertArticle(array(
            'title' => $article_information['title'],
            'author' => $article_information['author'],
            'abstract' => $article_information['abstract'],
            'keywords' => $article_information['keywords'],
            'filename' => $article_information['filename'],
            'dbcode' => $article_information['dbcode'],
            'type' => '2',
            'href' => html_encode($urllist),
            'toname' => $from,
        ));
        if (!$stored) {
            die(mysql_error());
        }
        RunArticleApp_PrintLink($urllist, $article_information['title'], $times);
    } else {
        // No recognisable page: fall back to the metadata in the URL itself.
        $match = array();
        if (preg_match('/title=([^&]+)&sid=([^&]+)&aufirst=([^&]+)/', $urllist, $match)) {
            // All three groups are mandatory, so a match always fills 1..3.
            $article_information['title'] = html_encode($match[1]);
            $article_information['sid'] = html_encode($match[2]);
            $article_information['author'] = html_encode($match[3]);

            $stored = RunArticleApp_InsertArticle(array(
                'title' => $article_information['title'],
                'author' => $article_information['author'],
                'sid' => $article_information['sid'],
                'type' => '3',
                'href' => html_encode($urllist),
                'toname' => $from,
            ));
            if (!$stored) {
                die(mysql_error());
            }
            RunArticleApp_PrintLink($urllist, $article_information['title'], $times);
        }
    }

    $htmlcode->clear();
    return 0;
}

/**
 * Run `insert ignore into articles` for one row given as column => value.
 *
 * Every value is escaped with mysql_real_escape_string() so quotes or
 * backslashes in scraped text cannot break out of the string literal.
 * NOTE(review): some callers pass values that already went through the
 * project's html_encode(); this assumes html_encode() does not itself add
 * SQL backslash escaping - confirm, otherwise those values get stored with
 * the extra backslashes.
 *
 * @param array $row Column name => value. Column names must be trusted
 *                   literals; only the values are escaped.
 *
 * @return mixed Result of mysql_query(): true on success, false on failure.
 */
function RunArticleApp_InsertArticle(array $row)
{
    $columns = array();
    $values = array();
    foreach ($row as $column => $value) {
        $columns[] = '`' . $column . '`';
        $values[] = '"' . mysql_real_escape_string((string) $value) . '"';
    }

    return mysql_query('insert ignore into `articles` (' . implode(', ', $columns) . ') values (' . implode(', ', $values) . ');');
}

/**
 * Echo the progress line for a stored article: a link to the page followed by
 * the remaining crawl depth.
 *
 * NOTE(review): URL and title are emitted without HTML escaping, exactly as
 * before; escape them here if this output is ever served to a browser.
 *
 * @param string $urllist URL used as the link target.
 * @param string $title   Link text.
 * @param int    $times   Remaining crawl depth shown next to the link.
 *
 * @return void
 */
function RunArticleApp_PrintLink($urllist, $title, $times)
{
    echo "<a href=" . $urllist . ">" . $title . "</a> <strong>" . $times . "</strong> \n";
}
예제 #2
0
파일: Search.php 프로젝트: ahmatjan/lv
    /**
     * Render the search results page.
     *
     * Reads `query`, `type`, `url` and `results` from the query string, asks
     * the search model for matching rows, scores every row against the
     * space-separated keywords, highlights the keywords in a short
     * title/content excerpt, orders the rows by score (highest first) and
     * renders the `tools/search` view together with pagination links.
     *
     * @return void
     */
    public function index()
    {
        $search = array();
        $data = array();
        $header = array();
        $config = array();

        // Search keywords: trimmed, passed through merge_spaces() and capped
        // at 30 characters.
        $raw_query = $this->input->get('query');
        $query = $raw_query ? mb_substr(merge_spaces(trim($raw_query)), 0, 30, 'utf-8') : '';
        $search['query'] = $query;
        $data['query'] = $query;

        // Match mode: and / or.
        $type = $this->input->get('type');
        $search['type'] = $type ? $type : 'or';

        // Whether links take part in the search weighting.
        $url_mode = $this->input->get('url');
        $search['url'] = $url_mode ? $url_mode : 'all';

        // Pagination position (this is also the pagination query-string segment).
        $results = $this->input->get('results');
        $search['results'] = $results ? $results : '0';

        // Rows shown per page.
        $quantity_view = $this->base_setting->get_setting('quantity_view');
        $search['quantity_view'] = $quantity_view;

        $spider_all = $this->search_model->get_spider_like($search);

        // Keywords used for both scoring and highlighting.
        $q_arr = explode(' ', $query);

        // Left layout block.
        $this->load->module('common/module_left');
        $data['module_left'] = $this->module_left->index();
        // Right layout block.
        $this->load->module('common/module_right');
        $data['module_right'] = $this->module_right->index();
        // Bottom block.
        $this->load->module('common/module_bottom');
        $data['module_bottom'] = $this->module_bottom->index();

        // Extra CSS for .span9 when only the right block is present.
        if (!empty($data['module_right']) && empty($data['module_left'])) {
            $header['style'] = '
			<style type="text/css">
			@media(min-width: 980px){
				.search .row-fluid .span9{
					padding-right: 150px;
				}
			}
			</style>
			';
        }

        $header['title'] = $query;
        $header['css_page_style'] = array('public/css/blog.css', 'public/css/jquery.fancybox.css', 'public/css/jquery.gritter.css');
        $this->public_section->get_header($header);
        $this->public_section->get_top();

        // Build one result row per hit.
        $spider = array();
        foreach ($spider_all['content'] as $s_k => $s_v) {
            $row_title = $s_v['title'];
            $row_content = $s_v['content'];

            // Weight stays '' when there is no query to score against.
            $weight = '';
            if (!empty($query)) {
                $title_strlen = strlen($row_title);
                $content_strlen = strlen($row_content);
                $title_sum = 0;
                $content_sum = 0;
                foreach ($q_arr as $keyword) {
                    $keyword_strlen = strlen($keyword);
                    // keyword length / text length, plus similarity; an empty
                    // title or content contributes 0 instead of dividing by 0.
                    $title_ratio = $title_strlen > 0 ? $keyword_strlen / $title_strlen : 0;
                    $content_ratio = $content_strlen > 0 ? $keyword_strlen / $content_strlen : 0;
                    $title_sum += $title_ratio * 0.2 + $this->similarity->getSimilar($keyword, $row_title) * 0.4;
                    $content_sum += $content_ratio * 0.12 + $this->similarity->getSimilar($keyword, $row_content) * 0.28;
                }
                $weight = $title_sum + $content_sum;
            }

            // Excerpts with every keyword highlighted.
            $title = mb_substr($row_title, 0, 18, 'utf-8');
            $content = mb_substr($row_content, 0, 100, 'utf-8');
            foreach ($q_arr as $keyword) {
                $title = highlight_phrase($title, $keyword);
                $content = highlight_phrase($content, $keyword);
            }

            $spider[$s_k] = array(
                'weight' => $weight,
                'url' => $s_v['url'],
                'title' => $this->public_section->word_censor($title),
                'content' => $this->public_section->word_censor($content),
            );
        }

        // Order rows by weight, highest first.
        if (!empty($spider)) {
            $weight_order = array();
            foreach ($spider as $row) {
                $weight_order[] = $row['weight'];
            }
            array_multisort($weight_order, SORT_DESC, $spider);
        }
        $data['results'] = $spider;

        // Pagination. The base URL carries the current search parameters,
        // URL-encoded so spaces, '&' or '#' in the query cannot break links.
        $url = http_build_query(array(
            'type' => $search['type'],
            'url' => $search['url'],
            'query' => $query,
        ), '', '&');
        $config['full_tag_open'] = '<ul>';
        $config['full_tag_close'] = '</ul>';
        $config['first_link'] = '首页';
        $config['last_link'] = '尾页';
        $config['first_tag_open'] = '<li>';
        $config['next_tag_open'] = '<li>';
        $config['prev_tag_open'] = '<li>';
        $config['last_tag_open'] = '<li>';
        $config['cur_tag_open'] = '<li class="active"><a>';
        $config['num_tag_open'] = '<li>';
        $config['first_tag_close'] = '</li>';
        $config['next_tag_close'] = '</li>';
        $config['last_tag_close'] = '</li>';
        $config['prev_tag_close'] = '</li>';
        $config['cur_tag_close'] = '</a></li>';
        $config['num_tag_close'] = '</li>';
        $config['next_link'] = '下一页';
        $config['prev_link'] = '上一页';
        $config['base_url'] = site_url('search?' . $url);
        $config['total_rows'] = $spider_all['count'];
        // Rows per page.
        $config['per_page'] = $quantity_view;
        $config['page_query_string'] = TRUE;
        $config['query_string_segment'] = 'results';
        // These two options were previously written to a stray $config1 array
        // and therefore never reached the pagination library.
        if ($this->agent->is_mobile()) {
            $config['display_pages'] = FALSE;
        } else {
            $config['num_links'] = '2';
        }
        $this->pagination->initialize($config);

        $data['count'] = $spider_all['count'];
        $data['search_page'] = $this->pagination->create_links();
        $this->load->view('tools/search', $data);
        $this->public_section->get_footer();
    }