/**
 * Build the article data array for one fetched page.
 *
 * Flow, as implemented below:
 *  1. Blank content (after trim) short-circuits with null.
 *  2. The rule type is resolved: a non-empty $_GET['get_type'] wins (via intval),
 *     otherwise the 'get_type' entry of $args, otherwise 1.
 *  3. If match_rules() yields an array, the article is produced by
 *     rules_get_article() and passed to write_evo_errlog().
 *  4. Otherwise cloud_match_rules() is tried; when that gives no 'content' and
 *     the 'fp_open_auto' setting equals 1, HtmlExtractor::get_text() is used and
 *     its content is post-processed (attachments, image URLs, DOM filtering).
 *  5. When $_GET['type'] is 'bbs', the content is run through
 *     media_htmlbbcode() and img_htmlbbcode().
 *  6. The value of d_e(0, 'evo') is stored under 'evo_time'.
 *
 * @param string $content Raw page content.
 * @param string $url     URL the content was fetched from.
 * @param array  $args    Optional values; extract()ed into local scope. Only
 *                        'get_type' is read by this function.
 *
 * @return array|null Article data including 'evo_time', or null for blank content.
 */
function get_single_article($content, $url, $args = array())
{
    // Not read in this function; the declaration is retained so extract()
    // below runs in exactly the same scope as before.
    global $_G;

    // Pre-initialised so the fallback chain never touches an undefined
    // variable when $args carries no 'get_type'.
    $get_type = 0;
    extract($args);

    if (strlen(trim($content)) < 1) {
        return null;
    }

    d_s('evo');

    // Request parameter overrides the caller-supplied value; empty() avoids
    // the undefined-index warning when the parameter is absent.
    if (!empty($_GET['get_type'])) {
        $get_type = intval($_GET['get_type']);
    }
    if (!$get_type) {
        $get_type = 1;
    }

    $milu_set = pick_common_get();
    $rules_info = match_rules($url, $content, $get_type, 0);

    if (is_array($rules_info)) {
        pload('F:fastpick');
        $data = rules_get_article($content, $rules_info);
        write_evo_errlog($data, $url, $rules_info);
    } else {
        // Download rules from the cloud. This should be optimised; no good
        // approach has been found yet.
        $data = (array) cloud_match_rules($get_type, $url, $content);

        $auto_enabled = isset($milu_set['fp_open_auto']) && $milu_set['fp_open_auto'] == 1;
        if (empty($data['content']) && $auto_enabled) {
            // Smart (automatic) extraction is switched on.
            pload('C:HtmlExtractor');
            pload('F:article');

            $he = new HtmlExtractor($content, $url);
            $data = (array) $he->get_text();

            $extracted = isset($data['content']) ? $data['content'] : null;
            $data['content'] = dz_attach_format($url, $extracted);

            $arr = format_article_imgurl($url, $data['content']);
            $data['content'] = $arr['message'];

            // Selectors handed to dom_filter_something() one by one.
            $del_dom_rules = array('div[id*=share]', 'div[class*=page]');
            foreach ($del_dom_rules as $selector) {
                $data['content'] = dom_filter_something($data['content'], $selector, 2);
            }

            unset($data['evo_title_info']);
        }
    }

    if (isset($_GET['type']) && $_GET['type'] == 'bbs') {
        // 'content' may be missing when neither the cloud rules nor the
        // automatic extractor produced anything; pass null as before, but
        // without triggering an undefined-index warning.
        $body = isset($data['content']) ? $data['content'] : null;
        $body = media_htmlbbcode($body, $url);
        $data['content'] = img_htmlbbcode($body, $url);
    }

    $data['evo_time'] = d_e(0, 'evo');

    return $data;
}
/**
 * extract() is expected to return the lower-cased words of the title
 * followed by those of the body, in document order.
 */
public function testExtract()
{
    $html = '<html><head><title>Mary is very tall.</title></head><body>She was in the 9th grade.</body></html>';
    $expectedWords = ['mary', 'is', 'very', 'tall', 'she', 'was', 'in', 'the', '9th', 'grade'];

    $subject = new HtmlExtractor();
    $actualWords = $subject->extract($html);

    static::assertEquals($expectedWords, $actualWords);
}