/**
 * Run every selector in $this->element_arr against the DOM object obtained
 * from $this->str and collect the inner text of each match.
 *
 * Result keys have the form "<selector key>_<match index>".
 *
 * Side effects:
 *  - $this->dom_obj is replaced by the value returned by get_htmldom_obj().
 *  - $this->dom_info_arr[$key] receives the match's outer text, tag name,
 *    attributes and the same three details for its parent node.
 *  - $this->tag_arr[$key] receives the match's tag name.
 * Both side arrays are filled for every match, including matches whose
 * text is later dropped from the return value for being empty.
 *
 * @return array Trimmed inner texts keyed as described above, with falsy
 *               entries (e.g. '' and '0') removed. An empty array is returned
 *               when no DOM object could be obtained or nothing matched.
 */
function extract_dom()
{
    // Initialise up front so the trim/filter step below always receives an
    // array, even when there is no DOM object or no selector matches.
    $dom_arr = array();

    $this->dom_obj = get_htmldom_obj($this->str);
    if (!$this->dom_obj) {
        return $dom_arr;
    }

    foreach ($this->element_arr as $k => $v) {
        foreach ($this->dom_obj->find($v) as $k2 => $v2) {
            $key = $k . '_' . $k2;
            $dom_arr[$key] = $v2->innertext;

            // Fetch the parent once; guard against a node without a parent
            // so the details degrade to null instead of raising warnings.
            $parent = $v2->parent();
            $this->dom_info_arr[$key] = array(
                'outertext' => $v2->outertext,
                'parent' => array(
                    'attr' => $parent ? $parent->attr : null,
                    'outertext' => $parent ? $parent->outertext : null,
                    'tag_name' => $parent ? $parent->tag : null,
                ),
                'tag_name' => $v2->tag,
                'attr' => $v2->attr,
            );
            $this->tag_arr[$key] = $v2->tag;
        }
    }

    $dom_arr = array_map('trim', $dom_arr);

    // array_filter() without a callback drops every falsy value.
    return array_filter($dom_arr);
}
/**
 * Extract the optional "from", "author" and "dateline" fields from a page.
 *
 * For each field, $args may carry two entries:
 *  - "<field>_get_rules": the extraction rule; the field is skipped when
 *    this entry is missing or falsy.
 *  - "<field>_get_type":  when loosely equal to 1 the rule is passed to
 *    dom_get_str() together with the DOM object, otherwise it is passed to
 *    str_get_str() together with the raw content.
 *
 * Only these six keys are read. The previous implementation used extract(),
 * which let any key in $args overwrite local variables (including $content)
 * and raised undefined-variable warnings for missing keys.
 *
 * @param string $content Raw page content.
 * @param array  $args    Extraction settings as described above.
 *
 * @return array|false Extracted values, each passed through format_html(),
 *                     keyed by 'from', 'author' and/or 'article_dateline';
 *                     false when $content is empty, no rule is configured,
 *                     or get_htmldom_obj() yields nothing usable.
 */
function get_other_info($content, $args)
{
    if (!$content) {
        return false;
    }
    if (!is_array($args)) {
        $args = array();
    }

    // Settings prefix => key used in the returned array.
    $field_map = array(
        'from' => 'from',
        'author' => 'author',
        'dateline' => 'article_dateline',
    );

    // Keep only the fields that actually have a rule configured.
    $active = array();
    foreach ($field_map as $prefix => $result_key) {
        if (!empty($args[$prefix . '_get_rules'])) {
            $active[$prefix] = $result_key;
        }
    }
    if (!$active) {
        return false;
    }

    $html = get_htmldom_obj($content);
    if (!$html) {
        return false;
    }

    $re = array();
    foreach ($active as $prefix => $result_key) {
        $rules = $args[$prefix . '_get_rules'];
        $type = isset($args[$prefix . '_get_type']) ? $args[$prefix . '_get_type'] : null;

        // Loose comparison is intentional: the type may arrive as 1 or '1'.
        if ($type == 1) {
            $re[$result_key] = dom_get_str($html, $rules);
        } else {
            $re[$result_key] = str_get_str($content, $rules, 'data');
        }
    }

    // Format in a separate pass, after all extraction has finished.
    foreach ($re as $k => $v) {
        $re[$k] = format_html($v);
    }

    $html->clear();
    unset($html);

    return $re;
}
/**
 * Collect the pagination links of a content page.
 *
 * When $this->p_arr['content_page_get_type'] is loosely equal to 1, the
 * rule in $this->p_arr['content_page_rules'] is used as a selector on the
 * DOM object obtained from get_htmldom_obj(); every matched element's href
 * is normalised with $this->format_url(), expanded against $this->base_url
 * via _expandlinks(), and the list is de-duplicated with sarray_unique().
 * Otherwise the work is delegated to string_page_link().
 *
 * @param string $content Raw page content.
 *
 * @return mixed In DOM mode: the de-duplicated link list, an empty array
 *               when no link qualifies, or false when no DOM object could
 *               be obtained. In string mode: whatever string_page_link()
 *               returns.
 */
function get_content_page($content)
{
    if ($this->p_arr['content_page_get_type'] != 1) {
        // String-based extraction.
        return string_page_link($content, $this->p_arr['content_page_rules'], $this->now_url);
    }

    $html = get_htmldom_obj($content);
    if (!$html) {
        return false;
    }

    $item = array();
    foreach ($html->find($this->p_arr['content_page_rules']) as $v) {
        // Matched elements are not guaranteed to carry an href attribute;
        // pass null in that case instead of reading an undefined index.
        $attrs = $v->attr;
        $href = (is_array($attrs) && isset($attrs['href'])) ? $attrs['href'] : null;

        $a_url = $this->format_url($href);

        // Skip empty links, bare anchors and the link whose text equals the
        // 'up_page' language string.
        if (!$a_url || $a_url == '#' || $v->innertext == milu_lang('up_page')) {
            continue;
        }

        $item[] = _expandlinks($a_url, $this->base_url);
    }

    $html->clear();
    unset($html);

    // De-duplicate once after the loop rather than on every iteration, and
    // return a defined value even when nothing was collected.
    return $item ? sarray_unique($item) : array();
}