/**
 * Scan $this->SourceHtml tag by tag and record head metadata, media and links.
 *
 * The set of tags that are inspected depends on $this->GetLinkType:
 *  - "link":  a, meta, title, /head, body
 *  - "media": img, embed, a ($this->IsHead is set up front, so head tags are never examined)
 *  - other:   img, embed, a, meta, title, /head, body
 *
 * While $this->IsHead is false, meta/title//head/body tags update $this->CharSet,
 * $this->Title and $this->IsHead. Once it is true, img/embed/a tags are passed to
 * InsertMedia()/InsertLink(). HTML comments are skipped as a whole.
 * If no title has been stored by the end, $this->Title is set to $this->BaseUrl.
 */
function Analyser()
{
    $cAtt = new DedeAttribute2();
    $cAtt->IsTagName = false;

    $slen = strlen($this->SourceHtml);

    // Exact tag names to inspect. A list (instead of the removed ereg() alternation)
    // avoids substring hits such as "table" or "span" matching the "a" alternative.
    if ($this->GetLinkType == "link") {
        $needTags = array("a", "meta", "title", "/head", "body");
    } else if ($this->GetLinkType == "media") {
        $needTags = array("img", "embed", "a");
        $this->IsHead = true;
    } else {
        $needTags = array("img", "embed", "a", "meta", "title", "/head", "body");
    }

    // Characters that terminate a tag name.
    $tagBreaks = array(" ", "<", ">", "\r", "\n", "\t");

    for ($i = 0; $i < $slen; $i++) {
        if ($this->SourceHtml[$i] != "<") {
            continue;
        }

        // Read the tag name (at most 11 characters); $i ends on the character that
        // stopped the read, or at $slen when the input ran out.
        $tagStart = $i;
        $tagName = "";
        $j = 0;
        for ($i = $i + 1; $i < $slen; $i++) {
            if ($j > 10) {
                break;
            }
            $j++;
            if (in_array($this->SourceHtml[$i], $tagBreaks, true)) {
                break;
            }
            $tagName .= $this->SourceHtml[$i];
        }
        $tagName = strtolower($tagName);

        // HTML comment. Matching on the "!--" prefix also catches comments whose text
        // starts right after the opener (e.g. "<!--x-->").
        if (strncmp($tagName, "!--", 3) === 0) {
            // Search from just past "<!--" so this comment's own terminator is found.
            $endPos = strpos($this->SourceHtml, "-->", $tagStart + 4);
            if ($endPos !== false) {
                // Park on the closing ">"; the loop increment then resumes on the first
                // character after the comment, so a tag directly following it is seen.
                $i = $endPos + 2;
            }
            continue;
        }

        if (!in_array($tagName, $needTags, true)) {
            continue;
        }

        // Locate the end of the tag. The search starts at $i (not $i + 1) because for
        // an attribute-less tag such as "<title>" $i already sits on the closing ">".
        $endPos = strpos($this->SourceHtml, ">", $i);
        if ($endPos === false) {
            break;
        }
        $attStr = ($endPos > $i) ? substr($this->SourceHtml, $i + 1, $endPos - $i - 1) : "";
        $cAtt->SetSource($attStr);

        if (!$this->IsHead) {
            // Detect HTML head information.
            if ($tagName === "meta") {
                $tmpValue = strtolower($cAtt->GetAtt("http-equiv"));
                if ($tmpValue == "content-type") {
                    $this->CharSet = strtolower($cAtt->GetAtt("charset"));
                }
            } else if ($tagName === "title") {
                $this->Title = $this->GetInnerText($i, "title");
                // NOTE(review): the 12 presumably accounts for the surrounding title
                // markup so the scan resumes after the element - confirm.
                $i += strlen($this->Title) + 12;
            } else if ($tagName === "/head" || $tagName === "body") {
                $this->IsHead = true;
                // NOTE(review): fixed skip kept from the original; intent not visible here.
                $i = $i + 5;
            }
        } else {
            // Lightweight analysis: only media resources and links are collected.
            if ($tagName === "img") {
                // Record the image URL.
                $this->InsertMedia($cAtt->GetAtt("src"), "img");
            } else if ($tagName === "embed") {
                // Record Flash or other embedded media, plus its declared dimensions.
                $rurl = $this->InsertMedia($cAtt->GetAtt("src"), "embed");
                if ($rurl != "") {
                    $this->MediaInfos[$rurl][0] = $cAtt->GetAtt("width");
                    $this->MediaInfos[$rurl][1] = $cAtt->GetAtt("height");
                }
            } else if ($tagName === "a") {
                // Record the hyperlink together with its inner text.
                $this->InsertLink($cAtt->GetAtt("href"), $this->GetInnerText($i, "a"));
            }
        }
    }

    // Fall back to the base URL when the page supplied no title.
    if ($this->Title == "") {
        $this->Title = $this->BaseUrl;
    }
}
/**
 * Parse HTML.
 *
 * Scans $this->SourceHtml tag by tag and hands img/embed sources to InsertMedia()
 * and anchor hrefs (run through FillUrl()) to InsertLink(). HTML comments are
 * skipped as a whole. Which tags are inspected depends on $this->GetLinkType:
 *  - 'link' or '': a
 *  - 'media':      img, embed, a (also sets $this->IsHead to true)
 *  - other:        img, embed, a
 * If $this->Title is empty at the end, it is set to $this->BaseUrl.
 *
 * @access    private
 * @return    void
 */
function Analyser()
{
    $cAtt = new DedeAttribute2();
    $cAtt->IsTagName = false;

    // Default for unrecognised link types: collect every supported tag. Without a
    // default, $needTags was undefined and in_array() received null.
    $needTags = array('img', 'embed', 'a');
    if ($this->GetLinkType == 'link' || $this->GetLinkType == '') {
        $needTags = array('a');
    }
    if ($this->GetLinkType == 'media') {
        $needTags = array('img', 'embed', 'a');
        $this->IsHead = true;
    }

    // Characters that terminate a tag name.
    $tagBreaks = array(' ', '<', '>', "\r", "\n", "\t");

    for ($i = 0; isset($this->SourceHtml[$i]); $i++) {
        if ($this->SourceHtml[$i] != '<') {
            continue;
        }

        // Read the tag name (at most 11 characters); $i ends on the character that
        // stopped the read, or one past the end when the input ran out.
        $tagStart = $i;
        $tagName = '';
        $j = 0;
        for ($i = $i + 1; isset($this->SourceHtml[$i]); $i++) {
            if ($j > 10) {
                break;
            }
            $j++;
            if (in_array($this->SourceHtml[$i], $tagBreaks, true)) {
                break;
            }
            $tagName .= $this->SourceHtml[$i];
        }
        $tagName = strtolower($tagName);

        // HTML comment. Matching on the '!--' prefix also catches comments whose text
        // starts right after the opener (e.g. '<!--x-->').
        if (strncmp($tagName, '!--', 3) === 0) {
            // Search from just past '<!--' so this comment's own terminator is found.
            $endPos = strpos($this->SourceHtml, '-->', $tagStart + 4);
            if ($endPos !== false) {
                // Park on the closing '>'; the loop increment then resumes on the first
                // character after the comment, so a tag directly following it is seen.
                $i = $endPos + 2;
            }
            continue;
        }

        if (!in_array($tagName, $needTags, true)) {
            continue;
        }

        // Locate the end of the tag. The search starts at $i (not $i + 1) because for
        // an attribute-less tag such as '<a>' $i already sits on the closing '>'.
        $endPos = strpos($this->SourceHtml, '>', $i);
        if ($endPos === false) {
            break;
        }
        $attStr = ($endPos > $i) ? substr($this->SourceHtml, $i + 1, $endPos - $i - 1) : '';
        $cAtt->SetSource($attStr);

        if ($tagName === 'img') {
            $this->InsertMedia($cAtt->GetAtt('src'), 'img');
        } elseif ($tagName === 'embed') {
            $rurl = $this->InsertMedia($cAtt->GetAtt('src'), 'embed');
            if ($rurl != '') {
                // Keep the declared dimensions alongside the media URL.
                $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
                $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
            }
        } elseif ($tagName === 'a') {
            $this->InsertLink($this->FillUrl($cAtt->GetAtt('href')), $this->GetInnerText($i, 'a'));
        }

        // Step back so the loop increment re-examines the character that ended the tag name.
        $i--;
    }

    // Fall back to the base URL when no title is set.
    if ($this->Title == '') {
        $this->Title = $this->BaseUrl;
    }
}