예제 #1
0
	/**
	 * Scan $this->SourceHtml tag by tag and collect head info, media and links.
	 *
	 * $this->GetLinkType selects which tags are looked at:
	 *   "link"  - a, meta, title, /head, body
	 *   "media" - img, embed, a (and IsHead is set to true up front)
	 *   other   - all of the above
	 *
	 * While $this->IsHead is false, <meta http-equiv="content-type"> sets
	 * $this->CharSet, <title> sets $this->Title, and </head> or <body> flips
	 * IsHead to true. Once IsHead is true, <img>/<embed> sources go to
	 * InsertMedia() and <a> targets go to InsertLink().
	 * If no title was found, Title falls back to $this->BaseUrl.
	 */
	function Analyser()
	{
		$cAtt = new DedeAttribute2();
		$cAtt->IsTagName = false;
		$slen = strlen($this->SourceHtml);
		// Characters that terminate a tag name.
		$tagBreaks = " <>\r\n\t";

		// Exact tag names instead of a regex: ereg() is gone since PHP 7, and
		// an unanchored "a|meta|..." pattern also matched table, span, etc.
		if($this->GetLinkType=="link")
		{ $needTags = array("a","meta","title","/head","body"); }
		else if($this->GetLinkType=="media")
		{ $needTags = array("img","embed","a"); $this->IsHead = true; }
		else
		{ $needTags = array("img","embed","a","meta","title","/head","body"); }

		for($i = 0; $i < $slen; $i++)
		{
			if($this->SourceHtml[$i]!="<") continue;

			// Read the tag name: at most 11 characters, stopping at a break char.
			// Afterwards $i sits on the character that ended the name.
			$tagStart = $i;
			$tagName = "";
			for($i=$i+1, $j=0; $i < $slen && $j <= 10; $i++, $j++){
				$ch = $this->SourceHtml[$i];
				if(strpos($tagBreaks,$ch)!==false) break;
				$tagName .= $ch;
			}
			$tagName = strtolower($tagName);

			// Comment: also recognised when text follows "<!--" without a space.
			if(strncmp($tagName,"!--",3)==0){
				// Start at the "--" of the opener so "<!-->" and "<!--->" close too.
				$endPos = strpos($this->SourceHtml,"-->",$tagStart+2);
				// Land on the ">" of "-->"; the loop's $i++ then moves to the
				// first character after the comment instead of skipping it.
				if($endPos!==false) $i = $endPos+2;
				continue;
			}

			if(!in_array($tagName,$needTags,true)) continue;

			// Tag name ran to the end of the input: there is no tag body to read.
			if($i >= $slen) break;

			if($this->SourceHtml[$i]==">"){
				// Tag without attributes; do not read into the following markup.
				$attStr = "";
			}else{
				$endPos = strpos($this->SourceHtml,">",$i+1);
				if($endPos===false) break;
				$attStr = substr($this->SourceHtml,$i+1,$endPos-$i-1);
			}
			$cAtt->SetSource($attStr);

			if(!$this->IsHead)
			{
				// Still in the document head: pick up charset and title.
				if($tagName=="meta"){
					$tmpValue = strtolower($cAtt->GetAtt("http-equiv"));
					if($tmpValue=="content-type"){
						$this->CharSet = strtolower($cAtt->GetAtt("charset"));
					}
				}
				else if($tagName=="title"){
					$this->Title = $this->GetInnerText($i,"title");
					// Skip past the title text and the surrounding tag characters.
					$i += strlen($this->Title)+12;
				}
				else if($tagName=="/head"||$tagName=="body"){
					$this->IsHead = true;
					$i = $i+5;
				}
			}
			else
			{
				// Body: only resource URLs and links are collected, not text.
				if($tagName=="img"){
					$this->InsertMedia($cAtt->GetAtt("src"),"img");
				}
				else if($tagName=="embed"){
					// Embedded media: also remember the declared dimensions.
					$rurl = $this->InsertMedia($cAtt->GetAtt("src"),"embed");
					if($rurl != ""){
						$this->MediaInfos[$rurl][0] = $cAtt->GetAtt("width");
						$this->MediaInfos[$rurl][1] = $cAtt->GetAtt("height");
					}
				}
				else if($tagName=="a"){
					// Hyperlink: target plus the anchor's inner text.
					$this->InsertLink($cAtt->GetAtt("href"),$this->GetInnerText($i,"a"));
				}
			}
		}
		if($this->Title=="") $this->Title = $this->BaseUrl;
	}
예제 #2
0
 /**
  *  Parse the HTML in $this->SourceHtml and collect links and media.
  *
  *  $this->GetLinkType selects the tags that are examined: 'link' or ''
  *  collects <a> only; 'media' collects <img>, <embed> and <a> and sets
  *  $this->IsHead to true. Any other value collects nothing.
  *  <img>/<embed> sources are passed to InsertMedia(); <a> targets are run
  *  through FillUrl() and passed to InsertLink() with the anchor's inner text.
  *  If $this->Title is empty afterwards it is set to $this->BaseUrl.
  *
  * @access    private
  * @return    void
  */
 function Analyser()
 {
     $cAtt = new DedeAttribute2();
     $cAtt->IsTagName = false;

     // Start from an empty set so an unrecognised GetLinkType never reaches
     // in_array() with an undefined variable (a TypeError on PHP 8).
     $needTags = array();
     if ($this->GetLinkType == 'link' || $this->GetLinkType == '') {
         $needTags = array('a');
     }
     if ($this->GetLinkType == 'media') {
         $needTags = array('img', 'embed', 'a');
         $this->IsHead = true;
     }
     // Characters that terminate a tag name.
     $tagbreaks = array(' ', '<', '>', "\r", "\n", "\t");

     for ($i = 0; isset($this->SourceHtml[$i]); $i++) {
         if ($this->SourceHtml[$i] != '<') {
             continue;
         }

         // Read the tag name: at most 11 characters, stopping at a break char.
         // Afterwards $i sits on the character that ended the name.
         $tagStart = $i;
         $tagName = '';
         for ($i = $i + 1, $j = 0; $j <= 10 && isset($this->SourceHtml[$i]); $i++, $j++) {
             if (in_array($this->SourceHtml[$i], $tagbreaks, true)) {
                 break;
             }
             $tagName .= $this->SourceHtml[$i];
         }
         $tagName = strtolower($tagName);

         // Comment: also recognised when text follows "<!--" without a space.
         if (strncmp($tagName, '!--', 3) == 0) {
             // Start at the "--" of the opener so "<!-->" and "<!--->" close too.
             $endPos = strpos($this->SourceHtml, '-->', $tagStart + 2);
             if ($endPos !== false) {
                 // Land on the ">" of "-->"; the loop's $i++ then moves to the
                 // first character after the comment instead of skipping it.
                 $i = $endPos + 2;
             }
             continue;
         }

         if (!in_array($tagName, $needTags, true)) {
             continue;
         }

         // Tag name ran to the end of the input: there is no tag body to read.
         if (!isset($this->SourceHtml[$i])) {
             break;
         }

         if ($this->SourceHtml[$i] == '>') {
             // Tag without attributes; do not read into the following markup.
             $attStr = '';
         } else {
             $endPos = strpos($this->SourceHtml, '>', $i + 1);
             if ($endPos === false) {
                 break;
             }
             $attStr = substr($this->SourceHtml, $i + 1, $endPos - $i - 1);
         }
         $cAtt->SetSource($attStr);

         if ($tagName == 'img') {
             $this->InsertMedia($cAtt->GetAtt('src'), 'img');
         } elseif ($tagName == 'embed') {
             // Embedded media: also remember the declared dimensions.
             $rurl = $this->InsertMedia($cAtt->GetAtt('src'), 'embed');
             if ($rurl != '') {
                 $this->MediaInfos[$rurl][0] = $cAtt->GetAtt('width');
                 $this->MediaInfos[$rurl][1] = $cAtt->GetAtt('height');
             }
         } elseif ($tagName == 'a') {
             $this->InsertLink($this->FillUrl($cAtt->GetAtt('href')), $this->GetInnerText($i, 'a'));
         }

         // Step back so the loop's $i++ re-examines the character that ended
         // the tag name (it may itself be a "<" opening the next tag).
         $i--;
     }

     if ($this->Title == '') {
         $this->Title = $this->BaseUrl;
     }
 }