Exemplos de código com HTMLParser em PHP

Exemplo n.º 1

0

Exibir arquivo

Arquivo: Extract.php Projeto: mizterp/PHP-Web-Article-Extractor

 /**
  *	Extracts an article from HTML by parsing the page and then passing the
  *	parsed result through a fixed sequence of filters and one merger.
  *
  *	Every step below receives the same $article value returned by the parser;
  *	the steps run in exactly the order written.
  *
  *	@param  string  $rawHTMLPage the raw HTML from which to extract an article
  *	@param  string  $source      value stored on the result's `source` property (defaults to "")
  *	@return Article extraction result
  */
 public static function extractFromHTML($rawHTMLPage, $source = "")
 {
     $parser = new HTMLParser();
     // Parse HTML into blocks
     $article = $parser->parse($rawHTMLPage);
     // Filter out clean article title
     Filters\TitleFilter::filter($article);
     // Discover article 'end' points using syntactic terminators
     Filters\EndBlockFilter::filter($article);
     // Filter content using word count and link density using algorithm from Machine learning
     Filters\NumberOfWordsFilter::filter($article);
     // Filter blocks that come after content
     Filters\PostcontentFilter::filter($article);
     // Merge close blocks
     Mergers\CloseBlockMerger::merge($article);
     // Remove blocks that are not content
     Filters\NonContentFilter::filter($article);
     // Mark largest block as 'content'
     Filters\LargestBlockFilter::filter($article);
     // Mark blocks found between the title and main content as content as well
     Filters\BetweenTitleAndContentFilter::filter($article);
     // Post-extraction cleanup removing now irrelevant blocks and sets full title
     Filters\PostextractionFilter::filter($article);
     // Scans article line by line removing non-content on a per-line basis
     Filters\LineFilter::filter($article);
     // Determine document language
     Filters\LanguageFilter::filter($article);
     // Filter keywords from the article document
     Filters\KeywordFilter::filter($article);
     // Record where the page came from; set last, after all filters have run
     $article->source = $source;
     return $article;
 }

Exemplo n.º 2

0

Exibir arquivo

Arquivo: Compiler.php Projeto: icybee/patron

 /**
  * Parses a template string into an HTML tree and passes that tree to
  * parse_html_tree(), returning its result.
  *
  * Parser errors are turned into exceptions: the handler registered under
  * HTMLParser::T_ERROR_HANDLER formats the message with its arguments and
  * throws.
  *
  * @param string $template
  *
  * @return array
  */
 public function __invoke($template)
 {
     $raise_parse_error = function ($message, array $args) {
         throw new \Exception(\ICanBoogie\format($message, $args));
     };
     $options = [HTMLParser::T_ERROR_HANDLER => $raise_parse_error];
     $html_parser = new HTMLParser($options);

     return $this->parse_html_tree($html_parser->parse($template, Engine::PREFIX));
 }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: CrawlOutgoingHTMLParser.php Projeto: kba/rssscrpr

 /**
  * Follows outgoing links level by level and appends the <body> of every
  * page reached at the last level to the original session's document.
  *
  * `xpathOutgoing` is split on commas into one XPath expression per level.
  * Starting from the session's own URL, each level's expression is run on
  * every page of that level and the text content of the matched nodes
  * becomes the URL list of the next level.
  *
  * @param Session $session session whose `url` is the crawl root and whose `dom` receives the imported bodies
  */
 public function parse(Session $session)
 {
     if (!$this->xpathOutgoing) {
         Utils::throw400("Must set 'xpathOutgoing'");
     }
     // the list of xpaths to find outgoing links, ordered by level of hierarchy
     $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);
     // Let the HTMLParser parse, so we have a DOM
     parent::parse($session);
     // The urls to iterate through in this level of hierarchy
     $crawlUrls = array($session->url);
     // Step through the outgoing link xpaths ($i is the level, used in error messages)
     foreach ($xpathOutgoingList as $i => $thisLevelXpath) {
         $nextLevelUrls = array();
         foreach ($crawlUrls as $url) {
             $subsession = $this->loadSubsession($url);
             // Query for URLs of pages to further recurse
             $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
             // NOTE(review): throw400() is called without `throw` above, so it presumably
             // throws by itself; the `throw` keyword is kept as in the original — confirm.
             if ($outLinkNodes === false) {
                 throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
             }
             if ($outLinkNodes->length === 0) {
                 throw Utils::throw400("No results for query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
             }
             foreach ($outLinkNodes as $outLinkNode) {
                 $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
             }
         }
         $crawlUrls = $nextLevelUrls;
     }
     // Concatenate all the <body> elements into the original document
     foreach ($crawlUrls as $url) {
         $subsession = $this->loadSubsession($url);
         $body = $subsession->dom->getElementsByTagName('body')->item(0);
         if ($body === null) {
             // item(0) is null when the page has no <body>; importNode() cannot take null
             continue;
         }
         $session->dom->documentElement->appendChild($session->dom->importNode($body, true));
     }
     // NOTE(review): writes the merged document to a fixed /tmp path — looks like a debugging aid; confirm it is wanted
     $session->dom->save('/tmp/test3.html');
 }

 /**
  * Creates a session for $url, fetches it through a CachingHttpFetcher and
  * parses it with a plain (non-crawling) HTMLParser.
  *
  * @param string $url
  * @return Session the session after fetch and parse
  */
 private function loadSubsession($url)
 {
     $subsession = new Session($url);
     $fetcher = new CachingHttpFetcher();
     $fetcher->fetch($subsession);
     $parser = new HTMLParser();
     $parser->parse($subsession);
     return $subsession;
 }

Exemplo n.º 4

0

Exibir arquivo

Arquivo: PriceMatcher.php Projeto: helin16/pricematch

 /**
  * Getting the price match result
  *
  * Fetches the price list for $sku from staticice.com.au, matches each listed
  * company against the given aliases and records the matched prices together
  * with the lowest one.
  *
  * @param array  $companyAliases company key => array of alias strings; each entry is also read for 'PriceMatchCompanyId'
  * @param string $sku
  * @param number $myPrice
  *
  * @return array with keys 'sku', 'myPrice', 'minPrice', 'companyPrices' (per company key: 'price', 'priceURL', 'PriceMatchCompanyId') and 'priceDiff'
  */
 public static function getPrices($companyAliases, $sku, $myPrice)
 {
     $myPrice = StringUtilsAbstract::getValueFromCurrency($myPrice);
     //initialize values
     $finalOutputArray = array('sku' => $sku, 'myPrice' => $myPrice, 'minPrice' => 0, 'companyPrices' => array());
     foreach ($companyAliases as $key => $value) {
         $finalOutputArray['companyPrices'][$key] = array('price' => 0, 'priceURL' => '', 'PriceMatchCompanyId' => $value['PriceMatchCompanyId']);
     }
     $url = 'http://www.staticice.com.au/cgi-bin/search.cgi';
     //getting actual values
     $productPriceArray = HTMLParser::getPriceListForProduct($url, $sku);
     foreach ($productPriceArray as $productPriceInfo) {
         if (($companyDetails = trim($productPriceInfo['companyDetails'])) === '') {
             continue;
         }
         // The company URL is the second-to-last '|'-separated field when there is one,
         // otherwise the whole details string.
         $cdArray = explode('|', $companyDetails);
         $companyURL = isset($cdArray[count($cdArray) - 2]) ? trim($cdArray[count($cdArray) - 2]) : trim($companyDetails);
         // Lower-cased once here instead of once per alias list.
         $companyURL = strtolower($companyURL);
         foreach ($companyAliases as $key => $value) {
             // 'strtolower' as a callable replaces create_function(), which no longer exists in PHP 8.
             if (is_array($value) === true && in_array($companyURL, array_map('strtolower', $value), true)) {
                 // Drop thousands separators, the currency sign and spaces.
                 $price = str_replace(array(',', '$', ' '), '', $productPriceInfo['price']);
                 if ($finalOutputArray['minPrice'] == 0 || $finalOutputArray['minPrice'] > $price) {
                     $finalOutputArray['minPrice'] = $price;
                 }
                 $finalOutputArray['companyPrices'][$key] = array('price' => $price, 'priceURL' => HTMLParser::getHostUrl($url) . $productPriceInfo['priceLink'], 'PriceMatchCompanyId' => $value['PriceMatchCompanyId']);
                 // First matching company wins for this price entry.
                 break;
             }
         }
     }
     //return the result
     $finalOutputArray['priceDiff'] = $finalOutputArray['myPrice'] - $finalOutputArray['minPrice'];
     return $finalOutputArray;
 }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: HTMLIncludeFile.php5 Projeto: BackupTheBerlios/freeform-frmwrk

  /**
   * Replaces this node's children with the parsed content of an HTML file.
   *
   * Without a 'name' attribute the file name is read from the document
   * variable named by the 'key' attribute. With a 'name' attribute the name
   * is resolved through a package: the one given in the 'package' attribute,
   * else the package of the requested 'action', else the package of the
   * 'action.default' property of the 'freeform' package (or 'freeform' itself).
   *
   * Returns self::PROCESS_BODY when a file was parsed and added,
   * self::SKIP_BODY otherwise.
   */
  function onOpen() {
    $this->removeAll();
    $doc = $this->getDocument();

    $name = $this->getAttribute('name');
    if(!$name) {
      $name = $doc->getVariable($this->getAttribute('key'));
    } else {
      $pkg = $this->getAttribute('package');
      if($pkg) {
        // NOTE(review): prints the package attribute to the output — looks like a debugging leftover; confirm
        echo $pkg;
        $pkg = Package::getPackageByName($pkg);
      } else {
        // Try to find current package (where calling action defined)
        $action = $doc->getResponce()->getRequest()->getParameter('action');
        if($action) {
          $pkg = Package::getPackageByName(Package::getPackageNameForClass($action));
        } else {
          $pkg = Package::getPackageByName('freeform');
          if($action = $pkg->getProperty('action.default')) {
            $pkg = Package::getPackageByName(Package::getPackageNameForClass($action));
          }
        }
      }
      if($pkg) {
        $name = $pkg->getResourcePath($name);
      }
    }

    if(!$name) {
      return self::SKIP_BODY;
    }

    $parser = new HTMLParser($this->getDocument());
    $root = $parser->parse($name);
    if(!$root) {
      return self::SKIP_BODY;
    }

    $root->setExposed(false);
    $this->addNode($root);
    return self::PROCESS_BODY;
  }

Exemplo n.º 6

0

Exibir arquivo

Arquivo: teamspeak.class.php Projeto: kyroskoh/Teamspeak3_LatestVersion

 /**
  * Downloads the listing page for one version and looks through its table
  * rows for a cell whose text passes doesElementMatchServerBinaryRules().
  *
  * Only the scan of the current row stops at a match; later rows are still
  * scanned, so the value returned comes from the last row that had a match.
  * Prints "ERROR" and returns null when the fetch reports a status.
  *
  * @param string $baseURL base URL of the release listing
  * @param string $version version directory name appended to $baseURL
  * @return string|null matching cell text, or null when nothing matched
  */
 function doesVersionContainServerBinary($baseURL, $version)
 {
     $fetcher = new HTMLParser();
     $fetcher->setURL($baseURL . $version . "/");
     $fetcher->getHTML();

     if ($fetcher->Status != null) {
         echo "ERROR";
         return null;
     }

     $found = null;
     $document = new DOMDocument();
     $document->loadHTML($fetcher->Contents);
     foreach ($document->getElementsByTagName("tr") as $row) {
         foreach ($row->childNodes as $cell) {
             $cellText = $cell->nodeValue;
             if ($this->doesElementMatchServerBinaryRules($cellText)) {
                 $found = $cellText;
                 break;
             }
         }
     }
     return $found;
 }

Exemplo n.º 7

0

Exibir arquivo

Arquivo: tests.php Projeto: jasherai/libwebta

 /**
  * Exercises HTMLParser's static strip helpers (StripTags, StripLinks,
  * StripScripts) against one small HTML fixture.
  */
 function testData_Text_HTMLParser()
 {
     // Fixture: a link, a custom <attr> element and a paragraph wrapping a linked image; no <script>.
     $html = "<a href='#'>Go Daddy</a><attr name='attr'>ATTR</attr>\r\n\t\t\t\t\t\t<p align=\"center\"><a href=\"http://www.myspace.com/declareyourself\" target=\"_blank\">\r\n\t\t\t\t\t\t<img src=\"http://creative.myspace.com/groups/_jc/declareyourself/dy_badge.jpg\" border=\"0\" />\r\n\t\t\t\t\t\t</a></p>";

     // With no tag argument the result is expected to equal PHP's strip_tags()
     $allStripped = HTMLParser::StripTags($html);
     $this->assertEqual($allStripped, strip_tags($html), "Error while stripping all tags");

     // Stripping one named tag must leave no trace of it
     $withoutAttr = HTMLParser::StripTags($html, 'attr');
     $this->assertFalse(stristr($withoutAttr, '<attr'), "Error while stripping [attr] tag");

     $withoutAnchors = HTMLParser::StripTags($html, 'a');
     $this->assertFalse(stristr($withoutAnchors, 'href'), "Error while stripping [a] tag");

     $withoutLinks = HTMLParser::StripLinks($html);
     $this->assertFalse(stristr($withoutLinks, 'href'), "Error while stripping links");

     // The fixture has no scripts, so it must come back unchanged
     $withoutScripts = HTMLParser::StripScripts($html);
     $this->assertEqual($withoutScripts, $html, "Error while stripping scripts");

     $withoutImages = HTMLParser::StripTags($html, 'img');
     $this->assertFalse(stristr($withoutImages, 'img'), "Error while stripping [img] tag");
 }

Exemplo n.º 8

0

Exibir arquivo

Arquivo: class.MySpaceProfile.php Projeto: rchicoria/epp-drs

		/**
		 * Returns the content of the `Who I'd like to meet:` block.
		 *
		 * The block is located with GetMatches(), then passed through
		 * HTMLParser::StripStyles() and HTMLParser::StripAds().
		 */
		function GetWhoILikeToMeetInfo()
		{
			// Tag names in the pattern carry a numeric prefix (e.g. <12td ...>); the
			// back-reference \1 pairs the closing td with the opening one.
			$blockPatterns = array(
				'/<([0-9]+)td[^>]+>[\n\r\s\t]*<[0-9]+span[^>]+>[\n\r\s\t]*Who I\\\'d like to meet\:[\s\t\n\r]*<[0-9]+\/span>(?:<[0-9]+br[^>]*>)?(.*?)<\\1\/td>/msi'
			);
			$rawBlock = $this->GetMatches($blockPatterns, true);

			return HTMLParser::StripAds(HTMLParser::StripStyles($rawBlock));
		}

Exemplo n.º 9

0

Exibir arquivo

Arquivo: POJ_Server.php Projeto: qhpeklh5959/sk_vjudge

 /**
  * Logs in to POJ with this object's credentials, downloads the source page
  * of one submission and returns what HTMLParser::innerHTML() extracts
  * between the markers '//&lt;ID&gt;' and '&lt;/ID&gt;'.
  *
  * The login response body is stored in $this->data. A temporary cookie file
  * carries the login session to the second request and is deleted afterwards.
  *
  * @param mixed $RunID solution id placed in the showsource URL
  * @return mixed result of HTMLParser::innerHTML()
  */
 public function getIdFromSource($RunID)
 {
     //Infomation
     $cookie_file = tempnam("./cookie", "cookie");
     $login_url = "http://poj.org/login";
     // URL-encode the credentials: a raw '&', '=', '+' or '%' in the user name or
     // password would otherwise be read as form syntax and corrupt the login request.
     $post_fields = "user_id1=" . urlencode($this->user) . "&password1=" . urlencode($this->pass) . "&url=/";
     //Login
     $login = curl_init($login_url);
     curl_setopt($login, CURLOPT_HEADER, 0);
     curl_setopt($login, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($login, CURLOPT_COOKIEJAR, $cookie_file);
     curl_setopt($login, CURLOPT_POST, 1);
     curl_setopt($login, CURLOPT_POSTFIELDS, $post_fields);
     $this->data = curl_exec($login);
     // The cookie jar is only written when the handle is released, so release it
     // before the next request reads the file.
     unset($login);
     //Get Source
     $fetch = curl_init("http://poj.org/showsource?solution_id=" . urlencode($RunID));
     curl_setopt($fetch, CURLOPT_HEADER, 0);
     curl_setopt($fetch, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($fetch, CURLOPT_COOKIEFILE, $cookie_file);
     $src = curl_exec($fetch);
     unset($fetch);
     // Best-effort cleanup of the temporary cookie file
     @unlink($cookie_file);
     $th = new HTMLParser();
     $th->loadHTML($src);
     return $th->innerHTML('//&lt;ID&gt;', '&lt;/ID&gt;');
 }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: class.Facebook.php Projeto: rchicoria/epp-drs

		/**
		 * Login to account
		 *
		 * Posts the credentials to the login URL and looks for an auth token in
		 * the response. When none is found, the 'grant_perm' form of the response
		 * is submitted (without its 'cancel' element) and the token is looked for
		 * again. On success the session info is loaded through the API client.
		 *
		 * @param string $email
		 * @param string $password
		 * @return bool|null true when logged in; false when the permission form
		 *                   request returned nothing; null in every other failure case
		 */
		function Login($email, $password)
		{
			$this->Username = $email;
			$this->Password = $password;
			$this->LoggedIn = false;
			
			if (!class_exists('FacebookAPI'))
				return;
			
			$this->Client = new FacebookAPI(self::REST_SERVER_ADDR, $this->API_Key, $this->Secret);

			$params = array(
				'email' => trim($this->Username),
				'pass' => trim($this->Password),
				'md5pass' => '',
				'challenge' => md5(time()),
				'noerror' => 1,
				'api_key' => $this->API_Key,
				'next' => '',
				'v' => '1.0'
			);
			
			$this->UseRedirects();
			$this->UseCookies();
			$this->ReceiveHeaders();
			$this->Fetch(self::LOGIN_URL, $params, true);

			// $auth stays empty when the pattern does not match, so guard the index
			// instead of reading an undefined offset.
			$auth = array();
			preg_match("/auth\_token=([a-z0-9]+)/msi", $this->Result, $auth);
			$this->AuthToken = isset($auth[1]) ? $auth[1] : null;

			if (!$this->AuthToken)
			{
				$form = HTMLParser::GetFormDetails($this->Result, null, 'grant_perm');
				if (!$form) return;
				// Do not submit the cancel button; the element may be absent
				if (!empty($form['elements']['cancel'])) unset($form['elements']['cancel']);

				$this->Fetch($form['action'], $form['elements'], true);
				
				if (!$this->Result)
					return false;
	
				$auth = array();
				preg_match("/auth\_token=([a-z0-9]+)/msi", $this->Result, $auth);
				$this->AuthToken = isset($auth[1]) ? $auth[1] : null;
			}

			if ($this->AuthToken)
			{
				$this->LoggedIn = true;
				$this->SessionInfo = $this->Client->GetSession($this->AuthToken);
				return true;
			}
		}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: index.php Projeto: kyroskoh/Teamspeak3_LatestVersion

// Requested architecture from the query string; a missing parameter is treated
// like an unsupported value instead of reading an undefined index.
$requestedBitVersion = isset($_GET['bit']) ? $_GET['bit'] : null;
if (!in_array($requestedBitVersion, array("amd64", "x86"), true)) {
    die("<strong>Error</strong> Supported bit versions are: amd64, x86");
}
//Base URL for the repo
$baseURL = "http://dl.4players.de/ts/releases/";
/**
 * Builds a URL so that WGET can be used against it.
 *
 * The result is $baseURL, then the "version" entry, a slash, and the "file"
 * entry of $fileDetails.
 *
 * @param string $baseURL     base URL; used as-is, so it should end with "/"
 * @param array  $fileDetails array with the keys "file" and "version"
 * @return string
 */
function buildDownloadURLForWGET($baseURL, $fileDetails)
{
    return $baseURL . $fileDetails["version"] . "/" . $fileDetails["file"];
}
$TeamspeakHandler = new Teamspeak();
$TeamspeakHandler->setBinaryBitRequired($requestedBitVersion);
$HTMLParser = new HTMLParser();
$HTMLParser->setURL($baseURL);
$HTMLParser->getHTML();
if ($HTMLParser->Status == null) {
    $teamspeakVersions = $TeamspeakHandler->getTeamspeakVersionsFromHTML($HTMLParser->Contents);
    if ($teamspeakVersions == null) {
        die("Unknown error, unable to get versions");
    }
    //Switch on the requested format (either x86 or amd64)
    switch ($requestedBitVersion) {
        case "amd64":
            $amd64Binary = null;
            //Rotate around each version found in reverse and find a server binary.
            foreach (array_reverse($teamspeakVersions) as $teamspeakVersion) {
                $serverBinaryAMD64 = $TeamspeakHandler->doesVersionContainServerBinary($baseURL, $teamspeakVersion);
                if ($serverBinaryAMD64 != null && $amd64Binary == null) {

Exemplo n.º 12

0

Exibir arquivo

Arquivo: POJ_Problem.php Projeto: huguangAOA/sk_vjudge

 /**
  * Loads the POJ problem page for $id and scrapes its fields into
  * $this->pro_info.
  *
  * NOTE(review): presumably the PHP4-style constructor of a class with the
  * same name; methods named after their class are not treated as
  * constructors by PHP 8 — confirm how this is invoked.
  *
  * Keys written: title, time, memory, submissions, accepted, description,
  * input, output, sample_input, sample_output, hint, source.
  *
  * @param int $id problem id appended to the problem URL (defaults to 1000)
  */
 public function POJ_Problem($id = 1000)
 {
     require_once dirname(__FILE__) . "/HTMLParser.php";
     $html = new HTMLParser("http://poj.org/problem?id=" . $id);
     $html->optHTMLLink();
     // innerHTML(start, end) presumably returns the text between the two markers — confirm against HTMLParser.
     // NOTE(review): the calls below may depend on their order if the parser keeps a read position; do not reorder.
     $pro_info = array('title' => $html->innerHTML('<div class="ptt" lang="en-US">', '</div>'));
     // Title marker followed by the title text; passed as third argument to the time-limit lookup only.
     $prefix = $html->startString('<div class="ptt" lang="en-US">' . $pro_info['title']);
     // The numeric fields are reduced to integers with intval()
     $pro_info['time'] = intval($html->innerHTML('<div class="plm"><table align="center"><tr><td><b>Time Limit:</b> ', 'MS</td>', $prefix));
     $pro_info['memory'] = intval($html->innerHTML('MS</td><td width="10px"></td><td><b>Memory Limit:</b> ', 'K</td>'));
     $pro_info['submissions'] = intval($html->innerHTML('Total Submissions:</b> ', '</td>'));
     $pro_info['accepted'] = intval($html->innerHTML('</td><td><b>Accepted:</b> ', '</td>'));
     // The text sections are stored as returned
     $pro_info['description'] = $html->innerHTML('<p class="pst">Description</p><div class="ptx" lang="en-US">', '</div>');
     $pro_info['input'] = $html->innerHTML('<p class="pst">Input</p><div class="ptx" lang="en-US">', '</div>');
     $pro_info['output'] = $html->innerHTML('<p class="pst">Output</p><div class="ptx" lang="en-US">', '</div>');
     $pro_info['sample_input'] = $html->innerHTML('<p class="pst">Sample Input</p><pre class="sio">', '</pre>');
     $pro_info['sample_output'] = $html->innerHTML('<p class="pst">Sample Output</p><pre class="sio">', '</pre>');
     // The hint's end marker includes the following "Source" heading
     $pro_info['hint'] = $html->innerHTML('<p class="pst">Hint</p><div class="ptx" lang="en-US">', '</div><p class="pst">Source</p>');
     $pro_info['source'] = $html->innerHTML('<p class="pst">Source</p><div class="ptx" lang="en-US">', '</div>');
     $this->pro_info = $pro_info;
 }

Exemplo n.º 13

0

Exibir arquivo

Arquivo: HDOJ_Problem.php Projeto: huguangAOA/sk_vjudge

 /**
  * Loads the HDOJ problem page for $id, scrapes its fields and stores them,
  * converted from GB2312 to UTF-8, in $this->pro_info.
  *
  * NOTE(review): presumably the PHP4-style constructor of a class with the
  * same name; methods named after their class are not treated as
  * constructors by PHP 8 — confirm how this is invoked.
  *
  * Keys written: title, time, memory, submissions, accepted, description,
  * input, output, sample_input, sample_output, hint (always "N/A"), source.
  *
  * @param int $id problem id appended to the problem URL (defaults to 1000)
  */
 public function HDOJ_Problem($id = 1000)
 {
     require_once dirname(__FILE__) . "/HTMLParser.php";
     $html = new HTMLParser("http://acm.hdu.edu.cn/showproblem.php?pid=" . $id);
     $html->optHTMLLink();
     //Just a hack for OS X
     // With the offset at 0 every substr() below returns its input unchanged.
     $sub_start = 0;
     // innerHTML(start, end) presumably returns the text between the two markers — confirm against HTMLParser.
     // NOTE(review): the calls below may depend on their order if the parser keeps a read position; do not reorder.
     $pro_info = array('title' => substr($html->innerHTML("<h1 style='color:#1A5CC8'>", '</h1>'), $sub_start));
     $pro_info['time'] = substr($html->innerHTML('<span style=\'font-family:Arial;font-size:12px;font-weight:bold;color:green\'>Time Limit: ', ' MS (Java/Others)'), $sub_start);
     $pro_info['memory'] = substr($html->innerHTML('MS (Java/Others)&nbsp;&nbsp;&nbsp;&nbsp;Memory Limit: ', ' K (Java/Others)<br>'), $sub_start);
     $pro_info['submissions'] = substr($html->innerHTML('Total Submission(s): ', '&nbsp;&nbsp;&nbsp;&nbsp;Accepted'), $sub_start);
     $pro_info['accepted'] = substr($html->innerHTML('Accepted Submission(s): ', '<br></span></b></font>'), $sub_start);
     $pro_info['description'] = substr($html->innerHTML('<div class=panel_title align=left>Problem Description</div> <div class=panel_content>', '</div><div class=panel_bottom>'), $sub_start);
     $pro_info['input'] = substr($html->innerHTML('<div class=panel_title align=left>Input</div> <div class=panel_content>', '</div><div class=panel_bottom>'), $sub_start);
     $pro_info['output'] = substr($html->innerHTML('<div class=panel_title align=left>Output</div> <div class=panel_content>', '</div><div class=panel_bottom>'), $sub_start);
     $pro_info['sample_input'] = substr($html->innerHTML('<div class=panel_title align=left>Sample Input</div><div class=panel_content><pre>', '</pre>'), $sub_start);
     $pro_info['sample_output'] = substr($html->innerHTML('<div class=panel_title align=left>Sample Output</div><div class=panel_content><pre>', '</pre>'), $sub_start);
     // No hint is scraped for this site; a fixed placeholder is stored
     $pro_info['hint'] = "N/A";
     // The "source" field is taken from the page's "Author" panel
     $pro_info['source'] = substr($html->innerHTML('<div class=panel_title align=left>Author</div> <div class=panel_content>', '</div><div class=panel_bottom>'), $sub_start);
     // Re-encode every field, including the "N/A" placeholder.
     // NOTE(review): iconv() returns false when a value cannot be converted; that false is stored as-is.
     foreach ($pro_info as $k => $v) {
         $pro_info[$k] = iconv('GB2312', 'UTF-8', $v);
     }
     $this->pro_info = $pro_info;
 }

Exemplo n.º 14

0

Exibir arquivo

Arquivo: class.BloggerProfile.php Projeto: rchicoria/epp-drs

		/**
		 * Parse personal details from content
		 *
		 * Reads $this->Result and collects: 'age', 'gender', 'industry',
		 * 'occupation' (raw captured text), 'city' / 'state' / 'country' (from the
		 * colon-separated location, links stripped), 'aboutme' (raw) and
		 * 'interests' (tags stripped).
		 *
		 * @return array|null profile details; null when there is no content to parse
		 */
		function GetPersonalDetails()
		{
			if (!$this->Result) return;
			
			$details = array();
			
			$pattern = '/<li>[\s\t\r\n]*<strong>[\s\t\r\n]*(age|gender|industry|occupation|location)\:[\s\t\r\n]*<\/strong>[\s\t\r\n]*(.*?)[\s\t\r\n]*<\/li>/msi';
			
			preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
			
			foreach($matches as $match)
			{
				// The pattern matches labels in any letter case, so normalise the
				// captured label; comparing it to 'Age', 'Gender', ... verbatim
				// dropped every label that was not capitalised exactly like that.
				$label = strtolower($match[1]);
				
				if ($label == 'location')
				{
					$locations = explode(":", $match[2]);
					$locations = array_map('trim', $locations);
					// A location may have fewer than three parts; pad so that the
					// missing ones become empty strings instead of undefined offsets.
					$locations = array_pad($locations, 3, '');
					
					$details['city'] = HTMLParser::StripLinks($locations[0]);
					$details['state'] = HTMLParser::StripLinks($locations[1]);
					$details['country'] = HTMLParser::StripLinks($locations[2]);
				}
				else
				{
					// The pattern only admits age, gender, industry and occupation
					// here, and each is stored under its own lower-case name.
					$details[$label] = $match[2];
				}
			}
			
			
			$pattern = '/<h2>(About\sMe|Interests)<\/h2>[\s\t\r\n]*<([a-z]+)\b[^>]*>(.*?)<\/\\2>/msi';
			preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
			
			foreach($matches as $match)
			{
				// Same normalisation for headings; \s in the pattern also accepts a
				// tab or line break between "About" and "Me".
				$heading = strtolower(preg_replace('/\s+/', ' ', $match[1]));
				
				switch ($heading)
				{
					case 'about me':
						$details['aboutme'] = $match[3];
						break;
					
					// #todo - separators needed
					case 'interests':
						$details['interests'] = HTMLParser::StripTags($match[3]);
						break;
				}
			}

			return $details;
		}

Exemplo n.º 15

0

Exibir arquivo

Arquivo: class.LinkedInProfile.php Projeto: rchicoria/epp-drs

		/**
		 * Returns the education block of the profile.
		 *
		 * The block is located with GetMatches(), links are removed with
		 * HTMLParser::StripLinks() and every "Activities and Societies:"
		 * paragraph is deleted from the result.
		 */
		public function GetEducation()
		{
			// get education block
			$educationBlock = $this->GetMatches(
				array('/\<div[\s\t]+name\=\"education\"[\s\t]+id\=\"[a-z0-9]+\"[^\>]*\>(.*?)\<\/div\>[\s\t\n\r]+\<h2/msi')
			);
			$withoutLinks = HTMLParser::StripLinks($educationBlock);

			return preg_replace("/\<p[^\>]+\>[\s\t\n\r]*\<em\>[\s\t\n\r]*Activities and Societies\:.*?\<\/p\>/ms", "", $withoutLinks);
		}

Exemplo n.º 16

0

Exibir arquivo

Arquivo: class.LinkedIn.php Projeto: rchicoria/epp-drs

		/**
		 * Fetches the connections page and returns the connections found on it.
		 *
		 * @return array|false list of arrays with 'id', 'name' (links stripped)
		 *                     and 'email'; false when not logged in or when the
		 *                     page could not be fetched
		 */
		function GetConnectionsList() 
		{
			if (!$this->LoggedIn) return false;
			
			$this->Fetch(self::CONNECTIONS_URL);

			if (!$this->Result)
				return false;
			
			// The former "numConnections" lookup was removed: its result was never
			// used and it read an undefined offset whenever the counter was absent.
			
			// One match per connection row: [1] numeric id, [2] full name markup, [3] e-mail address
			preg_match_all("/\<tr [^\>]*name\=\"connection\".*?\_connection([0-9]+).*?name\=\"fullName\"[^\>]*\>(.*?)\<\/strong\>.*?\"email\"[^\>]*\>([^\@]+\@[^\@]+\.[a-z]{2,6})\<.*?\<\/tr\>/msi", $this->Result, $matches, PREG_SET_ORDER);
			
			$connections = array();
			
			foreach($matches as $match)
			{
				$connections[] = array(
					'id'	=> $match[1],
					'name'	=> HTMLParser::StripLinks($match[2]),
					'email'	=> $match[3]
				);
			}
			
			return $connections;
		}

Exemplo n.º 17

0

Exibir arquivo

Arquivo: HTMLScraping.class.php Projeto: diggin-sandbox/mirror-htmlscraping-20090114

 /**
  * Return array contains formated XHTML string
  * created from the responded HTML of the given URL.
  * array[code] => HTTP status code
  * array[headers] => HTTP headers
  * array[headers] => formated XHTML string made from the entity body
  * Throw exception if error.
  *
  * @param  string  $url
  * @param  integer $cache_lifetime
  * @param  boolean $conditional_request
  * @param  array   $headers
  * @param  array   $post
  * @return array
  */
 public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
 {
     /*
      * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
      */
     if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
         throw new Exception("Not a valid or fully qualified HTTP URL.");
     }
     $data = false;
     $cache_lifetime = (int) $cache_lifetime;
     $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
     if ($use_cache) {
         $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
         $params = array();
         foreach ($headers as $key => $value) {
             if (!empty($value)) {
                 $params[] = urlencode($key) . '=' . urlencode($value);
             }
         }
         foreach ($post as $key => $value) {
             $params[] = urlencode($key) . '=' . urlencode($value);
         }
         $cache_id = "{$url}?" . implode('&', $params);
         if (false !== ($data = $cache->get($cache_id))) {
             $data = unserialize($data);
         }
     }
     /*
      * Access to the URL if not cached
      * or if the cache has either Last-Modified or Etag header
      * and conditional request is specified.
      */
     if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
         $conditional_request = false;
     }
     if (!$data or $conditional_request) {
         if (isset($data['headers']['last-modified']) and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
             $headers['last-modified'] = $data['headers']['last-modified'];
         }
         if (isset($data['headers']['etag']) and (!isset($headers['etag']) or empty($headers['etag']))) {
             $headers['etag'] = $data['headers']['etag'];
         }
         try {
             $response = $this->getHttpResponse($url, $headers, $post);
         } catch (Exception $e) {
             if (!$data) {
                 throw $e;
             }
         }
         /*
          * Use cache if the responded HTTP status code is 304.
          * If 200, format the responded HTML of the given URL to XHTML.
          */
         if (!$data or isset($response['code']) and $response['code'] != 304) {
             $data =& $response;
             /*
              * If status code was 200 and Content-Type was not (X)HTML,
              * the status code was forcibly altered to 204.
              * @see HTTP_Request_Listener_Extended->update().
              */
             if ($data['code'] != 200 and $data['code'] != 204) {
                 throw new Exception("Responded HTTP Status Code is {$data['code']}.");
             } elseif (isset($data['headers']['content-type']) and !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                 throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
             } elseif (empty($data['body'])) {
                 throw new Exception("Responded entity body is empty.");
             } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                 throw new Exception("Responded entity body does not contain a markup symbol.");
             } elseif (false !== strpos($matches[0], "")) {
                 throw new Exception("Responded entity body contains NULL.");
             }
             /*
              * Remove BOM and NULLs.
              */
             $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
             $data['body'] = str_replace("", '', $data['body']);
             /*
              * Initialize the backups.
              */
             $this->backup = array();
             $this->backup_count = 0;
             /*
              * Removing SCRIPT and STYLE is recommended.
              * The following substitute code will capsulate the content of the tags in CDATA.
              * If use it, be sure that some JavaScript method such as document.write
              * is not compliant with XHTML/XML.
              */
             $tags = array('script', 'style');
             foreach ($tags as $tag) {
                 $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
                 /*
                 $data['body'] = preg_replace_callback(
                     "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
                     create_function('$matches', '
                         $content = trim($matches[2]);
                         if (empty($content)
                             or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
                             return $matches[0];
                         } else {
                             $content = preg_replace("/^<!-+/", "", $content);
                             $content = preg_replace("/-+>$/", "", $content);
                             $content = preg_replace("/\s*\/\/$/s", "", trim($content));
                             return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
                         }
                     '),
                     $data['body']
                 );
                 */
             }
             /*
              * Backup CDATA sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);
             /*
              * Comment section must not contain two or more adjacent hyphens.
              */
             $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', create_function('$matches', '
                     return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
                 '), $data['body']);
             /*
              * Backup comment sections for later process.
              */
             $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);
             /*
              * Process tags that is potentially dangerous for XML parsers.
              */
             $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', create_function('$matches', '
                     return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
                 '), $data['body']);
             $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', create_function('$matches', '
                     return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
                 '), $data['body']);
             $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', create_function('$matches', '
                     return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
                 '), $data['body']);
             /*
              * Remove DTD declarations, wrongly placed comments etc.
              * This must be done before removing DOCTYPE.
              */
             $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
             /*
              * XML and DOCTYPE declaration will be replaced.
              */
             $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
             $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);
             if (preg_match('/^\\s*$/s', $data['body'])) {
                 throw new Exception('The entity body became empty after preprocessing.');
             }
             /*
              * Detect character encoding and convert to UTF-8.
              */
             $encoding = false;
             if (isset($data['headers']['content-type'])) {
                 $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
             }
             if (!$encoding and preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                 foreach ($matches[0] as $value) {
                     if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' and false !== ($encoding = $this->getAttribute('content', $value))) {
                         $encoding = $this->getCharsetFromCType($encoding);
                         break;
                     }
                 }
             }
             /*
              * Use mbstring to convert character encoding if available.
              * Otherwise use iconv (iconv may try to detect character encoding automatically).
              * Do not trust the declared encoding and do conversion even if UTF-8.
              */
             if (extension_loaded('mbstring')) {
                 if (!$encoding) {
                     @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                     if (false === ($encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body'])))) {
                         throw new Exception('Failed detecting character encoding.');
                     }
                 }
                 @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
             } else {
                 if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                     throw new Exception('Failed converting character encoding.');
                 }
                 foreach ($this->backup as $key => $value) {
                     if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                         throw new Exception('Failed converting character encoding.');
                     }
                 }
             }
             /*
              * Restore CDATAs and comments.
              */
             for ($i = 0; $i < $this->backup_count; $i++) {
                 $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
             }
             /*
              * Use Tidy to format HTML if available.
              * Otherwise, use HTMLParser class (is slower and consumes much memory).
              */
             if (extension_loaded('tidy')) {
                 $tidy = new tidy();
                 $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                 $tidy->cleanRepair();
                 $data['body'] = $tidy->html();
             } else {
                 require_once 'HTMLParser.class.php';
                 $parser = new HTMLParser();
                 $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                 $parser->setRule($format_rule);
                 $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                 $parser->setGenericParent('body');
                 $parser->parse($data['body']);
                 $data['body'] = $parser->dump();
             }
             /*
              * Valid XHTML DOCTYPE declaration (with DTD URI) is required
              * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
              */
             $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
             $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
             $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
             $data['body'] = "{$declarations}{$data['body']}";
             if ($use_cache) {
                 $cache->save(serialize($data), $cache_id);
             }
         }
     }
     return $data;
 }

Exemplo n.º 18

0

Exibir arquivo

Arquivo: class.DiffTool.php Projeto: rchicoria/epp-drs

		/**
		 * Build a highlighted version of the new text from its diff against
		 * the old text.
		 *
		 * Both inputs are first passed through HTMLParser::StripTags(). The
		 * stripped strings are compared with $this->Diff(); when that yields a
		 * truthy patch, the patch is rewritten with preg_replace() using
		 * $this->Statements / $this->Replacements and applied to the stripped
		 * old string through $this->Patch(). Otherwise the stripped new string
		 * is returned unchanged.
		 *
		 * @param string $string_old Old string
		 * @param string $string_new New string
		 * @return string New formatted string
		 * @uses HTMLParser HTML Parser method StripTags
		 * @access public
		 */
		public function GetHighlitedDiff($string_old, $string_new)
		{
			// Work on tag-free copies; the new string is stripped first, then the old one.
			$plain_new = HTMLParser::StripTags($string_new);
			$plain_old = HTMLParser::StripTags($string_old);

			$patch = $this->Diff($plain_old, $plain_new);

			// Nothing to highlight: hand back the stripped new text as is.
			if (!$patch)
			{
				return $plain_new;
			}

			// Decorate the patch, then replay it on top of the old text.
			$decorated = preg_replace($this->Statements, $this->Replacements, $patch);

			return $this->Patch($plain_old, $decorated);
		}

Exemplo n.º 19

0

Exibir arquivo

Arquivo: SimpleExcel.php Projeto: arlendotcn/ilias

 /**
  * Switch to a writer for another format and hand it the parser's data.
  *
  * Calls $this->constructWriter() with the requested type, then passes the
  * result of $this->parser->getField() to $this->writer->setData().
  *
  * @param    string  $filetype   Set the filetype of the file which will be written (XML/CSV/TSV/HTML/JSON)
  */
 public function convertTo($filetype)
 {
     // Presumably (re)creates $this->writer for the requested type; it is read right below.
     $this->constructWriter($filetype);

     $writer = $this->writer;
     $writer->setData($this->parser->getField());
 }

Exemplo n.º 20

0

Exibir arquivo

Arquivo: HDOJ_DataPoster.php Projeto: huguangAOA/sk_vjudge

 /**
  * Download the "view code" page of a run and extract its ID marker.
  *
  * Requests http://acm.hdu.edu.cn/viewcode.php?rid=<RunID>, sending the cookies
  * stored in $this->cookie_file, loads the response body into an HTMLParser and
  * returns whatever HTMLParser::innerHTML() yields for the literal markers
  * "//&lt;ID&gt;" and "&lt;/ID&gt;" (presumably the text between them).
  *
  * @param  mixed $RunID Run identifier; it is URL-encoded before being placed
  *                      in the query string.
  * @return mixed Result of HTMLParser::innerHTML() for the two markers.
  * @throws \RuntimeException If the cURL handle cannot be created or the
  *                           request fails.
  */
 public function getIdFromSource($RunID)
 {
     // Encode the id so unexpected characters cannot alter the query string.
     $url = "http://acm.hdu.edu.cn/viewcode.php?rid=" . rawurlencode((string) $RunID);

     $curl = curl_init($url);
     if ($curl === false) {
         throw new \RuntimeException('Unable to initialise cURL for ' . $url);
     }
     curl_setopt($curl, CURLOPT_HEADER, 0);
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookie_file);

     // With CURLOPT_RETURNTRANSFER set, curl_exec() returns the body or false on failure.
     $src = curl_exec($curl);
     if ($src === false) {
         throw new \RuntimeException('Failed to fetch ' . $url . ': ' . curl_error($curl));
     }

     $th = new HTMLParser();
     $th->loadHTML($src);
     return $th->innerHTML('//&lt;ID&gt;', '&lt;/ID&gt;');
 }

Exemplo n.º 21

0

Exibir arquivo

Arquivo: PriceMatchConnector.php Projeto: larryu/magento-b2b

 /**
  * Collect the price-match rows for the current SKU that belong to known companies.
  *
  * Every row returned by HTMLParser::getPriceListForProduct() is reduced to a
  * lower-cased company URL without scheme, a display name, a price string with
  * ',', '$' and spaces removed, and a link prefixed with the host URL. The row
  * is then paired with each PriceMatchCompany whose lower-cased alias equals
  * that company URL.
  *
  * @return array List of arrays with the keys 'PriceMatchCompany', 'price', 'name' and 'url'.
  */
 private function getPrices()
 {
     $matched = array();
     $rows = HTMLParser::getPriceListForProduct($this->base_url, $this->sku);
     foreach ($rows as $row) {
         $details = trim($row['companyDetails']);
         if ($details === '') {
             continue;
         }

         // The details are pipe-delimited: the URL is read from the second-to-last
         // field and the name from the third-to-last; when a field is missing the
         // whole details string is used instead.
         $parts = explode('|', $details);
         $partCount = count($parts);
         $urlIndex = $partCount - 2;
         $nameIndex = $partCount - 3;

         $rawUrl = isset($parts[$urlIndex]) ? $parts[$urlIndex] : $details;
         $companyURL = str_replace(array('https://', 'http://'), '', strtolower(trim($rawUrl)));

         $rawName = isset($parts[$nameIndex]) ? $parts[$nameIndex] : $details;
         $name = trim($rawName);

         $price = str_replace(array(',', '$', ' '), '', $row['price']);
         $url = HTMLParser::getHostUrl($this->base_url) . $row['priceLink'];

         foreach (PriceMatchCompany::getAll() as $company) {
             if ($companyURL !== strtolower($company->getCompanyAlias())) {
                 continue;
             }
             $matched[] = array('PriceMatchCompany' => $company, 'price' => $price, 'name' => $name, 'url' => $url);
             if ($this->debug === true) {
                 $line = $company->getCompanyName() . '(id=' . $company->getId() . "), \$" . $price . "\n";
                 echo $line;
             }
         }
     }
     return $matched;
 }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: class.LiveJournalProfile.php Projeto: rchicoria/epp-drs

		/**
		 * Parse personal details from content
		 *
		 * Scans $this->Result for "<b>Label:</b></td><td>value</td>" pairs and
		 * copies the recognised ones into the returned array. Labels are
		 * compared case-insensitively, in line with the case-insensitive
		 * pattern used to find them. A userpic URL is additionally searched
		 * for anywhere in $this->Result.
		 *
		 * @return array profile details
		 * sample
		 *   [userpic]   => http://www.livejournal.com/userpic/38353247/8981002
		 *   [name]   => Natalie
		 *   [website]   => http://www.myspace.com/aggressiva
		 *   [city]   => La Verne
		 *   [state]   => California
		 *   [country]   => United States
		 *   [birthday] => 1971-12-15
		 *   [aboutme]	=> party ... I just believe in parties!
		 *   [email]   => value of the "E-mail" row with tags stripped
		 */
		function GetPersonalDetails()
		{
			$details = array();

			$pattern = '/><[0-9]*b>(user|name|website|location|birthdate|gizmo\/lj talk|bio|e\-mail)\:<[0-9]*\/b><[0-9]*\/td><([0-9]*)td[^>]*>(.*?)<\\2\/td>/msi';

			// The pattern expects optional digits inside the tags and uses the \2
			// back-reference to pair a value <td> with the closing tag carrying the
			// same digits; AddTagDepth() presumably adds those digits. The content
			// is restored right after matching.
			$this->Result = HTMLParser::AddTagDepth($this->Result);
			preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
			$this->Result = HTMLParser::RemoveTagDepth($this->Result);

			// loc_* link parameter => key under which the value is stored.
			$locationKeys = array('ci' => 'city', 'st' => 'state', 'cn' => 'country');

			foreach($matches as $match)
			{
				$value = HTMLParser::RemoveTagDepth(trim($match[3]));

				// The pattern is matched with the "i" modifier, so the captured label
				// may use any capitalisation; normalise it before dispatching,
				// otherwise such rows would be matched but silently ignored.
				switch (strtolower($match[1]))
				{
					case 'name':
						$details['name'] = $value;
						break;

					case 'website':
						if (preg_match("/href=(\'|\")(.*?)\\1/", $value, $hrefMatch))
						{
							$details['website'] = $hrefMatch[2];
						}
						break;

					case 'location':
						preg_match_all("/loc\_(ci|st|cn)\=[^>\&]+>(.*?)</msi", $value, $locationMatches, PREG_SET_ORDER);

						foreach($locationMatches as $res)
						{
							// Only ci/st/cn can be captured, in any case, so the lookup always hits.
							$details[$locationKeys[strtolower($res[1])]] = $res[2];
						}
						break;

					case 'bio':
						$details['aboutme'] = $value;
						break;

					case 'birthdate':
						$details['birthday'] = $value;
						break;

					case 'e-mail':
						$details['email'] = HTMLParser::StripTags($value);
						break;
				}
			}

			if (preg_match("/http\:\/\/([a-z0-9\-]+\.)+[a-z0-9]{2,6}\/userpic\/[0-9]+\/[0-9]+/msi", $this->Result, $matches))
			{
				$details['userpic'] = $matches[0];
			}

			return $details;
		}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: HDOJ_Record.php Projeto: huguangAOA/sk_vjudge

 /**
  * Log in to acm.hdu.edu.cn, download the "view code" page of a run and
  * extract its ID marker.
  *
  * Side effects visible here: a temporary cookie file is created and its path
  * stored in $this->cookie_file, and the login response is stored in
  * $this->data.
  *
  * NOTE(review): neither curl_exec() result is checked for false, and $RunID
  * is concatenated into the URL without encoding.
  *
  * @param  mixed $RunID Run identifier appended to the viewcode.php "rid" query parameter.
  * @return mixed Result of HTMLParser::innerHTML() for the markers
  *               "//&lt;ID&gt;" and "&lt;/ID&gt;" (presumably the text between them).
  */
 public function getIdFromSource($RunID)
 {
     // Fresh cookie file (prefix "cookie") requested inside ./cookie; the path is kept for later requests.
     $cookie_file = tempnam("./cookie", "cookie");
     $this->cookie_file = $cookie_file;
     $login_url = "http://acm.hdu.edu.cn/userloginex.php?action=login";
     // NOTE(review): the ****** runs below look like credential expressions that were
     // masked in this listing; as shown the line is not valid PHP and the original
     // username/password expressions must be restored before this can run.
     $post_fields = "username="******"&userpass="******"&login=Sign In";
     // Step 1: POST the login form, saving the session cookies to the cookie jar.
     $curl = curl_init($login_url);
     curl_setopt($curl, CURLOPT_HEADER, 0);
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_file);
     curl_setopt($curl, CURLOPT_POST, 1);
     curl_setopt($curl, CURLOPT_POSTFIELDS, $post_fields);
     $this->data = curl_exec($curl);
     // Step 2: fetch the source page of the run, sending the cookies saved by the login.
     // NOTE(review): the login handle is never closed explicitly; the jar is presumably
     // flushed when $curl is reassigned here and the old handle is released - confirm.
     $curl = curl_init("http://acm.hdu.edu.cn/viewcode.php?rid=" . $RunID);
     curl_setopt($curl, CURLOPT_HEADER, 0);
     curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
     curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookie_file);
     $src = curl_exec($curl);
     // Step 3: pull the ID marker out of the downloaded page.
     $th = new HTMLParser();
     $th->loadHTML($src);
     return $th->innerHTML('//&lt;ID&gt;', '&lt;/ID&gt;');
 }

Exemplo n.º 24

0

Exibir arquivo

Arquivo: parser.php Projeto: krishnakt031990/Html-Parser

                }
            }
        }
    }
    /**
     * Return the page header held in $this->pageHeader.
     *
     * @return mixed Current value of $this->pageHeader; the script at the end
     *               of this file echoes it directly.
     */
    public function getHeader()
    {
        return $this->pageHeader;
    }
    /**
     * Return the value held in $this->htmlurl.
     *
     * @return mixed Current value of $this->htmlurl (presumably the URL the
     *               parser was constructed with - confirm against the constructor).
     */
    public function getHtmlUrl()
    {
        return $this->htmlurl;
    }
    /**
     * Return the collected images held in $this->imageHolder.
     *
     * @return mixed Current value of $this->imageHolder; the script at the end
     *               of this file reads index 0 of it, so it is presumably a list.
     */
    public function getImageHolder()
    {
        return $this->imageHolder;
    }
    /**
     * Return the collected body fragments held in $this->bodyHolder.
     *
     * @return mixed Current value of $this->bodyHolder; the script at the end
     *               of this file iterates it with foreach and echoes each item.
     */
    public function getBodyHolder()
    {
        return $this->bodyHolder;
    }
}
// Entry point: build a parser for the value posted as "cnnurl" and print the
// extracted header, the first image and every body fragment.
// NOTE(review): "cnnurl" comes straight from the request. If HTMLParser fetches
// it as a URL (constructor not visible here), restrict it to the expected hosts
// to avoid server-side request forgery.
if (!isset($_POST["cnnurl"]) || !is_string($_POST["cnnurl"]) || $_POST["cnnurl"] === '') {
    // Without the parameter there is nothing to parse; fail explicitly instead of
    // passing an undefined value to the parser.
    http_response_code(400);
    exit('Missing required POST parameter "cnnurl".');
}
$htmlParser = new HTMLParser($_POST["cnnurl"]);
echo $htmlParser->getHeader();
// Only the first image tends to be relevant for most articles, so just that
// one is printed instead of looping over all of them.
$imagePlaceHolder = $htmlParser->getImageHolder();
if (isset($imagePlaceHolder[0])) {
    echo $imagePlaceHolder[0];
}
$bodyHolder = $htmlParser->getBodyHolder();
// Skip the loop when no iterable body was collected.
if (is_array($bodyHolder) || $bodyHolder instanceof \Traversable) {
    foreach ($bodyHolder as $content) {
        echo $content;
    }
}

Exemplos de HTMLParser em PHP