/**
 * Runs the HTMLParser strip helpers (StripTags, StripLinks, StripScripts)
 * against a small HTML fixture containing links, a custom <attr> tag,
 * a paragraph and an image.
 */
function testData_Text_HTMLParser()
{
    $html = "<a href='#'>Go Daddy</a><attr name='attr'>ATTR</attr>\r\n\t\t\t\t\t\t<p align=\"center\"><a href=\"http://www.myspace.com/declareyourself\" target=\"_blank\">\r\n\t\t\t\t\t\t<img src=\"http://creative.myspace.com/groups/_jc/declareyourself/dy_badge.jpg\" border=\"0\" />\r\n\t\t\t\t\t\t</a></p>";

    // With no tag argument the result is expected to equal PHP's own strip_tags().
    $withoutAnyTags = HTMLParser::StripTags($html);
    $this->assertEqual($withoutAnyTags, strip_tags($html), "Error while stripping all tags");

    // Stripping a single custom tag must leave no opening <attr behind.
    $withoutAttrTag = HTMLParser::StripTags($html, 'attr');
    $this->assertFalse(stristr($withoutAttrTag, '<attr'), "Error while stripping [attr] tag");

    // Anchors removed through the generic tag stripper ...
    $withoutAnchors = HTMLParser::StripTags($html, 'a');
    $this->assertFalse(stristr($withoutAnchors, 'href'), "Error while stripping [a] tag");

    // ... and through the dedicated link stripper.
    $withoutLinks = HTMLParser::StripLinks($html);
    $this->assertFalse(stristr($withoutLinks, 'href'), "Error while stripping links");

    // The fixture contains no <script> element, so it must come back unchanged.
    $withoutScripts = HTMLParser::StripScripts($html);
    $this->assertEqual($withoutScripts, $html, "Error while stripping scripts");

    $withoutImages = HTMLParser::StripTags($html, 'img');
    $this->assertFalse(stristr($withoutImages, 'img'), "Error while stripping [img] tag");
}
/**
 * Extract the education section from the parsed page.
 *
 * The section is located with GetMatches(), links are removed with
 * HTMLParser::StripLinks(), and every "Activities and Societies"
 * paragraph is deleted from the remaining markup.
 *
 * @return mixed Whatever preg_replace() yields for the cleaned block.
 *               NOTE(review): this was previously documented as an "array of
 *               educations", but the code handles the block as markup text -
 *               confirm against GetMatches().
 */
public function GetEducation()
{
    // Education <div>, captured up to the closing </div> that precedes the next <h2
    $blockPatterns = array(
        '/\<div[\s\t]+name\=\"education\"[\s\t]+id\=\"[a-z0-9]+\"[^\>]*\>(.*?)\<\/div\>[\s\t\n\r]+\<h2/msi'
    );
    // Paragraph that opens with <em>Activities and Societies:
    $activitiesPattern = "/\<p[^\>]+\>[\s\t\n\r]*\<em\>[\s\t\n\r]*Activities and Societies\:.*?\<\/p\>/ms";

    $withoutLinks = HTMLParser::StripLinks($this->GetMatches($blockPatterns));

    return preg_replace($activitiesPattern, "", $withoutLinks);
}
/**
 * Parse personal details from content
 *
 * Three regions of the page are scanned in turn:
 *  - the top block next to the "nametext" element (headline, location,
 *    profile views, last login, online flag and - in one of the two
 *    recognised layouts - sex and age);
 *  - the "...'s Details" table, one key/value pair per table row;
 *  - the "...'s Interests" table, one key/value pair per row.
 *
 * Side effects: stores the raw blocks in PersonalInfoBlock, DetailsBlock and
 * InterestsBlock, and writes Headline, City, State, Country, ProfileViews,
 * LastLogin, Online, Sex and Age on the object as they are found.
 *
 * @return array Profile details
 * Example of output array
 * <code>
 * [Headline] => "feel easy to live"
 * [City] => www.fakaofo.tk - Fakaofo
 * [State] => State Info
 * [Country] => Tokelau
 * [Last Login] => 11/27/2006
 * [Age] => 25
 * [Sex] => Female
 * [Profile Views] =>
 * [Online] => 1
 * [Status] => Single
 * [Here for] => Networking, Dating, Friends
 * [Orientation] => Not Sure
 * [Hometown] => <a href="http://www.fakaofo.tk/">Fakaofo</a>
 * [Body type] => 5' 2" / Athletic
 * [Ethnicity] => Pacific Islander
 * [Religion] => Other
 * [Zodiac Sign] => <a href="http://collect.myspace.com/index.cfm?
 *      fuseaction=horoscope&sign=11&MyToken=5e6d132b-f9da-4df1-880d-
 *      06d4a19d0fd2">Aquarius</a>
 * [Smoke / Drink] => Yes / Yes
 * [Children] => Love kids, but not for me
 * [Education] => High school
 * [Occupation] => Stranger
 * [General] => 1. sex 2. drugs 3. rock & roll
 * [Music] => rap, classic music
 * [Movies] => matrix
 * [Television] => mtv
 * [Books] => udar russkix bogov
 * [Heroes] => no heros
 * </code>
 *
 * @access public
 */
function GetPersonalDetails()
{
    $details = array();

    $patterns = array(
        '/class\s*\=\s*(?:\'|\")nametext.*?\<table[^\>]+\>(.*?)\<\/table\>/msi'
    );

    //
    // Match personal info from top block
    //
    $this->PersonalInfoBlock = $this->GetMatches($patterns);

    if (preg_match('/\<td[^\>]+width\s*\=\s*\"193\"[^\>]*\>(.*?)\<\/td\>/msi', $this->PersonalInfoBlock, $matches)) {
        $entries = preg_split('/\<br[^\>]*\>/ims', $matches[1]);

        //
        // parse entries
        //
        if ($entries) {
            // The code below reads fixed positions up to index 10; pad with
            // empty strings so a shorter block yields empty fields instead of
            // undefined-offset warnings.
            $entries = array_pad(array_map('trim', $entries), 11, '');

            if (!in_array($entries[2], array('Male', 'Female'), true)) {
                // Layout without a sex line at position 2. Sex and Age are
                // not assigned here, so the returned values are whatever the
                // object already holds.
                $this->Headline = $entries[1];

                // "City, State" - the state part may be missing
                $location = array_pad(explode(",", $entries[3], 2), 2, '');
                $this->City = trim($location[0]);
                $this->State = trim($location[1]);

                $this->Country = $entries[4];
                $this->ProfileViews = preg_replace('/[^0-9]+/msi', '', $entries[6]);
                $this->LastLogin = preg_replace('/[^0-9\/]+/msi', '', $entries[10]);

                if (stristr($entries[7], 'OnlineNow') || stristr($entries[8], 'OnlineNow')) {
                    $this->Online = true;
                }
            } else {
                // Layout where position 2 holds "Male" / "Female"
                $this->Headline = $entries[0];
                $this->ProfileViews = preg_replace('/[^0-9]+/msi', '', $entries[1]);
                $this->Sex = $entries[2];
                $this->Age = preg_replace('/[^0-9]+/msi', '', $entries[3]);

                // "City, State" - the state part may be missing
                $location = array_pad(explode(",", $entries[4], 2), 2, '');
                $this->City = trim($location[0]);
                $this->State = trim($location[1]);

                $this->Country = $entries[5];
                $this->LastLogin = preg_replace('/[^0-9\/]+/msi', '', $entries[8]);

                if (stristr($entries[7], 'OnlineNow')) {
                    $this->Online = true;
                }
            }

            $details = array(
                'Headline' => $this->Headline,
                'City' => $this->City,
                'State' => $this->State,
                'Country' => $this->Country,
                'Last Login' => $this->LastLogin,
                'Age' => $this->Age,
                'Sex' => $this->Sex,
                'Profile Views' => $this->ProfileViews,
                'Online' => $this->Online
            );
        }
    }

    $patterns = array(
        '/\'s Details.*?\<table[^\>]+\>(.*?)\<\/table/msi'
    );

    //
    // Match personal info from middle block - Name's Details
    //
    $this->DetailsBlock = $this->GetMatches($patterns);

    if ($this->DetailsBlock) {
        // one entry per table row
        $entries = preg_split('/\<\/tr[^\>]*\>/ims', $this->DetailsBlock);

        //
        // parse entries
        //
        if ($entries) {
            $entries = array_map('trim', $entries);

            foreach ($entries as $entry) {
                if (!$entry) continue;

                // <span>Label:</span> ... <td>value</td>
                if (preg_match('/\<span[^\>]+\>([^\<]+)\<\/span.*?\<td[^\>]+\>(.*?)\<\/td/msi', $entry, $match)) {
                    $key = str_replace(':', '', trim($match[1]));
                    $details[$key] = HTMLParser::StripLinks(trim($match[2]));
                }
            }
        } // end if entries
    }

    $patterns = array(
        '/\'s Interests.*?\<([0-9]+)table[^\>]+\>(.*?)\<\\1\/table/msi'
    );

    //
    // Match Interests block information
    //
    // The pattern expects tag names prefixed with a number ("<3table").
    // NOTE(review): presumably the second GetMatches() argument enables that
    // depth tagging - confirm against GetMatches().
    $this->InterestsBlock = $this->GetMatches($patterns, true);

    if ($this->InterestsBlock) {
        $entries = preg_split('/<tr[\s\t]+id\=[\'\"a-z]+Row[^>]*>/ims', $this->InterestsBlock);

        //
        // parse entries
        //
        if ($entries) {
            $entries = array_map('trim', $entries);

            // Iterate by value: the entries are only read, and a by-reference
            // loop variable would stay bound to the last element afterwards.
            foreach ($entries as $entry) {
                if (!$entry) continue;

                // The back-reference \2 pairs the opening <Ntd> with the
                // closing tag carrying the same number.
                if (preg_match('/<[0-9]+span[^>]+>([^<]+)<[0-9]+\/span.*?<([0-9]+)td[^>]+>(.*?)<\\2\/td/msi', HTMLParser::AddTagDepth($entry), $match)) {
                    $match[3] = HTMLParser::RemoveTagDepth(trim($match[3]));

                    $key = HTMLParser::RemoveTagDepth(trim($match[1]));
                    $key = str_replace(':', '', $key);

                    // remove link to all groups
                    if ($key === 'Groups') {
                        $match[3] = preg_replace('/\<br[^\>]*\>\<br[^\>]*\>.*$/msi', '', $match[3]);
                    }

                    $details[$key] = HTMLParser::StripBlankLinks(trim($match[3]));
                }
            }
        } // end if entries
    }

    return $details;
}
/**
 * Fetch the connections page and extract every listed connection.
 *
 * Each table row named "connection" contributes one entry built from the
 * numeric suffix of its "_connection<N>" marker, the content of its
 * "fullName" element (with links stripped) and its e-mail address.
 *
 * @return array|false List of arrays with the keys 'id', 'name' and 'email'
 *                     (empty when no row matches); false when the object is
 *                     not logged in or the fetch produced no result.
 */
function GetConnectionsList()
{
    if (!$this->LoggedIn) return false;

    $this->Fetch(self::CONNECTIONS_URL);
    if (!$this->Result) return false;

    $rowPattern = "/\<tr [^\>]*name\=\"connection\".*?\_connection([0-9]+).*?name\=\"fullName\"[^\>]*\>(.*?)\<\/strong\>.*?\"email\"[^\>]*\>([^\@]+\@[^\@]+\.[a-z]{2,6})\<.*?\<\/tr\>/msi";

    // preg_match_all() yields 0 for "no rows" and false on a PCRE error;
    // both cases mean there is nothing to iterate over.
    if (!preg_match_all($rowPattern, $this->Result, $rows, PREG_SET_ORDER)) {
        return array();
    }

    $connections = array();
    foreach ($rows as $row) {
        $connections[] = array(
            'id' => $row[1],
            'name' => HTMLParser::StripLinks($row[2]),
            'email' => $row[3]
        );
    }

    return $connections;
}
/**
 * Parse personal details from content
 *
 * Reads "<li><strong>Label:</strong> value</li>" items for age, gender,
 * industry, occupation and location, then the "About Me" and "Interests"
 * sections that directly follow an <h2> heading.
 *
 * @return array|null Profile details with any of the keys 'age', 'gender',
 *                    'industry', 'occupation', 'city', 'state', 'country',
 *                    'aboutme' and 'interests'; null when there is no
 *                    fetched content ($this->Result is empty).
 */
function GetPersonalDetails()
{
    if (!$this->Result) return;

    $details = array();

    $pattern = '/<li>[\s\t\r\n]*<strong>[\s\t\r\n]*(age|gender|industry|occupation|location)\:[\s\t\r\n]*<\/strong>[\s\t\r\n]*(.*?)[\s\t\r\n]*<\/li>/msi';

    if (preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // The pattern is case-insensitive, so the captured label may be
            // "Age", "age" or "AGE"; normalise it before dispatching.
            $label = strtolower($match[1]);

            if ($label === 'location') {
                // Location is split on ":" into city / state / country.
                // NOTE(review): a ":" inside a link URL would also split here,
                // because links are stripped only after the split - confirm
                // against real page markup.
                $locations = array_map('trim', explode(":", $match[2]));
                // Fewer than three parts leaves the trailing fields empty.
                $locations = array_pad($locations, 3, '');

                $details['city'] = HTMLParser::StripLinks($locations[0]);
                $details['state'] = HTMLParser::StripLinks($locations[1]);
                $details['country'] = HTMLParser::StripLinks($locations[2]);
            } else {
                // age, gender, industry, occupation: the normalised label is
                // also the key in the result.
                $details[$label] = $match[2];
            }
        }
    }

    $pattern = '/<h2>(About\sMe|Interests)<\/h2>[\s\t\r\n]*<([a-z]+)\b[^>]*>(.*?)<\/\\2>/msi';

    if (preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            // Drop the whitespace and case differences the pattern allows,
            // giving "aboutme" or "interests".
            $section = strtolower(preg_replace('/\s/', '', $match[1]));

            if ($section === 'aboutme') {
                $details['aboutme'] = $match[3];
            } elseif ($section === 'interests') {
                // #todo - separators needed
                $details['interests'] = HTMLParser::StripTags($match[3]);
            }
        }
    }

    return $details;
}