/**
 * Exercise the HTMLParser stripping helpers against a small HTML fixture
 * containing links, a custom [attr] tag, a paragraph and an image.
 */
function testData_Text_HTMLParser()
{
    $html = "<a href='#'>Go Daddy</a><attr name='attr'>ATTR</attr>\r\n\t\t\t\t\t\t<p align=\"center\"><a href=\"http://www.myspace.com/declareyourself\" target=\"_blank\">\r\n\t\t\t\t\t\t<img src=\"http://creative.myspace.com/groups/_jc/declareyourself/dy_badge.jpg\" border=\"0\" />\r\n\t\t\t\t\t\t</a></p>";

    // With no tag filter the result is compared against PHP's native strip_tags().
    $this->assertEqual(
        HTMLParser::StripTags($html),
        strip_tags($html),
        "Error while stripping all tags"
    );

    // Stripping a single custom tag must leave no opening [attr] tag behind.
    $withoutAttr = HTMLParser::StripTags($html, 'attr');
    $this->assertFalse(stristr($withoutAttr, '<attr'), "Error while stripping [attr] tag");

    // Anchors removed through the generic tag stripper...
    $withoutAnchors = HTMLParser::StripTags($html, 'a');
    $this->assertFalse(stristr($withoutAnchors, 'href'), "Error while stripping [a] tag");

    // ...and through the dedicated link stripper.
    $withoutLinks = HTMLParser::StripLinks($html);
    $this->assertFalse(stristr($withoutLinks, 'href'), "Error while stripping links");

    // The fixture contains no scripts, so the content is expected back unchanged.
    $this->assertEqual(
        HTMLParser::StripScripts($html),
        $html,
        "Error while stripping scripts"
    );

    // Image tags (and therefore any 'img' substring) must disappear.
    $withoutImages = HTMLParser::StripTags($html, 'img');
    $this->assertFalse(stristr($withoutImages, 'img'), "Error while stripping [img] tag");
}
/**
 * Parse personal details from content
 *
 * Scans $this->Result for "<b>Label:</b>" table cells followed by a value
 * cell and maps the recognised labels onto a flat details array. The content
 * is passed through HTMLParser::AddTagDepth() before matching and restored
 * with HTMLParser::RemoveTagDepth() afterwards; the pattern relies on the
 * numeric markers inside the tags (presumably nesting depth - confirm against
 * HTMLParser) to pair each value cell's opening <td> with its own closing tag.
 *
 * Labels are matched case-insensitively. The "user" and "gizmo/lj talk"
 * labels are matched by the pattern but intentionally not exported.
 *
 * @return array profile details (only the keys that were found are present)
 * sample
 *  [userpic]  => http://www.livejournal.com/userpic/38353247/8981002
 *  [name]     => Natalie
 *  [website]  => http://www.myspace.com/aggressiva
 *  [city]     => La Verne
 *  [state]    => California
 *  [country]  => United States
 *  [birthday] => 1971-12-15
 *  [aboutme]  => party ... I just believe in parties!
 *  [email]    => e-mail cell content with tags stripped
 */
function GetPersonalDetails()
{
    $details = array();

    $pattern = '/><[0-9]*b>(user|name|website|location|birthdate|gizmo\/lj talk|bio|e\-mail)\:<[0-9]*\/b><[0-9]*\/td><([0-9]*)td[^>]*>(.*?)<\\2\/td>/msi';

    // Match against the depth-annotated markup, then restore the original content.
    $this->Result = HTMLParser::AddTagDepth($this->Result);
    $found = preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
    $this->Result = HTMLParser::RemoveTagDepth($this->Result);

    if (!$found) {
        // No rows (or a PCRE failure): make sure the loop below sees an empty list.
        $matches = array();
    }

    // Query-string location codes found in the location links => detail keys.
    $locationKeys = array('ci' => 'city', 'st' => 'state', 'cn' => 'country');

    foreach ($matches as $match) {
        $value = HTMLParser::RemoveTagDepth(trim($match[3]));

        // The pattern is case-insensitive, so the label must be normalised
        // before comparing; otherwise "name:" or "NAME:" would be dropped.
        switch (strtolower($match[1])) {
            case 'name':
                $details['name'] = $value;
                break;

            case 'website':
                // Take the target of the first quoted href in the cell.
                if (preg_match("/href=(\'|\")(.*?)\\1/", $value, $link)) {
                    $details['website'] = $link[2];
                }
                break;

            case 'location':
                preg_match_all("/loc\_(ci|st|cn)\=[^>\&]+>(.*?)</msi", $value, $parts, PREG_SET_ORDER);
                foreach ($parts as $part) {
                    // The code is matched case-insensitively as well.
                    $details[$locationKeys[strtolower($part[1])]] = $part[2];
                }
                break;

            case 'bio':
                $details['aboutme'] = $value;
                break;

            case 'birthdate':
                $details['birthday'] = $value;
                break;

            case 'e-mail':
                $details['email'] = HTMLParser::StripTags($value);
                break;
        }
    }

    // The first userpic URL anywhere in the content is used as the avatar.
    if (preg_match("/http\:\/\/([a-z0-9\-]+\.)+[a-z0-9]{2,6}\/userpic\/[0-9]+\/[0-9]+/msi", $this->Result, $userpic)) {
        $details['userpic'] = $userpic[0];
    }

    return $details;
}
/**
 * Find differences between multiline strings and return formatted new
 * string
 *
 * Both inputs are stripped of HTML tags first. When $this->Diff() yields a
 * patch, the patch is rewritten with the $this->Statements /
 * $this->Replacements substitution pairs and applied to the stripped old
 * string; otherwise the stripped new string is returned as is.
 *
 * @param string $string_old Old string
 * @param string $string_new New string
 * @return string New formatted string
 * @uses HTMLParser HTML Parser method StripTags
 * @access public
 */
public function GetHighlitedDiff($string_old, $string_new)
{
    $plainNew = HTMLParser::StripTags($string_new);
    $plainOld = HTMLParser::StripTags($string_old);

    $diff = $this->Diff($plainOld, $plainNew);

    // Nothing to highlight: hand back the tag-free new text.
    if (!$diff) {
        return $plainNew;
    }

    // Decorate the patch, then replay it on top of the old text.
    $decorated = preg_replace($this->Statements, $this->Replacements, $diff);

    return $this->Patch($plainOld, $decorated);
}
/**
 * Parse personal details from content
 *
 * Reads "<li><strong>Label:</strong> value</li>" entries (age, gender,
 * industry, occupation, location) and the "About Me" / "Interests" sections
 * that follow an <h2> heading in $this->Result. Labels and headings are
 * matched case-insensitively.
 *
 * The location value is split on ":" into city, state and country; parts
 * that are missing are reported as empty strings.
 *
 * @return array|null profile details with any of the keys age, gender,
 *                    city, state, country, industry, occupation, aboutme,
 *                    interests; null when $this->Result is empty
 */
function GetPersonalDetails()
{
    if (!$this->Result) {
        // Kept as a bare return (null) so existing callers see no change.
        return;
    }

    $details = array();

    $pattern = '/<li>[\s\t\r\n]*<strong>[\s\t\r\n]*(age|gender|industry|occupation|location)\:[\s\t\r\n]*<\/strong>[\s\t\r\n]*(.*?)[\s\t\r\n]*<\/li>/msi';
    preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);

    foreach ($matches as $match) {
        // The pattern is case-insensitive, so normalise the label before
        // comparing; otherwise "age:" or "AGE:" would be silently dropped.
        switch (strtolower($match[1])) {
            case 'age':
                $details['age'] = $match[2];
                break;

            case 'gender':
                $details['gender'] = $match[2];
                break;

            case 'location':
                $locations = array_map('trim', explode(":", $match[2]));
                // Guarantee three entries so a short location does not
                // trigger undefined-offset notices below.
                $locations = array_pad($locations, 3, '');
                $details['city'] = HTMLParser::StripLinks($locations[0]);
                $details['state'] = HTMLParser::StripLinks($locations[1]);
                $details['country'] = HTMLParser::StripLinks($locations[2]);
                break;

            case 'industry':
                $details['industry'] = $match[2];
                break;

            case 'occupation':
                $details['occupation'] = $match[2];
                break;
        }
    }

    $pattern = '/<h2>(About\sMe|Interests)<\/h2>[\s\t\r\n]*<([a-z]+)\b[^>]*>(.*?)<\/\\2>/msi';
    preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);

    foreach ($matches as $match) {
        // "\s" in the pattern accepts any whitespace character between the
        // two words, so fold it to a plain space along with the letter case.
        $heading = strtolower(preg_replace('/\s/', ' ', $match[1]));

        switch ($heading) {
            case 'about me':
                $details['aboutme'] = $match[3];
                break;

            // #todo - separators needed
            case 'interests':
                $details['interests'] = HTMLParser::StripTags($match[3]);
                break;
        }
    }

    return $details;
}