/**
 * Runs the article extraction pipeline over a raw HTML page.
 *
 * The page is parsed into an article object which is then handed, in order,
 * to each filter/merger below; every step receives that same object.
 *
 * @param string $rawHTMLPage the raw HTML from which to extract an article
 * @param string $source      value stored on the result's `source` property
 * @return Article extraction result
 */
public static function extractFromHTML($rawHTMLPage, $source = "")
{
    // Split the HTML into blocks.
    $htmlParser = new HTMLParser();
    $extracted = $htmlParser->parse($rawHTMLPage);

    // Isolate the clean article title.
    Filters\TitleFilter::filter($extracted);

    // Locate article 'end' points via syntactic terminators.
    Filters\EndBlockFilter::filter($extracted);

    // Classify blocks by word count and link density.
    Filters\NumberOfWordsFilter::filter($extracted);

    // Drop blocks that follow the content.
    Filters\PostcontentFilter::filter($extracted);

    // Merge blocks that sit close together.
    Mergers\CloseBlockMerger::merge($extracted);

    // Remove blocks that are not content.
    Filters\NonContentFilter::filter($extracted);

    // Mark the largest block as 'content'.
    Filters\LargestBlockFilter::filter($extracted);

    // Blocks between the title and the main content count as content too.
    Filters\BetweenTitleAndContentFilter::filter($extracted);

    // Clean up blocks made irrelevant by extraction and set the full title.
    Filters\PostextractionFilter::filter($extracted);

    // Line-by-line pass removing non-content lines.
    Filters\LineFilter::filter($extracted);

    // Determine the document language.
    Filters\LanguageFilter::filter($extracted);

    // Extract keywords from the article.
    Filters\KeywordFilter::filter($extracted);

    $extracted->source = $source;

    return $extracted;
}
/**
 * Parses a template string and converts the resulting tree with
 * parse_html_tree().
 *
 * The handler registered under HTMLParser::T_ERROR_HANDLER throws an
 * \Exception built from the formatted message and its arguments.
 *
 * @param string $template
 *
 * @return array
 */
public function __invoke($template)
{
    $throwOnError = function ($message, array $args) {
        throw new \Exception(\ICanBoogie\format($message, $args));
    };

    $htmlParser = new HTMLParser([HTMLParser::T_ERROR_HANDLER => $throwOnError]);

    return $this->parse_html_tree($htmlParser->parse($template, Engine::PREFIX));
}
/**
 * Follows outgoing links level by level and appends the <body> of every page
 * reached at the final level to the document of $session.
 *
 * `xpathOutgoing` is a comma-separated list of XPath expressions, ordered by
 * level of hierarchy. Level 0 is evaluated against $session->url; the text
 * content of each matched node becomes a URL for the next level.
 *
 * @param Session $session session whose DOM receives the collected bodies
 */
public function parse(Session $session)
{
    if (!$this->xpathOutgoing) {
        Utils::throw400("Must set 'xpathOutgoing'");
    }

    // One XPath per hierarchy level.
    $xpathOutgoingList = preg_split("/\\s*,\\s*/", $this->xpathOutgoing);

    // Let the parent parser build the DOM for the entry page.
    parent::parse($session);

    // URLs to visit at the current level.
    $crawlUrls = array($session->url);

    foreach ($xpathOutgoingList as $i => $thisLevelXpath) {
        $nextLevelUrls = array();

        foreach ($crawlUrls as $url) {
            $subsession = $this->fetchSubsession($url);

            $outLinkNodes = $subsession->xpath->query($thisLevelXpath);
            if ($outLinkNodes === false) {
                throw Utils::throw400("Xpath query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
            }
            if ($outLinkNodes->length === 0) {
                throw Utils::throw400("No results for query '{$thisLevelXpath}' failed for '{$url}' [Level: {$i}]");
            }

            foreach ($outLinkNodes as $outLinkNode) {
                $nextLevelUrls[] = $subsession->ensureAbsoluteUrl($outLinkNode->textContent);
            }
        }

        $crawlUrls = $nextLevelUrls;
    }

    // Append the <body> of every final-level page to the original document.
    foreach ($crawlUrls as $url) {
        $subsession = $this->fetchSubsession($url);

        $body = $subsession->dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // No <body> on this page: nothing to merge, and importNode(null)
            // would abort the whole crawl.
            continue;
        }

        $newBody = $session->dom->importNode($body, true);
        $session->dom->documentElement->appendChild($newBody);
    }

    // NOTE(review): looks like a leftover debug dump to a fixed path; kept so
    // behaviour is unchanged - confirm whether anything reads this file.
    $session->dom->save('/tmp/test3.html');
}

/**
 * Fetches $url through a CachingHttpFetcher and parses it with a plain
 * (non-crawling) HTMLParser.
 *
 * @param string $url
 * @return Session the fetched and parsed session
 */
private function fetchSubsession($url)
{
    $subsession = new Session($url);

    $fetcher = new CachingHttpFetcher();
    $fetcher->fetch($subsession);

    $parser = new HTMLParser();
    $parser->parse($subsession);

    return $subsession;
}
/**
 * Getting the price match result
 *
 * Looks the SKU up on staticice.com.au and, for every listed seller whose URL
 * matches one of the given company aliases, records that seller's price.
 *
 * @param array  $companyAliases map of key => alias list; each entry must also
 *                               carry a 'PriceMatchCompanyId' element
 * @param string $sku
 * @param number $myPrice
 *
 * @return array with keys 'sku', 'myPrice', 'minPrice', 'companyPrices'
 *               (key => array('price', 'priceURL', 'PriceMatchCompanyId'))
 *               and 'priceDiff' (myPrice - minPrice)
 */
public static function getPrices($companyAliases, $sku, $myPrice)
{
    $myPrice = StringUtilsAbstract::getValueFromCurrency($myPrice);

    // Initialise every company with an empty price so the output shape is stable.
    $finalOutputArray = array('sku' => $sku, 'myPrice' => $myPrice, 'minPrice' => 0, 'companyPrices' => array());
    foreach ($companyAliases as $key => $value) {
        $finalOutputArray['companyPrices'][$key] = array('price' => 0, 'priceURL' => '', 'PriceMatchCompanyId' => $value['PriceMatchCompanyId']);
    }

    $url = 'http://www.staticice.com.au/cgi-bin/search.cgi';

    // Getting actual values
    $productPriceArray = HTMLParser::getPriceListForProduct($url, $sku);
    foreach ($productPriceArray as $productPriceInfo) {
        if (($companyDetails = trim($productPriceInfo['companyDetails'])) === '') {
            continue;
        }

        // The seller URL is the second-to-last '|' separated field, when present.
        $cdArray = explode('|', $companyDetails);
        $urlIndex = count($cdArray) - 2;
        $companyURL = strtolower(isset($cdArray[$urlIndex]) ? trim($cdArray[$urlIndex]) : trim($companyDetails));

        foreach ($companyAliases as $key => $value) {
            // 'strtolower' replaces the former create_function() callback, which
            // no longer exists in PHP 8. Both sides are strings, so the strict
            // comparison avoids numeric-string coincidences such as "1e1" == "10".
            if (is_array($value) === true && in_array($companyURL, array_map('strtolower', $value), true)) {
                $price = str_replace(array(',', '$', ' '), '', $productPriceInfo['price']);

                if ($finalOutputArray['minPrice'] == 0 || $finalOutputArray['minPrice'] > $price) {
                    $finalOutputArray['minPrice'] = $price;
                }

                $finalOutputArray['companyPrices'][$key] = array('price' => $price, 'priceURL' => HTMLParser::getHostUrl($url) . $productPriceInfo['priceLink'], 'PriceMatchCompanyId' => $value['PriceMatchCompanyId']);
                break;
            }
        }
    }

    // Return the result
    $finalOutputArray['priceDiff'] = $finalOutputArray['myPrice'] - $finalOutputArray['minPrice'];

    return $finalOutputArray;
}
/**
 * Resolves the resource named by the tag's attributes, parses it with
 * HTMLParser and attaches the parsed node as a child of this tag.
 *
 * Name resolution:
 *  - without a `name` attribute, the name is the document variable named by
 *    the `key` attribute;
 *  - with a `name` attribute, the name is resolved to a resource path inside
 *    a package: the one given by `package`, otherwise the package owning the
 *    request's `action`, otherwise the package owning `freeform`'s
 *    `action.default` (or `freeform` itself when that property is empty).
 *
 * @return mixed self::PROCESS_BODY when a node was attached, self::SKIP_BODY otherwise
 */
function onOpen()
{
    $this->removeAll();
    $d = $this->getDocument();

    if (!($name = $this->getAttribute('name'))) {
        $name = $d->getVariable($this->getAttribute('key'));
    } else {
        if (!($pkg = $this->getAttribute('package'))) {
            // Try to find current package (where calling action defined)
            if (!($a = $d->getResponce()->getRequest()->getParameter('action'))) {
                $pkg = Package::getPackageByName('freeform');
                if ($a = $pkg->getProperty('action.default')) {
                    $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
                }
            } else {
                $pkg = Package::getPackageByName(Package::getPackageNameForClass($a));
            }
        } else {
            // A debug `echo $pkg;` used to sit here and wrote the package name
            // into the rendered output; it has been removed.
            $pkg = Package::getPackageByName($pkg);
        }

        if ($pkg) {
            $name = $pkg->getResourcePath($name);
        }
    }

    if ($name) {
        $p = new HTMLParser($this->getDocument());
        $r = $p->parse($name);
        if ($r) {
            $r->setExposed(false);
            $this->addNode($r);
            return self::PROCESS_BODY;
        }
    }

    return self::SKIP_BODY;
}
/**
 * Scans the directory listing at "<baseURL><version>/" for a table cell whose
 * text satisfies doesElementMatchServerBinaryRules().
 *
 * Only the scan of the current row stops at a match; later rows are still
 * examined, so the value returned comes from the last matching row.
 *
 * @param string $baseURL
 * @param string $version
 * @return string|null matching cell text, or null when nothing matched or the
 *                     page could not be fetched (in which case "ERROR" is echoed)
 */
function doesVersionContainServerBinary($baseURL, $version)
{
    $page = new HTMLParser();
    $page->setURL($baseURL . $version . "/");
    $page->getHTML();

    if ($page->Status != null) {
        echo "ERROR";
        return null;
    }

    $found = null;

    $document = new DOMDocument();
    $document->loadHTML($page->Contents);

    foreach ($document->getElementsByTagName("tr") as $row) {
        foreach ($row->childNodes as $cell) {
            if ($this->doesElementMatchServerBinaryRules($cell->nodeValue)) {
                $found = $cell->nodeValue;
                break;
            }
        }
    }

    return $found;
}
/**
 * Exercises the HTMLParser stripping helpers (StripTags, StripLinks,
 * StripScripts) against one small HTML sample.
 */
function testData_Text_HTMLParser()
{
    $html = "<a href='#'>Go Daddy</a><attr name='attr'>ATTR</attr>\r\n\t\t\t\t\t\t<p align=\"center\"><a href=\"http://www.myspace.com/declareyourself\" target=\"_blank\">\r\n\t\t\t\t\t\t<img src=\"http://creative.myspace.com/groups/_jc/declareyourself/dy_badge.jpg\" border=\"0\" />\r\n\t\t\t\t\t\t</a></p>";

    // Without a tag argument the result must equal PHP's own strip_tags().
    $allStripped = HTMLParser::StripTags($html);
    $this->assertEqual($allStripped, strip_tags($html), "Error while stripping all tags");

    // Stripping one named tag.
    $withoutAttr = HTMLParser::StripTags($html, 'attr');
    $this->assertFalse(stristr($withoutAttr, '<attr'), "Error while stripping [attr] tag");

    $withoutAnchors = HTMLParser::StripTags($html, 'a');
    $this->assertFalse(stristr($withoutAnchors, 'href'), "Error while stripping [a] tag");

    $withoutLinks = HTMLParser::StripLinks($html);
    $this->assertFalse(stristr($withoutLinks, 'href'), "Error while stripping links");

    // The sample contains no scripts, so it must come back unchanged.
    $withoutScripts = HTMLParser::StripScripts($html);
    $this->assertEqual($withoutScripts, $html, "Error while stripping scripts");

    $withoutImages = HTMLParser::StripTags($html, 'img');
    $this->assertFalse(stristr($withoutImages, 'img'), "Error while stripping [img] tag");
}
/**
 * Get information from `Who I'd like to meet:` block
 *
 * The block is located with GetMatches() and then passed through
 * HTMLParser::StripStyles() and HTMLParser::StripAds().
 */
function GetWhoILikeToMeetInfo()
{
    $blockPatterns = array(
        '/<([0-9]+)td[^>]+>[\n\r\s\t]*<[0-9]+span[^>]+>[\n\r\s\t]*Who I\\\'d like to meet\:[\s\t\n\r]*<[0-9]+\/span>(?:<[0-9]+br[^>]*>)?(.*?)<\\1\/td>/msi'
    );

    $block = $this->GetMatches($blockPatterns, true);
    $withoutStyles = HTMLParser::StripStyles($block);

    return HTMLParser::StripAds($withoutStyles);
}
/**
 * Logs in to poj.org, downloads the source of the given run and returns the
 * text found between the `//<ID>` and `</ID>` markers.
 *
 * The login response body is stored in $this->data. The temporary cookie file
 * is removed before returning.
 *
 * @param int|string $RunID solution id on poj.org
 * @return mixed whatever HTMLParser::innerHTML() yields for the markers
 */
public function getIdFromSource($RunID)
{
    // Information
    $cookie_file = tempnam("./cookie", "cookie");
    $login_url = "http://poj.org/login";
    // Credentials are URL-encoded so characters such as '&', '=' or '+' in a
    // user name or password cannot corrupt the form body.
    $post_fields = "user_id1=" . urlencode($this->user) . "&password1=" . urlencode($this->pass) . "&url=/";

    // Login
    $curl = curl_init($login_url);
    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_file);
    curl_setopt($curl, CURLOPT_POST, 1);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $post_fields);
    $this->data = curl_exec($curl);
    // Closing the handle also flushes the cookie jar to disk before it is read back.
    curl_close($curl);

    // Get Source
    $curl = curl_init("http://poj.org/showsource?solution_id=" . urlencode($RunID));
    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_file);
    $src = curl_exec($curl);
    curl_close($curl);

    @unlink($cookie_file);

    $th = new HTMLParser();
    $th->loadHTML($src);

    return $th->innerHTML('//<ID>', '</ID>');
}
/**
 * Login to account
 *
 * Posts the credentials to LOGIN_URL and extracts `auth_token` from the
 * response. When no token is present, the `grant_perm` form found in the
 * response is submitted (without its `cancel` element) and the token is
 * looked for again. On success the session is fetched through the API client.
 *
 * @param string $email
 * @param string $password
 * @return bool|null true on success; false when the permission form request
 *                   returned nothing; null (no value) on every other failure path
 */
function Login($email, $password)
{
    $this->Username = $email;
    $this->Password = $password;
    $this->LoggedIn = false;

    if (!class_exists('FacebookAPI')) return;

    $this->Client = new FacebookAPI(self::REST_SERVER_ADDR, $this->API_Key, $this->Secret);

    $params = array(
        'email'     => trim($this->Username),
        'pass'      => trim($this->Password),
        'md5pass'   => '',
        'challenge' => md5(time()),
        'noerror'   => 1,
        'api_key'   => $this->API_Key,
        'next'      => '',
        'v'         => '1.0'
    );

    $this->UseRedirects();
    $this->UseCookies();
    $this->ReceiveHeaders();
    $this->Fetch(self::LOGIN_URL, $params, true);

    // Only read the capture group when the pattern actually matched; reading
    // $auth[1] unconditionally raised an undefined-offset error on failure.
    $this->AuthToken = preg_match("/auth\_token=([a-z0-9]+)/msi", $this->Result, $auth) ? $auth[1] : null;

    if (!$this->AuthToken) {
        $form = HTMLParser::GetFormDetails($this->Result, null, 'grant_perm');
        if (!$form) return;

        // Never submit the "cancel" button; empty() tolerates a form without one.
        if (!empty($form['elements']['cancel'])) unset($form['elements']['cancel']);

        $this->Fetch($form['action'], $form['elements'], true);
        if (!$this->Result) return false;

        $this->AuthToken = preg_match("/auth\_token=([a-z0-9]+)/msi", $this->Result, $auth) ? $auth[1] : null;
    }

    if ($this->AuthToken) {
        $this->LoggedIn = true;
        $this->SessionInfo = $this->Client->GetSession($this->AuthToken);
        return true;
    }
}
$requestedBitVersion = $_GET['bit']; if (!($requestedBitVersion == "amd64" || $requestedBitVersion == "x86")) { die("<strong>Error</strong> Supported bit versions are: amd64, x86"); } //Base URL for the repo $baseURL = "http://dl.4players.de/ts/releases/"; //Builds a URL so that WGET can be used against it. function buildDownloadURLForWGET($baseURL, $fileDetails) { $fileName = $fileDetails["file"]; $fileVersion = $fileDetails["version"]; return $baseURL . $fileVersion . "/" . $fileName; } $TeamspeakHandler = new Teamspeak(); $TeamspeakHandler->setBinaryBitRequired($requestedBitVersion); $HTMLParser = new HTMLParser(); $HTMLParser->setURL($baseURL); $HTMLParser->getHTML(); if ($HTMLParser->Status == null) { $teamspeakVersions = $TeamspeakHandler->getTeamspeakVersionsFromHTML($HTMLParser->Contents); if ($teamspeakVersions == null) { die("Unknown error, unable to get versions"); } //Switch on the requested format (either x86 or amd64) switch ($requestedBitVersion) { case "amd64": $amd64Binary = null; //Rotate around each version found in reverse and find a server binary. foreach (array_reverse($teamspeakVersions) as $teamspeakVersion) { $serverBinaryAMD64 = $TeamspeakHandler->doesVersionContainServerBinary($baseURL, $teamspeakVersion); if ($serverBinaryAMD64 != null && $amd64Binary == null) {
/**
 * Downloads a problem page from poj.org and stores its fields in
 * $this->pro_info (title, time, memory, submissions, accepted, description,
 * input, output, sample_input, sample_output, hint, source).
 *
 * Each field is cut out of the page with HTMLParser::innerHTML() using a pair
 * of literal start/end markers; the numeric fields go through intval().
 *
 * @param int $id problem id on poj.org
 */
public function POJ_Problem($id = 1000)
{
    require_once dirname(__FILE__) . "/HTMLParser.php";

    $page = new HTMLParser("http://poj.org/problem?id=" . $id);
    $page->optHTMLLink();

    $info = array();
    $info['title'] = $page->innerHTML('<div class="ptt" lang="en-US">', '</div>');

    // The time limit lookup is the only one that receives the extra third
    // argument, built from the title markup.
    $afterTitle = $page->startString('<div class="ptt" lang="en-US">' . $info['title']);
    $info['time'] = intval($page->innerHTML('<div class="plm"><table align="center"><tr><td><b>Time Limit:</b> ', 'MS</td>', $afterTitle));
    $info['memory'] = intval($page->innerHTML('MS</td><td width="10px"></td><td><b>Memory Limit:</b> ', 'K</td>'));
    $info['submissions'] = intval($page->innerHTML('Total Submissions:</b> ', '</td>'));
    $info['accepted'] = intval($page->innerHTML('</td><td><b>Accepted:</b> ', '</td>'));

    // Text sections: field name => (start marker, end marker), in page order.
    $sections = array(
        'description'   => array('<p class="pst">Description</p><div class="ptx" lang="en-US">', '</div>'),
        'input'         => array('<p class="pst">Input</p><div class="ptx" lang="en-US">', '</div>'),
        'output'        => array('<p class="pst">Output</p><div class="ptx" lang="en-US">', '</div>'),
        'sample_input'  => array('<p class="pst">Sample Input</p><pre class="sio">', '</pre>'),
        'sample_output' => array('<p class="pst">Sample Output</p><pre class="sio">', '</pre>'),
        'hint'          => array('<p class="pst">Hint</p><div class="ptx" lang="en-US">', '</div><p class="pst">Source</p>'),
        'source'        => array('<p class="pst">Source</p><div class="ptx" lang="en-US">', '</div>'),
    );
    foreach ($sections as $field => $markers) {
        $info[$field] = $page->innerHTML($markers[0], $markers[1]);
    }

    $this->pro_info = $info;
}
/**
 * Downloads a problem page from acm.hdu.edu.cn and stores its fields in
 * $this->pro_info (title, time, memory, submissions, accepted, description,
 * input, output, sample_input, sample_output, hint, source).
 *
 * Fields are cut out with HTMLParser::innerHTML() using literal start/end
 * markers; `hint` is always "N/A". Every value is finally converted from
 * GB2312 to UTF-8 with iconv().
 *
 * @param int $id problem id on HDOJ
 */
public function HDOJ_Problem($id = 1000)
{
    require_once dirname(__FILE__) . "/HTMLParser.php";

    $page = new HTMLParser("http://acm.hdu.edu.cn/showproblem.php?pid=" . $id);
    $page->optHTMLLink();

    // Just a hack for OS X: every extracted value is passed through substr()
    // starting at this offset.
    $offset = 0;

    // Field name => (start marker, end marker), in the order they are read.
    $markersBeforeHint = array(
        'title'         => array("<h1 style='color:#1A5CC8'>", '</h1>'),
        'time'          => array('<span style=\'font-family:Arial;font-size:12px;font-weight:bold;color:green\'>Time Limit: ', ' MS (Java/Others)'),
        'memory'        => array('MS (Java/Others) Memory Limit: ', ' K (Java/Others)<br>'),
        'submissions'   => array('Total Submission(s): ', ' Accepted'),
        'accepted'      => array('Accepted Submission(s): ', '<br></span></b></font>'),
        'description'   => array('<div class=panel_title align=left>Problem Description</div> <div class=panel_content>', '</div><div class=panel_bottom>'),
        'input'         => array('<div class=panel_title align=left>Input</div> <div class=panel_content>', '</div><div class=panel_bottom>'),
        'output'        => array('<div class=panel_title align=left>Output</div> <div class=panel_content>', '</div><div class=panel_bottom>'),
        'sample_input'  => array('<div class=panel_title align=left>Sample Input</div><div class=panel_content><pre>', '</pre>'),
        'sample_output' => array('<div class=panel_title align=left>Sample Output</div><div class=panel_content><pre>', '</pre>'),
    );

    $info = array();
    foreach ($markersBeforeHint as $field => $markers) {
        $info[$field] = substr($page->innerHTML($markers[0], $markers[1]), $offset);
    }

    $info['hint'] = "N/A";
    $info['source'] = substr($page->innerHTML('<div class=panel_title align=left>Author</div> <div class=panel_content>', '</div><div class=panel_bottom>'), $offset);

    // Convert every collected value from GB2312 to UTF-8.
    foreach (array_keys($info) as $field) {
        $info[$field] = iconv('GB2312', 'UTF-8', $info[$field]);
    }

    $this->pro_info = $info;
}
/**
 * Parse personal details from content
 *
 * Reads `<li><strong>label:</strong> value</li>` entries (age, gender,
 * industry, occupation, location) and the "About Me" / "Interests" sections
 * from $this->Result.
 *
 * A location value is split on ':' into city, state and country; parts that
 * are absent are passed to HTMLParser::StripLinks() as null.
 *
 * @return array|null profile details, or null (no value) when $this->Result is empty
 */
function GetPersonalDetails()
{
    if (!$this->Result) return;

    $details = array();

    $pattern = '/<li>[\s\t\r\n]*<strong>[\s\t\r\n]*(age|gender|industry|occupation|location)\:[\s\t\r\n]*<\/strong>[\s\t\r\n]*(.*?)[\s\t\r\n]*<\/li>/msi';
    preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // The pattern matches case-insensitively, so the label is normalised
        // before dispatching; otherwise "age:" or "AGE:" would match the regex
        // and then be silently dropped by the switch.
        switch (strtolower($match[1])) {
            case 'age':
                $details['age'] = $match[2];
                break;
            case 'gender':
                $details['gender'] = $match[2];
                break;
            case 'location':
                // Pad to three parts so a shorter location does not read
                // undefined offsets.
                $locations = array_pad(array_map('trim', explode(":", $match[2])), 3, null);
                $details['city'] = HTMLParser::StripLinks($locations[0]);
                $details['state'] = HTMLParser::StripLinks($locations[1]);
                $details['country'] = HTMLParser::StripLinks($locations[2]);
                break;
            case 'industry':
                $details['industry'] = $match[2];
                break;
            case 'occupation':
                $details['occupation'] = $match[2];
                break;
        }
    }

    $pattern = '/<h2>(About\sMe|Interests)<\/h2>[\s\t\r\n]*<([a-z]+)\b[^>]*>(.*?)<\/\\2>/msi';
    preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
    foreach ($matches as $match) {
        // Same normalisation as above; `\s` in the pattern may also match a
        // tab or newline between "About" and "Me".
        switch (strtolower(preg_replace('/\s/', ' ', $match[1]))) {
            case 'about me':
                $details['aboutme'] = $match[3];
                break;
            // #todo - separators needed
            case 'interests':
                $details['interests'] = HTMLParser::StripTags($match[3]);
                break;
        }
    }

    return $details;
}
/**
 * Get the education block
 *
 * Extracts the `education` div with GetMatches(), strips links from it and
 * removes every "Activities and Societies:" paragraph.
 *
 * @return mixed the cleaned block as produced by preg_replace()
 */
public function GetEducation()
{
    $blockPatterns = array('/\<div[\s\t]+name\=\"education\"[\s\t]+id\=\"[a-z0-9]+\"[^\>]*\>(.*?)\<\/div\>[\s\t\n\r]+\<h2/msi');

    // Get education block, without links.
    $block = HTMLParser::StripLinks($this->GetMatches($blockPatterns));

    return preg_replace("/\<p[^\>]+\>[\s\t\n\r]*\<em\>[\s\t\n\r]*Activities and Societies\:.*?\<\/p\>/ms", "", $block);
}
/**
 * Fetches the connections page and returns the connections listed on it.
 *
 * @return array|false list of array('id', 'name', 'email') entries (name with
 *                     links stripped); false when not logged in or when the
 *                     page could not be fetched
 */
function GetConnectionsList()
{
    if (!$this->LoggedIn) return false;

    $this->Fetch(self::CONNECTIONS_URL);
    if (!$this->Result) return false;

    // The former "numConnections" lookup was dropped: its result was never
    // used, and it read $match[1] even when the pattern did not match.

    preg_match_all("/\<tr [^\>]*name\=\"connection\".*?\_connection([0-9]+).*?name\=\"fullName\"[^\>]*\>(.*?)\<\/strong\>.*?\"email\"[^\>]*\>([^\@]+\@[^\@]+\.[a-z]{2,6})\<.*?\<\/tr\>/msi", $this->Result, $matches, PREG_SET_ORDER);

    $connections = array();
    foreach ($matches as $match) {
        $connections[] = array(
            'id'    => $match[1],
            'name'  => HTMLParser::StripLinks($match[2]),
            'email' => $match[3]
        );
    }

    return $connections;
}
/**
 * Return array contains formated XHTML string
 * created from the responded HTML of the given URL.
 * array[code]    => HTTP status code
 * array[headers] => HTTP headers
 * array[body]    => formated XHTML string made from the entity body
 * Throw exception if error.
 *
 * The cache is only used when a cache directory is configured AND
 * $cache_lifetime is greater than zero.
 *
 * @param string  $url
 * @param integer $cache_lifetime
 * @param boolean $conditional_request
 * @param array   $headers
 * @param array   $post
 * @return array
 * @throws Exception on an invalid URL, an unexpected status code or
 *                   Content-Type, an empty/unusable body, or a failed
 *                   character encoding detection/conversion
 */
public final function getXhtml($url, $cache_lifetime = 0, $conditional_request = false, $headers = array(), $post = array())
{
    /*
     * \x21\x23-\x3b\x3d\x3f-\x5a\x5c\x5f\x61-\x7a\x7c\x7e
     */
    if (!preg_match('/^https?:\\/\\/\\w[\\w\\-\\.]+/i', $url)) {
        throw new Exception("Not a valid or fully qualified HTTP URL.");
    }

    $data = false;
    $cache_lifetime = (int) $cache_lifetime;
    /*
     * `&&` instead of `and`: `and` binds looser than `=`, so the former
     * `$use_cache = !empty(...) and $cache_lifetime > 0` ignored the lifetime.
     */
    $use_cache = (!empty($this->cacheDir) && $cache_lifetime > 0);

    if ($use_cache) {
        $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
        $params = array();
        foreach ($headers as $key => $value) {
            if (!empty($value)) {
                $params[] = urlencode($key) . '=' . urlencode($value);
            }
        }
        foreach ($post as $key => $value) {
            $params[] = urlencode($key) . '=' . urlencode($value);
        }
        $cache_id = "{$url}?" . implode('&', $params);
        if (false !== ($data = $cache->get($cache_id))) {
            /*
             * NOTE(review): unserialize() is only safe as long as the cache
             * directory cannot be written by untrusted parties.
             */
            $data = unserialize($data);
        }
    }

    /*
     * Access to the URL if not cached
     * or if the cache has either Last-Modified or Etag header
     * and conditional request is specified.
     * The conditional request is therefore dropped only when the cache holds
     * NEITHER validator (the former `or` dropped it unless both were present).
     */
    if ($conditional_request and !isset($data['headers']['last-modified']) and !isset($data['headers']['etag'])) {
        $conditional_request = false;
    }

    if (!$data or $conditional_request) {
        if (isset($data['headers']['last-modified']) and empty($headers['last-modified'])) {
            $headers['last-modified'] = $data['headers']['last-modified'];
        }
        if (isset($data['headers']['etag']) and empty($headers['etag'])) {
            $headers['etag'] = $data['headers']['etag'];
        }

        try {
            $response = $this->getHttpResponse($url, $headers, $post);
        } catch (Exception $e) {
            /* With cached data at hand a failed request falls back to the cache. */
            if (!$data) {
                throw $e;
            }
        }

        /*
         * Use cache if the responded HTTP status code is 304.
         * If 200, format the responded HTML of the given URL to XHTML.
         */
        if (!$data or (isset($response['code']) and $response['code'] != 304)) {
            $data =& $response;

            /*
             * If status code was 200 and Content-Type was not (X)HTML,
             * the status code was forcibly altered to 204.
             * @see HTTP_Request_Listener_Extended->update().
             */
            if ($data['code'] != 200 and $data['code'] != 204) {
                throw new Exception("Responded HTTP Status Code is {$data['code']}.");
            } elseif (isset($data['headers']['content-type']) and !preg_match('/^(?:text|application)\\/x?html\\b/', $data['headers']['content-type'])) {
                throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
            } elseif (empty($data['body'])) {
                throw new Exception("Responded entity body is empty.");
            } elseif (!preg_match('/<\\w+[^>]*?>/', $data['body'], $matches)) {
                throw new Exception("Responded entity body does not contain a markup symbol.");
            } elseif (false !== strpos($matches[0], "\0")) {
                /*
                 * The needle must be a NUL byte. With an empty needle
                 * strpos() returns 0 on PHP 8, so every document was rejected.
                 */
                throw new Exception("Responded entity body contains NULL.");
            }

            /*
             * Remove BOM and NULLs.
             */
            $data['body'] = preg_replace('/^\\xef\\xbb\\xbf/', '', $data['body']);
            $data['body'] = str_replace("\0", '', $data['body']);

            /*
             * Initialize the backups.
             */
            $this->backup = array();
            $this->backup_count = 0;

            /*
             * Removing SCRIPT and STYLE is recommended.
             * An alternative would be to wrap their content in CDATA sections,
             * but some JavaScript (e.g. document.write) is not compliant with
             * XHTML/XML, so the elements are dropped entirely.
             */
            $tags = array('script', 'style');
            foreach ($tags as $tag) {
                $data['body'] = preg_replace("/<{$tag}\\b[^>]*?>.*?<\\/{$tag}\\b[^>]*?>/si", '', $data['body']);
            }

            /*
             * Backup CDATA sections for later process.
             */
            $data['body'] = preg_replace_callback('/<!\\[CDATA\\[.*?\\]\\]>/s', array($this, 'backup'), $data['body']);

            /*
             * Comment section must not contain two or more adjacent hyphens.
             * (Closures replace create_function(), which was removed in PHP 8.)
             */
            $data['body'] = preg_replace_callback('/<!--(.*?)-->/si', function ($matches) {
                return "<!-- " . preg_replace("/-{2,}/", "-", $matches[1]) . " -->";
            }, $data['body']);

            /*
             * Backup comment sections for later process.
             */
            $data['body'] = preg_replace_callback('/<!--.*?-->/s', array($this, 'backup'), $data['body']);

            /*
             * Process tags that is potentially dangerous for XML parsers:
             * a literal "<" inside these elements is escaped to "&lt;".
             */
            $data['body'] = preg_replace_callback('/(<textarea\\b[^>]*?>)(.*?)(<\\/textarea\\b[^>]*?>)/si', function ($matches) {
                return $matches[1] . str_replace("<", "&lt;", $matches[2]) . $matches[3];
            }, $data['body']);
            $data['body'] = preg_replace_callback('/<xmp\\b[^>]*?>(.*?)<\\/xmp\\b[^>]*?>/si', function ($matches) {
                return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
            }, $data['body']);
            $data['body'] = preg_replace_callback('/<plaintext\\b[^>]*?>(.*)$/si', function ($matches) {
                return "<pre>" . str_replace("<", "&lt;", $matches[1]) . "</pre>";
            }, $data['body']);

            /*
             * Remove DTD declarations, wrongly placed comments etc.
             * This must be done before removing DOCTYPE.
             */
            $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);

            /*
             * XML and DOCTYPE declaration will be replaced.
             */
            $data['body'] = preg_replace('/<!DOCTYPE\\b[^>]*?>/si', '', $data['body']);
            $data['body'] = preg_replace('/<\\?xml\\b[^>]*?\\?>/si', '', $data['body']);

            if (preg_match('/^\\s*$/s', $data['body'])) {
                throw new Exception('The entity body became empty after preprocessing.');
            }

            /*
             * Detect character encoding and convert to UTF-8.
             * The Content-Type header wins; otherwise the first
             * <meta http-equiv="content-type"> is consulted.
             */
            $encoding = false;
            if (isset($data['headers']['content-type'])) {
                $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
            }
            if (!$encoding and preg_match_all('/<meta\\b[^>]*?>/si', $data['body'], $matches)) {
                foreach ($matches[0] as $value) {
                    if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' and false !== ($encoding = $this->getAttribute('content', $value))) {
                        $encoding = $this->getCharsetFromCType($encoding);
                        break;
                    }
                }
            }

            /*
             * Use mbstring to convert character encoding if available.
             * Otherwise use iconv (iconv may try to detect character encoding automatically).
             * Do not trust the declared encoding and do conversion even if UTF-8.
             */
            if (extension_loaded('mbstring')) {
                if (!$encoding) {
                    @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                    if (false === ($encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body'])))) {
                        throw new Exception('Failed detecting character encoding.');
                    }
                }
                @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
            } else {
                if (false === ($data['body'] = @iconv($encoding, 'UTF-8', $data['body']))) {
                    throw new Exception('Failed converting character encoding.');
                }
                foreach ($this->backup as $key => $value) {
                    if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                        throw new Exception('Failed converting character encoding.');
                    }
                }
            }

            /*
             * Restore CDATAs and comments.
             */
            for ($i = 0; $i < $this->backup_count; $i++) {
                $data['body'] = str_replace("<restore count=\"{$i}\" />", $this->backup[$i], $data['body']);
            }

            /*
             * Use Tidy to format HTML if available.
             * Otherwise, use HTMLParser class (is slower and consumes much memory).
             */
            if (extension_loaded('tidy')) {
                $tidy = new tidy();
                $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
                $tidy->cleanRepair();
                $data['body'] = $tidy->html();
            } else {
                require_once 'HTMLParser.class.php';
                $parser = new HTMLParser();
                $format_rule = (require 'xhtml1-transitional_dtd.inc.php');
                $parser->setRule($format_rule);
                $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
                $parser->setGenericParent('body');
                $parser->parse($data['body']);
                $data['body'] = $parser->dump();
            }

            /*
             * Valid XHTML DOCTYPE declaration (with DTD URI) is required
             * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
             */
            $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
            $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
            $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
            $data['body'] = "{$declarations}{$data['body']}";

            if ($use_cache) {
                $cache->save(serialize($data), $cache_id);
            }
        }
    }

    return $data;
}
/**
 * Find differences between multiline strings and return formatted new
 * string
 *
 * Both inputs are stripped of tags first. When Diff() yields a patch, the
 * patch is rewritten with the Statements/Replacements pairs and applied to
 * the old string; otherwise the tag-stripped new string is returned as is.
 *
 * @param string $string_old Old string
 * @param string $string_new New string
 * @return string New formatted string
 * @uses HTMLParser HTML Parser method StripTags
 * @access public
 */
public function GetHighlitedDiff($string_old, $string_new)
{
    $plainNew = HTMLParser::StripTags($string_new);
    $plainOld = HTMLParser::StripTags($string_old);

    $patch = $this->Diff($plainOld, $plainNew);
    if (!$patch) {
        return $plainNew;
    }

    $highlightedPatch = preg_replace($this->Statements, $this->Replacements, $patch);

    return $this->Patch($plainOld, $highlightedPatch);
}
/**
 * Change writer type to convert to another format
 *
 * Builds the writer for the requested type, then hands it the parser's
 * field data.
 *
 * @param string $filetype Set the filetype of the file which will be written (XML/CSV/TSV/HTML/JSON)
 */
public function convertTo($filetype)
{
    $this->constructWriter($filetype);

    $parsedFields = $this->parser->getField();
    $this->writer->setData($parsedFields);
}
/**
 * Downloads the source of the given run from acm.hdu.edu.cn, sending the
 * cookies stored in $this->cookie_file, and returns the text found between
 * the `//<ID>` and `</ID>` markers.
 *
 * @param int|string $RunID run id on HDOJ
 * @return mixed whatever HTMLParser::innerHTML() yields for the markers
 */
public function getIdFromSource($RunID)
{
    // Get Source
    $request = curl_init("http://acm.hdu.edu.cn/viewcode.php?rid=" . $RunID);
    curl_setopt_array($request, array(
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_COOKIEFILE     => $this->cookie_file,
    ));
    $sourcePage = curl_exec($request);

    $extractor = new HTMLParser();
    $extractor->loadHTML($sourcePage);

    return $extractor->innerHTML('//<ID>', '</ID>');
}
/**
 * Getting the price match result
 *
 * Looks up $this->sku at $this->base_url and returns one entry for every
 * listed seller whose URL (lower-cased, scheme removed) equals the alias of
 * a known PriceMatchCompany.
 *
 * @return array list of array('PriceMatchCompany', 'price', 'name', 'url')
 */
private function getPrices()
{
    $result = array();

    // Loaded on first use and then reused: the company list does not depend
    // on the row being processed, so it need not be fetched once per row.
    $companies = null;

    $priceMatchResults = HTMLParser::getPriceListForProduct($this->base_url, $this->sku);
    foreach ($priceMatchResults as $priceMatchResult) {
        if (($companyDetails = trim($priceMatchResult['companyDetails'])) === '') {
            continue;
        }

        // Fields are '|' separated: URL is second-to-last, name third-to-last;
        // either falls back to the whole details string when absent.
        $companyDetailsArray = explode('|', $companyDetails);
        $fieldCount = count($companyDetailsArray);

        $companyURL = isset($companyDetailsArray[$fieldCount - 2]) ? trim($companyDetailsArray[$fieldCount - 2]) : trim($companyDetails);
        $companyURL = str_replace(array('https://', 'http://'), '', strtolower($companyURL));

        $name = isset($companyDetailsArray[$fieldCount - 3]) ? trim($companyDetailsArray[$fieldCount - 3]) : trim($companyDetails);
        $price = str_replace(array(',', '$', ' '), '', $priceMatchResult['price']);
        $url = HTMLParser::getHostUrl($this->base_url) . $priceMatchResult['priceLink'];

        if ($companies === null) {
            $companies = PriceMatchCompany::getAll();
        }

        foreach ($companies as $company) {
            if ($companyURL === strtolower($company->getCompanyAlias())) {
                $result[] = array('PriceMatchCompany' => $company, 'price' => $price, 'name' => $name, 'url' => $url);
                if ($this->debug === true) {
                    echo $company->getCompanyName() . '(id=' . $company->getId() . "), \$" . $price . "\n";
                }
            }
        }
    }

    return $result;
}
/**
 * Parse personal details from content
 *
 * Each "label: value" table row of $this->Result is matched on a
 * depth-annotated copy of the markup (HTMLParser::AddTagDepth), so the value
 * cell can be closed by the </td> carrying the same depth number.
 *
 * @return array profile details
 * sample
 * [userpic] => http://www.livejournal.com/userpic/38353247/8981002
 * [name] => Natalie
 * [website] => http://www.myspace.com/aggressiva
 * [city] => La Verne
 * [state] => California
 * [country] => United States
 * [birthday] => 1971-12-15
 * [aboutme] => party ... I just believe in parties!
 */
function GetPersonalDetails()
{
    $details = array();

    $pattern = '/><[0-9]*b>(user|name|website|location|birthdate|gizmo\/lj talk|bio|e\-mail)\:<[0-9]*\/b><[0-9]*\/td><([0-9]*)td[^>]*>(.*?)<\\2\/td>/msi';

    $this->Result = HTMLParser::AddTagDepth($this->Result);
    preg_match_all($pattern, $this->Result, $matches, PREG_SET_ORDER);
    $this->Result = HTMLParser::RemoveTagDepth($this->Result);

    foreach ($matches as $match) {
        $value = HTMLParser::RemoveTagDepth(trim($match[3]));

        // The pattern is case-insensitive, so the label is normalised before
        // dispatching; otherwise "name:" or "NAME:" would match the regex and
        // then be silently dropped by the switch.
        switch (strtolower($match[1])) {
            case 'name':
                $details['name'] = $value;
                break;
            case 'website':
                if (preg_match("/href=(\'|\")(.*?)\\1/", $value, $match2)) {
                    $details['website'] = $match2[2];
                }
                break;
            case 'location':
                preg_match_all("/loc\_(ci|st|cn)\=[^>\&]+>(.*?)</msi", $value, $match2, PREG_SET_ORDER);
                foreach ($match2 as $res) {
                    // Same normalisation: this pattern is case-insensitive too.
                    $part = strtolower($res[1]);
                    if ($part == 'ci') $details['city'] = $res[2];
                    elseif ($part == 'st') $details['state'] = $res[2];
                    elseif ($part == 'cn') $details['country'] = $res[2];
                }
                break;
            case 'bio':
                $details['aboutme'] = $value;
                break;
            case 'birthdate':
                $details['birthday'] = $value;
                break;
            case 'e-mail':
                $details['email'] = HTMLParser::StripTags($value);
                break;
        }
    }

    if (preg_match("/http\:\/\/([a-z0-9\-]+\.)+[a-z0-9]{2,6}\/userpic\/[0-9]+\/[0-9]+/msi", $this->Result, $matches)) {
        $details['userpic'] = $matches[0];
    }

    return $details;
}
/**
 * Logs in to acm.hdu.edu.cn, downloads the source of the given run and
 * returns the text found between the `//<ID>` and `</ID>` markers.
 *
 * The cookie file path is kept in $this->cookie_file and the login response
 * body in $this->data.
 *
 * @param int|string $RunID run id on HDOJ
 * @return mixed whatever HTMLParser::innerHTML() yields for the markers
 */
public function getIdFromSource($RunID)
{
    $cookie_file = tempnam("./cookie", "cookie");
    $this->cookie_file = $cookie_file;
    $login_url = "http://acm.hdu.edu.cn/userloginex.php?action=login";

    // NOTE(review): the credential expressions were masked in the reviewed
    // source and did not parse; this assumes they live in $this->user and
    // $this->pass - confirm against the class definition.
    // http_build_query() URL-encodes every value, so special characters in
    // the credentials cannot corrupt the form body.
    $post_fields = http_build_query(array(
        'username' => $this->user,
        'userpass' => $this->pass,
        'login'    => 'Sign In',
    ));

    // Login
    $curl = curl_init($login_url);
    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_file);
    curl_setopt($curl, CURLOPT_POST, 1);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $post_fields);
    $this->data = curl_exec($curl);
    // Closing the handle also flushes the cookie jar to disk before it is read back.
    curl_close($curl);

    // Get Source
    $curl = curl_init("http://acm.hdu.edu.cn/viewcode.php?rid=" . $RunID);
    curl_setopt($curl, CURLOPT_HEADER, 0);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookie_file);
    $src = curl_exec($curl);
    curl_close($curl);

    $th = new HTMLParser();
    $th->loadHTML($src);

    return $th->innerHTML('//<ID>', '</ID>');
}
} } } } public function getHeader() { return $this->pageHeader; } public function getHtmlUrl() { return $this->htmlurl; } public function getImageHolder() { return $this->imageHolder; } public function getBodyHolder() { return $this->bodyHolder; } } $htmlParser = new HTMLParser($_POST["cnnurl"]); // echo $object->getHtmlUrl(); echo $htmlParser->getHeader(); $imagePlaceHolder = $htmlParser->getImageHolder(); echo $imagePlaceHolder[0]; // Saw that only the first position image is relavent in most of the articles. Hence using only 0 instead of the for loop. $bodyHolder = $htmlParser->getBodyHolder(); foreach ($bodyHolder as $content) { echo $content; }