/**
 * Examine the page at $url for a title, description & images.
 *
 * The lookup is done in passes: Open Graph meta tags first, then the <title> element and
 * the meta description, then the first sufficiently long paragraph, and finally the first
 * non-empty paragraph. Images fall back to domGetImages() when no og:image tags are present.
 *
 * Be sure to check the 'Exception' key of the result: every exception raised while validating,
 * fetching or parsing the page is caught here and reported through that key instead of being thrown.
 *
 * @param string $url The url to examine. Only the http and https schemes are accepted.
 * @param int $timeout Passed through as the 'Timeout' option of the ProxyRequest. Default is 3.
 * @param bool $sendCookies Passed through as the 'Cookies' option of the ProxyRequest
 * (whether or not to send browser cookies with the request).
 * @return array Returns an array containing Url, Title, Description, Images (array) and Exception
 * (false, or the message of the exception that was caught while retrieving the page).
 */
function fetchPageInfo($url, $timeout = 3, $sendCookies = false) {
    $PageInfo = array(
        'Url' => $url,
        'Title' => '',
        'Description' => '',
        'Images' => array(),
        'Exception' => false
    );

    try {
        // Make sure the URL is valid.
        $urlParts = parse_url($url);
        if ($urlParts === false || !in_array(val('scheme', $urlParts), array('http', 'https'), true)) {
            throw new Exception('Invalid URL.', 400);
        }

        // Load the HTML parser only if it has not been loaded already.
        if (!defined('HDOM_TYPE_ELEMENT')) {
            require_once PATH_LIBRARY . '/vendors/simplehtmldom/simple_html_dom.php';
        }

        $Request = new ProxyRequest();
        $PageHtml = $Request->Request(array(
            'URL' => $url,
            'Timeout' => $timeout,
            'Cookies' => $sendCookies
        ));

        if (!$Request->status()) {
            throw new Exception('Couldn\'t connect to host.', 400);
        }

        $Dom = str_get_html($PageHtml);
        if (!$Dom) {
            throw new Exception('Failed to load page for parsing.');
        }

        // FIRST PASS: Look for open graph title, desc, images
        $PageInfo['Title'] = domGetContent($Dom, 'meta[property=og:title]');

        Trace('Getting og:description');
        $PageInfo['Description'] = domGetContent($Dom, 'meta[property=og:description]');

        foreach ($Dom->find('meta[property=og:image]') as $Image) {
            if (isset($Image->content)) {
                $PageInfo['Images'][] = $Image->content;
            }
        }

        // SECOND PASS: Look in the page for title, desc, images
        if ($PageInfo['Title'] == '') {
            // A page is not guaranteed to have a <title>; only read the text when the
            // lookup actually produced a node, otherwise leave the title as it is.
            $TitleNode = $Dom->find('title', 0);
            if (is_object($TitleNode)) {
                $PageInfo['Title'] = $TitleNode->plaintext;
            }
        }

        if ($PageInfo['Description'] == '') {
            Trace('Getting meta description');
            $PageInfo['Description'] = domGetContent($Dom, 'meta[name=description]');
        }

        // THIRD PASS: Look in the page contents
        if ($PageInfo['Description'] == '') {
            // Take the first paragraph with more than 150 bytes of text.
            foreach ($Dom->find('p') as $element) {
                Trace('Looking at p for description.');
                if (strlen($element->plaintext) > 150) {
                    $PageInfo['Description'] = $element->plaintext;
                    break;
                }
            }

            // Cut overly long paragraphs down to 400.
            if (strlen($PageInfo['Description']) > 400) {
                $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
            }
        }

        // Final: Still nothing? remove limitations
        if ($PageInfo['Description'] == '') {
            foreach ($Dom->find('p') as $element) {
                Trace('Looking at p for description (no restrictions)');
                if (trim($element->plaintext) != '') {
                    $PageInfo['Description'] = $element->plaintext;
                    break;
                }
            }
        }

        // Page Images: only scan the document when no og:image tags were found.
        if (count($PageInfo['Images']) == 0) {
            $Images = domGetImages($Dom, $url);
            $PageInfo['Images'] = array_values($Images);
        }

        $PageInfo['Title'] = htmlEntityDecode($PageInfo['Title']);
        $PageInfo['Description'] = htmlEntityDecode($PageInfo['Description']);
    } catch (Exception $ex) {
        // Report the failure through the result instead of propagating it.
        $PageInfo['Exception'] = $ex->getMessage();
    }

    return $PageInfo;
}
/**
 * Examine the page at $url for a title, description & images.
 *
 * The lookup is done in passes: Open Graph meta tags first, then the <title> element and
 * the meta description, then the first sufficiently long paragraph, and finally the first
 * non-empty paragraph. Images fall back to domGetImages() when no og:image tags are present.
 *
 * Be sure to check the 'Exception' key of the result: every exception raised while validating,
 * fetching or parsing the page is caught here and reported through that key instead of being thrown.
 *
 * @param string $url The url to examine. Only the http and https schemes are accepted.
 * @param int $timeout Passed through as the 'Timeout' option of the ProxyRequest. Default is 3.
 * @param bool $sendCookies Passed through as the 'Cookies' option of the ProxyRequest
 * (whether or not to send browser cookies with the request).
 * @return array Returns an array containing Url, Title, Description, Images (array) and Exception
 * (false, or the message of the exception that was caught while retrieving the page).
 */
function fetchPageInfo($url, $timeout = 3, $sendCookies = false) {
    $PageInfo = array(
        'Url' => $url,
        'Title' => '',
        'Description' => '',
        'Images' => array(),
        'Exception' => false
    );

    try {
        // Make sure the URL is valid.
        $urlParts = parse_url($url);
        if ($urlParts === false || !in_array(val('scheme', $urlParts), array('http', 'https'), true)) {
            throw new Exception('Invalid URL.', 400);
        }

        $Request = new ProxyRequest();
        $PageHtml = $Request->Request(array(
            'URL' => $url,
            'Timeout' => $timeout,
            'Cookies' => $sendCookies,
            'Redirects' => true
        ));

        if (!$Request->status()) {
            throw new Exception('Couldn\'t connect to host.', 400);
        }

        $Dom = pQuery::parseStr($PageHtml);
        if (!$Dom) {
            throw new Exception('Failed to load page for parsing.');
        }

        // FIRST PASS: Look for open graph title, desc, images
        $PageInfo['Title'] = domGetContent($Dom, 'meta[property="og:title"]');

        Trace('Getting og:description');
        $PageInfo['Description'] = domGetContent($Dom, 'meta[property="og:description"]');

        foreach ($Dom->query('meta[property="og:image"]') as $Image) {
            $ImageUrl = $Image->attr('content');
            if ($ImageUrl) {
                $PageInfo['Images'][] = $ImageUrl;
            }
        }

        // SECOND PASS: Look in the page for title, desc, images
        if ($PageInfo['Title'] == '') {
            $PageInfo['Title'] = $Dom->query('title')->text();
        }

        if ($PageInfo['Description'] == '') {
            Trace('Getting meta description');
            $PageInfo['Description'] = domGetContent($Dom, 'meta[name="description"]');
        }

        // THIRD PASS: Look in the page contents
        if ($PageInfo['Description'] == '') {
            // Take the first paragraph with more than 150 bytes of text. The length check is
            // made against the same text() value that is stored, so the test and the result agree.
            foreach ($Dom->query('p') as $element) {
                Trace('Looking at p for description.');
                $ParagraphText = $element->text();
                if (strlen($ParagraphText) > 150) {
                    $PageInfo['Description'] = $ParagraphText;
                    break;
                }
            }

            // Cut overly long paragraphs down to 400.
            if (strlen($PageInfo['Description']) > 400) {
                $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
            }
        }

        // Final: Still nothing? remove limitations
        if ($PageInfo['Description'] == '') {
            foreach ($Dom->query('p') as $element) {
                Trace('Looking at p for description (no restrictions)');
                $ParagraphText = $element->text();
                if (trim($ParagraphText) != '') {
                    $PageInfo['Description'] = $ParagraphText;
                    break;
                }
            }
        }

        // Page Images: only scan the document when no og:image tags were found.
        if (count($PageInfo['Images']) == 0) {
            $Images = domGetImages($Dom, $url);
            $PageInfo['Images'] = array_values($Images);
        }

        $PageInfo['Title'] = htmlEntityDecode($PageInfo['Title']);
        $PageInfo['Description'] = htmlEntityDecode($PageInfo['Description']);
    } catch (Exception $ex) {
        // Report the failure through the result instead of propagating it.
        $PageInfo['Exception'] = $ex->getMessage();
    }

    return $PageInfo;
}