Example #1
0
 /**
  * Examine the page at $url for a title, a description and images.
  *
  * Open graph meta tags are preferred. When they are missing the function falls back to the
  * <title> element, the meta description, and finally the text of the page's <p> elements.
  *
  * Nothing is thrown to the caller: any Exception raised while validating, fetching or parsing
  * is caught and its message is stored under the 'Exception' key, so be sure to check that key.
  *
  * @param string $url The url to examine. Only http and https urls are accepted.
  * @param integer $timeout How long to allow for this request; handed to ProxyRequest as 'Timeout'.
  * Default is 3.
  * @param bool $sendCookies Whether or not to send browser cookies with the request.
  * @return array Returns an array containing Url, Title, Description, Images (array) and Exception
  * (false, or the error message if there were problems retrieving the page).
  */
 function fetchPageInfo($url, $timeout = 3, $sendCookies = false)
 {
     $PageInfo = array('Url' => $url, 'Title' => '', 'Description' => '', 'Images' => array(), 'Exception' => false);
     try {
         // Make sure the URL is valid.
         $urlParts = parse_url($url);
         if ($urlParts === false || !in_array(val('scheme', $urlParts), array('http', 'https'), true)) {
             throw new Exception('Invalid URL.', 400);
         }
         // Only load the parser library if it has not been loaded already.
         if (!defined('HDOM_TYPE_ELEMENT')) {
             require_once PATH_LIBRARY . '/vendors/simplehtmldom/simple_html_dom.php';
         }
         $Request = new ProxyRequest();
         $PageHtml = $Request->Request(array('URL' => $url, 'Timeout' => $timeout, 'Cookies' => $sendCookies));
         if (!$Request->status()) {
             throw new Exception('Couldn\'t connect to host.', 400);
         }
         $Dom = str_get_html($PageHtml);
         if (!$Dom) {
             throw new Exception('Failed to load page for parsing.');
         }
         // FIRST PASS: Look for open graph title, desc, images
         $PageInfo['Title'] = domGetContent($Dom, 'meta[property=og:title]');
         Trace('Getting og:description');
         $PageInfo['Description'] = domGetContent($Dom, 'meta[property=og:description]');
         foreach ($Dom->find('meta[property=og:image]') as $Image) {
             if (isset($Image->content)) {
                 $PageInfo['Images'][] = $Image->content;
             }
         }
         // SECOND PASS: Look in the page for title, desc, images
         if ($PageInfo['Title'] == '') {
             // find() with an index yields null when the page has no <title>, so guard
             // before reading the node instead of dereferencing null.
             $TitleNode = $Dom->find('title', 0);
             if ($TitleNode !== null) {
                 $PageInfo['Title'] = $TitleNode->plaintext;
             }
         }
         if ($PageInfo['Description'] == '') {
             Trace('Getting meta description');
             $PageInfo['Description'] = domGetContent($Dom, 'meta[name=description]');
         }
         // THIRD PASS: Look in the page contents
         if ($PageInfo['Description'] == '') {
             // Take the first paragraph that is long enough to read like body copy.
             foreach ($Dom->find('p') as $element) {
                 Trace('Looking at p for description.');
                 if (strlen($element->plaintext) > 150) {
                     $PageInfo['Description'] = $element->plaintext;
                     break;
                 }
             }
             if (strlen($PageInfo['Description']) > 400) {
                 $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
             }
         }
         // Final: Still nothing? remove limitations
         if ($PageInfo['Description'] == '') {
             // Accept the first paragraph that has any non-whitespace text at all.
             foreach ($Dom->find('p') as $element) {
                 Trace('Looking at p for description (no restrictions)');
                 if (trim($element->plaintext) != '') {
                     $PageInfo['Description'] = $element->plaintext;
                     break;
                 }
             }
         }
         // Page Images: only scan the document when no og:image tags were found.
         if (count($PageInfo['Images']) == 0) {
             $Images = domGetImages($Dom, $url);
             $PageInfo['Images'] = array_values($Images);
         }
         $PageInfo['Title'] = htmlEntityDecode($PageInfo['Title']);
         $PageInfo['Description'] = htmlEntityDecode($PageInfo['Description']);
     } catch (Exception $ex) {
         // Report the failure through the result array rather than letting it propagate.
         $PageInfo['Exception'] = $ex->getMessage();
     }
     return $PageInfo;
 }
Example #2
0
 /**
  * Examine the page at $url for a title, a description and images.
  *
  * Open graph meta tags are preferred. When they are missing the function falls back to the
  * <title> element, the meta description, and finally the text of the page's <p> elements.
  *
  * Nothing is thrown to the caller: any Exception raised while validating, fetching or parsing
  * is caught and its message is stored under the 'Exception' key, so be sure to check that key.
  *
  * @param string $url The url to examine. Only http and https urls are accepted.
  * @param integer $timeout How long to allow for this request; handed to ProxyRequest as 'Timeout'.
  * Default is 3.
  * @param bool $sendCookies Whether or not to send browser cookies with the request.
  * @return array Returns an array containing Url, Title, Description, Images (array) and Exception
  * (false, or the error message if there were problems retrieving the page).
  */
 function fetchPageInfo($url, $timeout = 3, $sendCookies = false)
 {
     $PageInfo = array('Url' => $url, 'Title' => '', 'Description' => '', 'Images' => array(), 'Exception' => false);
     try {
         // Make sure the URL is valid.
         $urlParts = parse_url($url);
         if ($urlParts === false || !in_array(val('scheme', $urlParts), array('http', 'https'), true)) {
             throw new Exception('Invalid URL.', 400);
         }
         $Request = new ProxyRequest();
         $PageHtml = $Request->Request(array('URL' => $url, 'Timeout' => $timeout, 'Cookies' => $sendCookies, 'Redirects' => true));
         if (!$Request->status()) {
             throw new Exception('Couldn\'t connect to host.', 400);
         }
         $Dom = pQuery::parseStr($PageHtml);
         if (!$Dom) {
             throw new Exception('Failed to load page for parsing.');
         }
         // FIRST PASS: Look for open graph title, desc, images
         $PageInfo['Title'] = domGetContent($Dom, 'meta[property="og:title"]');
         Trace('Getting og:description');
         $PageInfo['Description'] = domGetContent($Dom, 'meta[property="og:description"]');
         foreach ($Dom->query('meta[property="og:image"]') as $Image) {
             $ImageUrl = $Image->attr('content');
             if ($ImageUrl) {
                 $PageInfo['Images'][] = $ImageUrl;
             }
         }
         // SECOND PASS: Look in the page for title, desc, images
         if ($PageInfo['Title'] == '') {
             $PageInfo['Title'] = $Dom->query('title')->text();
         }
         if ($PageInfo['Description'] == '') {
             Trace('Getting meta description');
             $PageInfo['Description'] = domGetContent($Dom, 'meta[name="description"]');
         }
         // THIRD PASS: Look in the page contents
         if ($PageInfo['Description'] == '') {
             // Take the first paragraph that is long enough to read like body copy.
             foreach ($Dom->query('p') as $element) {
                 Trace('Looking at p for description.');
                 // Measure the node through text(), as everywhere else in this function;
                 // ->plaintext is the simple_html_dom property and does not give the text here.
                 $ParagraphText = $element->text();
                 if (strlen($ParagraphText) > 150) {
                     $PageInfo['Description'] = $ParagraphText;
                     break;
                 }
             }
             if (strlen($PageInfo['Description']) > 400) {
                 $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
             }
         }
         // Final: Still nothing? remove limitations
         if ($PageInfo['Description'] == '') {
             // Accept the first paragraph that has any non-whitespace text at all.
             foreach ($Dom->query('p') as $element) {
                 Trace('Looking at p for description (no restrictions)');
                 $ParagraphText = $element->text();
                 if (trim($ParagraphText) != '') {
                     $PageInfo['Description'] = $ParagraphText;
                     break;
                 }
             }
         }
         // Page Images: only scan the document when no og:image tags were found.
         if (count($PageInfo['Images']) == 0) {
             $Images = domGetImages($Dom, $url);
             $PageInfo['Images'] = array_values($Images);
         }
         $PageInfo['Title'] = htmlEntityDecode($PageInfo['Title']);
         $PageInfo['Description'] = htmlEntityDecode($PageInfo['Description']);
     } catch (Exception $ex) {
         // Report the failure through the result array rather than letting it propagate.
         $PageInfo['Exception'] = $ex->getMessage();
     }
     return $PageInfo;
 }