Пример #1
0
 /**
  * Examines the page at $Url for title, description & images.
  *
  * Open Graph meta tags are preferred. When they are missing the function falls
  * back to the <title> element, the meta description and finally the text of
  * the page's <p> elements. Any Exception thrown while retrieving or parsing
  * the page is caught and its message is stored under the 'Exception' key, so
  * be sure to check the resultant array for it.
  *
  * @param string $Url The url to examine.
  * @param integer $Timeout How long to allow for this request; handed straight to ProxyRequest(). Default is 3.
  * @return array an array containing Url, Title, Description, Images (array) and Exception
  * (FALSE on success, otherwise the message of the exception that occurred while retrieving the page).
  */
 function FetchPageInfo($Url, $Timeout = 3)
 {
     $PageInfo = array('Url' => $Url, 'Title' => '', 'Description' => '', 'Images' => array(), 'Exception' => FALSE);
     try {
         // Only pull in the parser library if it has not been loaded already.
         if (!defined('HDOM_TYPE_ELEMENT')) {
             require_once PATH_LIBRARY . '/vendors/simplehtmldom/simple_html_dom.php';
         }
         $PageHtml = ProxyRequest($Url, $Timeout, TRUE);
         $Dom = str_get_html($PageHtml);
         if (!$Dom) {
             throw new Exception('Failed to load page for parsing.');
         }
         /* Sample Facebook Open Graph code:

         <meta property="og:title" content="60 degrees in&nbsp;February" />
         <meta property="og:url" content="http://karinemily.wordpress.com/2012/02/02/60-degrees-in-february/" />
         <meta property="og:description" content="and Philadelphia explodes with babies, puppies, and hipsters." />
         <meta property="og:site_name" content="K a r i &#039; s" />
         <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0132.jpg?w=300&amp;h=300" />
         <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0214.jpg?w=300&amp;h=300" />

         */
         // FIRST PASS: Look for open graph title, desc, images
         $PageInfo['Title'] = DomGetContent($Dom, 'meta[property=og:title]');
         Trace('Getting og:description');
         $PageInfo['Description'] = DomGetContent($Dom, 'meta[property=og:description]');
         foreach ($Dom->find('meta[property=og:image]') as $Image) {
             if (isset($Image->content)) {
                 $PageInfo['Images'][] = $Image->content;
             }
         }
         // SECOND PASS: Look in the page for title, desc, images
         if ($PageInfo['Title'] == '') {
             // The page may have no <title> element at all; only read the
             // text when the lookup actually produced a node.
             $TitleNode = $Dom->find('title', 0);
             if ($TitleNode) {
                 $PageInfo['Title'] = $TitleNode->plaintext;
             }
         }
         if ($PageInfo['Description'] == '') {
             Trace('Getting meta description');
             $PageInfo['Description'] = DomGetContent($Dom, 'meta[name=description]');
         }
         // THIRD PASS: Look in the page contents
         if ($PageInfo['Description'] == '') {
             // Take the first paragraph that is long enough (> 150 bytes) to
             // plausibly be body copy rather than a caption or byline.
             foreach ($Dom->find('p') as $element) {
                 Trace('Looking at p for description.');
                 if (strlen($element->plaintext) > 150) {
                     $PageInfo['Description'] = $element->plaintext;
                     break;
                 }
             }
             // Keep overly long paragraphs to a manageable size.
             if (strlen($PageInfo['Description']) > 400) {
                 $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
             }
         }
         // Final: Still nothing? remove limitations
         if ($PageInfo['Description'] == '') {
             foreach ($Dom->find('p') as $element) {
                 Trace('Looking at p for description (no restrictions)');
                 if (trim($element->plaintext) != '') {
                     $PageInfo['Description'] = $element->plaintext;
                     break;
                 }
             }
         }
         // Page Images: only scan the document when Open Graph supplied none.
         if (count($PageInfo['Images']) == 0) {
             $Images = DomGetImages($Dom, $Url);
             // Re-index so callers always receive a plain 0..n-1 list.
             $PageInfo['Images'] = array_values($Images);
         }
         $PageInfo['Title'] = HtmlEntityDecode($PageInfo['Title']);
         $PageInfo['Description'] = HtmlEntityDecode($PageInfo['Description']);
     } catch (Exception $ex) {
         // Report the failure to the caller instead of letting it propagate.
         $PageInfo['Exception'] = $ex->getMessage();
     }
     return $PageInfo;
 }
Пример #2
0
 /**
  * Remove html entities from a column in the database.
  *
  * Selects up to $Limit rows whose $Column value matches the LIKE pattern
  * '&%;' (i.e. looks like it contains an entity), decodes each value with the
  * global HtmlEntityDecode() function and writes back the ones that changed.
  *
  * @param string $Table The name of the table.
  * @param string $Column The column to decode.
  * @param int $Limit The number of records to work on.
  * @return array An array with the keys:
  *  - Count: the number of rows examined in this call.
  *  - Complete: TRUE when fewer than $Limit rows were found.
  *  - Decoded: a list of array('From' => ..., 'To' => ...) for every row that was updated.
  *  - NotDecoded: a list of array('From' => ..., 'To' => ...) for every row left unchanged.
  */
 public function HtmlEntityDecode($Table, $Column, $Limit = 100)
 {
     // Construct a model to save the results.
     $Model = $this->CreateModel($Table);
     // Get the data to decode.
     $Data = $this->SQL
         ->Select($Model->PrimaryKey)
         ->Select($Column)
         ->From($Table)
         ->Like($Column, '&%;', 'both')
         ->Limit($Limit)
         ->Get()
         ->ResultArray();
     $Result = array();
     $Result['Count'] = count($Data);
     $Result['Complete'] = FALSE;
     $Result['Decoded'] = array();
     $Result['NotDecoded'] = array();
     // Loop through each row in the working set and decode the values.
     foreach ($Data as $Row) {
         $Value = $Row[$Column];
         // Unqualified call: resolves to the global function, not this method.
         $DecodedValue = HtmlEntityDecode($Value);
         $Item = array('From' => $Value, 'To' => $DecodedValue);
         if ($Value != $DecodedValue) {
             $Model->SetField($Row[$Model->PrimaryKey], $Column, $DecodedValue);
             // Append so every processed row is reported, not just the last one.
             $Result['Decoded'][] = $Item;
         } else {
             $Result['NotDecoded'][] = $Item;
         }
     }
     // A short page means the query found no further matching rows.
     $Result['Complete'] = $Result['Count'] < $Limit;
     return $Result;
 }