/**
 * Examines the page at $Url for title, description & images.
 *
 * The lookup runs in passes: Open Graph meta tags first, then the <title>
 * element and the meta description, then the first paragraph longer than
 * 150 characters, and finally the first non-empty paragraph. Images found on
 * the page are only collected when no og:image tags were present.
 *
 * Be sure to check the resultant array for any Exception that occurred while
 * retrieving the page.
 *
 * @param string $Url The url to examine.
 * @param integer $Timeout How long to allow for this request. Default is 3.
 * The value is handed to ProxyRequest() unchanged, so its exact meaning
 * (units, whether 0 means "never time out") is decided there.
 * @return array An array containing Url, Title, Description, Images (array)
 * and Exception (FALSE, or the exception message if there were problems
 * retrieving or parsing the page).
 */
function FetchPageInfo($Url, $Timeout = 3) {
    $PageInfo = array(
        'Url' => $Url,
        'Title' => '',
        'Description' => '',
        'Images' => array(),
        'Exception' => FALSE
    );

    try {
        // HDOM_TYPE_ELEMENT is used as the marker that simple_html_dom is already loaded.
        if (!defined('HDOM_TYPE_ELEMENT')) {
            require_once PATH_LIBRARY . '/vendors/simplehtmldom/simple_html_dom.php';
        }

        $PageHtml = ProxyRequest($Url, $Timeout, TRUE);
        $Dom = str_get_html($PageHtml);
        if (!$Dom) {
            throw new Exception('Failed to load page for parsing.');
        }

        /* Sample Facebook Open Graph code:

        <meta property="og:title" content="60 degrees in February" />
        <meta property="og:url" content="http://karinemily.wordpress.com/2012/02/02/60-degrees-in-february/" />
        <meta property="og:description" content="and Philadelphia explodes with babies, puppies, and hipsters." />
        <meta property="og:site_name" content="K a r i ' s" />
        <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0132.jpg?w=300&h=300" />
        <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0214.jpg?w=300&h=300" />
        <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0213.jpg?w=300&h=300" />
        <meta property="og:image" content="http://karinemily.files.wordpress.com/2012/02/dsc_0221-version-2.jpg?w=300&h=300" />

        */

        // FIRST PASS: Look for open graph title, desc, images
        $PageInfo['Title'] = DomGetContent($Dom, 'meta[property=og:title]');

        Trace('Getting og:description');
        $PageInfo['Description'] = DomGetContent($Dom, 'meta[property=og:description]');

        foreach ($Dom->find('meta[property=og:image]') as $Image) {
            if (isset($Image->content)) {
                $PageInfo['Images'][] = $Image->content;
            }
        }

        // SECOND PASS: Look in the page for title, desc, images
        if ($PageInfo['Title'] == '') {
            // A page without a <title> element yields no node here; guard the
            // lookup so we do not read ->plaintext from a missing element.
            $TitleElement = $Dom->find('title', 0);
            if ($TitleElement) {
                $PageInfo['Title'] = $TitleElement->plaintext;
            }
        }

        if ($PageInfo['Description'] == '') {
            Trace('Getting meta description');
            $PageInfo['Description'] = DomGetContent($Dom, 'meta[name=description]');
        }

        // THIRD PASS: Look in the page contents
        if ($PageInfo['Description'] == '') {
            foreach ($Dom->find('p') as $element) {
                Trace('Looking at p for description.');
                // Only a reasonably long paragraph counts as a description in this pass.
                if (strlen($element->plaintext) > 150) {
                    $PageInfo['Description'] = $element->plaintext;
                    break;
                }
            }

            // Cap a paragraph-derived description at 400 characters.
            if (strlen($PageInfo['Description']) > 400) {
                $PageInfo['Description'] = SliceParagraph($PageInfo['Description'], 400);
            }
        }

        // Final: Still nothing? remove limitations
        if ($PageInfo['Description'] == '') {
            foreach ($Dom->find('p') as $element) {
                Trace('Looking at p for description (no restrictions)');
                if (trim($element->plaintext) != '') {
                    $PageInfo['Description'] = $element->plaintext;
                    break;
                }
            }
        }

        // Page Images: only scan the page when Open Graph supplied none.
        if (count($PageInfo['Images']) == 0) {
            $Images = DomGetImages($Dom, $Url);
            $PageInfo['Images'] = array_values($Images);
        }

        $PageInfo['Title'] = HtmlEntityDecode($PageInfo['Title']);
        $PageInfo['Description'] = HtmlEntityDecode($PageInfo['Description']);
    } catch (Exception $ex) {
        // Report the failure through the result instead of propagating it.
        $PageInfo['Exception'] = $ex->getMessage();
    }

    return $PageInfo;
}
/**
 * Remove html entities from a column in the database.
 *
 * Selects up to $Limit rows whose $Column is LIKE-matched against '&%;'
 * (with the 'both' option), decodes each value with the HtmlEntityDecode()
 * function (the plain function, not this method) and saves the rows whose
 * value actually changed.
 *
 * @param string $Table The name of the table.
 * @param string $Column The column to decode.
 * @param int $Limit The number of records to work on.
 * @return array An array with the keys:
 *  - Count: the number of rows in the working set.
 *  - Complete: TRUE when fewer than $Limit rows were found.
 *  - Decoded: a list of array('From' => ..., 'To' => ...) items for rows that were updated.
 *  - NotDecoded: a list of the same items for rows that decoding left unchanged.
 */
public function HtmlEntityDecode($Table, $Column, $Limit = 100) {
    // Construct a model to save the results.
    $Model = $this->CreateModel($Table);

    // Get the data to decode.
    $Data = $this->SQL
        ->Select($Model->PrimaryKey)
        ->Select($Column)
        ->From($Table)
        ->Like($Column, '&%;', 'both')
        ->Limit($Limit)
        ->Get()
        ->ResultArray();

    $Result = array(
        'Count' => count($Data),
        'Complete' => FALSE,
        'Decoded' => array(),
        'NotDecoded' => array()
    );

    // Loop through each row in the working set and decode the values.
    foreach ($Data as $Row) {
        $Value = $Row[$Column];
        $DecodedValue = HtmlEntityDecode($Value);

        $Item = array('From' => $Value, 'To' => $DecodedValue);

        if ($Value != $DecodedValue) {
            $Model->SetField($Row[$Model->PrimaryKey], $Column, $DecodedValue);
            // Append rather than assign, so every processed row is reported
            // instead of only the last one.
            $Result['Decoded'][] = $Item;
        } else {
            $Result['NotDecoded'][] = $Item;
        }
    }

    // A short working set means there was nothing left beyond this batch.
    $Result['Complete'] = $Result['Count'] < $Limit;

    return $Result;
}