/**
 * Builds the table listing every entry of $this->_domains.
 *
 * Each row shows the domain as a link, its price, source, month, year and a
 * "Check" button carrying the whois endpoint in its data-sendretrieve
 * attribute. Numeric months are rendered as full month names.
 *
 * @return mixed Result of Div::create() wrapping the AdSense leaderboard and the table
 */
protected function _prepareDomainsTable()
{
    $table = new Table();
    $table->addClass('table', 'table-striped', 'tablesorter');

    $row = new TableRow();
    $row->setContent([
        TableHeading::create('Domain'),
        TableHeading::create('Price'),
        TableHeading::create('Source'),
        TableHeading::create('Month'),
        TableHeading::create('Year'),
        TableHeading::create('Status'),
    ]);

    $thead = new TableHead();
    $thead->setContent($row);

    $tbody = new TableBody();
    foreach ($this->_domains as $item) {
        $month = $item->month;

        // trim() with an empty character list removes nothing, so use the
        // default list to really strip surrounding whitespace.
        $domain = strtolower(trim($item->domain));

        if (is_numeric($item->month)) {
            // The (int) cast already discards leading zeros ("05" => 5).
            $dateObj = \DateTime::createFromFormat('!m', (string) (int) $item->month);

            // createFromFormat() returns false on unparsable input; keep the
            // raw month value in that case instead of failing on ->format().
            if ($dateObj !== false) {
                $month = $dateObj->format('F');
            }
        }

        $whois = Span::create('Check')->addClass('btn', 'btn-success', 'btn-xs');
        $whois->setAttribute('data-sendretrieve', '/domains/whois/' . $domain);

        $domainLink = new Link('http://www.' . $domain, $domain);

        $row = TableRow::create();
        $row->appendContent([
            TableCell::create($domainLink),
            TableCell::create(Encoding::toUTF8($item->price)),
            TableCell::create($this->_filterSource($item->source)),
            TableCell::create($month),
            TableCell::create($item->year),
            TableCell::create($whois),
        ]);
        $tbody->appendContent($row);
    }

    $table->prependContent($thead);
    $table->appendContent($tbody);

    return Div::create([GoogleAdsense::leaderboard(), $table]);
}
/**
 * Tells whether at least one row is available, refilling the row buffer
 * from the result set first when it is below DBIterator::RECORD_BUFFER.
 *
 * @return bool
 */
public function hasNext()
{
    if (count($this->_rowBuffer) >= DBIterator::RECORD_BUFFER) {
        return true;
    }

    // Keep pulling rows until the buffer is full or the result set runs dry.
    while (!is_null($this->_rs) && count($this->_rowBuffer) < DBIterator::RECORD_BUFFER) {
        $record = $this->_rs->fetch(PDO::FETCH_ASSOC);

        if (!$record) {
            // Nothing left: release the cursor and forget the result set.
            $this->_rs->closeCursor();
            $this->_rs = null;
            break;
        }

        foreach ($record as $column => $cell) {
            if (is_null($cell)) {
                $record[$column] = "";
            } elseif (is_object($cell)) {
                $record[$column] = "[OBJECT]";
            } else {
                $record[$column] = Encoding::toUTF8($cell);
            }
        }

        // Queue the record
        array_push($this->_rowBuffer, new SingleRow($record));
    }

    return count($this->_rowBuffer) > 0;
}
/**
 * Reads the CSV file given as the "csv-file" argument and prints it as a
 * pipe-delimited text table with a dashed separator under the first row.
 *
 * Every column is padded to the width of its longest cell. Blank lines are
 * skipped and rows with fewer fields than others are filled with empty
 * cells so that all cells stay in their own column.
 *
 * Note: the file is split on "\n" before CSV parsing, so quoted fields that
 * contain line breaks are not supported.
 *
 * @param InputInterface  $input  Provides the csv-file argument and the
 *                                delimiter, enclosure and escape options
 * @param OutputInterface $output Used for error messages
 *
 * @return void
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    //get formatter
    $formatter = $this->getHelper('formatter');
    $file = $input->getArgument('csv-file');

    //check if file exists
    if (!file_exists($file)) {
        $output->writeln($formatter->formatBlock(['[Error]', 'File ' . $file . ' not found.'], 'error', true));
        return;
    }

    //get options
    $delimiter = $input->getOption('delimiter');
    $enclosure = $input->getOption('enclosure');
    $escape = $input->getOption('escape');

    //get content
    $content = file_get_contents($file);
    if ($content === false) {
        $output->writeln($formatter->formatBlock(['[Error]', 'File ' . $file . ' could not be read.'], 'error', true));
        return;
    }

    //remove windows line breaks
    $content = str_replace("\r", '', $content);

    //split lines
    $lines = explode("\n", $content);

    //cells grouped by column, indexed by row number so ragged rows stay aligned
    $columns = [];
    $rowCount = 0;
    foreach ($lines as $line) {
        //skip blank lines (typically the one after the trailing newline)
        if ($line === '') {
            continue;
        }

        $csv = str_getcsv($line, $delimiter, $enclosure, $escape);
        foreach ($csv as $key => $field) {
            $columns[$key][$rowCount] = trim((string) $field);
        }
        $rowCount++;
    }

    $rows = [];

    //loop over columns
    foreach ($columns as $columnKey => $column) {
        //get max strlen
        $max = StringUtil::maxStrlen($column) + 1;

        //make columns equal length, filling cells missing from short rows
        for ($rowId = 0; $rowId < $rowCount; $rowId++) {
            $field = isset($column[$rowId]) ? $column[$rowId] : '';
            $rows[$rowId][$columnKey] = ' ' . str_pad($field, $max, ' ', STR_PAD_RIGHT);
        }
    }

    $table = '';

    //loop over rows
    foreach ($rows as $row) {
        $line = '|' . implode(' | ', $row) . '|';

        //the first row is followed by a dashed separator line
        if ($table == '') {
            $headers = [];
            foreach ($row as $field) {
                $headers[] = str_repeat('-', strlen($field));
            }
            $line .= "\n|" . implode(' | ', $headers) . '|';
        }

        $table .= $line . "\n";
    }

    echo Encoding::toUTF8($table);
}
/**
 * Replaces typographic quotes, dashes and the ellipsis character with plain
 * ASCII, then returns the value UTF-8 encoded, HTML-escaped (quotes
 * included) and trimmed.
 *
 * @param string $s
 * @return string
 */
function clean($s)
{
    // TODO Replace with preg_replace
    // Curly double/single quotes, en dash, em dash and ellipsis.
    $typographic = array('“', '”', '‘', '’', '–', '—', '…');
    $plain = array('"', '"', '\'', '\'', '-', '-', '...');

    $normalized = str_replace($typographic, $plain, $s);
    $escaped = htmlspecialchars(Encoding::toUTF8($normalized), ENT_QUOTES);

    return trim($escaped);
}
/**
 * Disabled temporary (test takes a lot of time)
 *
 * Generates RSA key pairs of 1024, 2048 and 3072 bits with the openssl
 * command line tool and checks that strings, objects and arrays survive an
 * encrypt/decrypt round trip through PayloadCypher.
 */
public function testEncDec()
{
    $pc = new PayloadCypher();
    $keyDir = __DIR__ . '/../../var/test/keys';
    $dataDir = __DIR__ . '/../../var/test/data';
    $privateKeyFile = $keyDir . '/private.pem';
    $publicKeyFile = $keyDir . '/public.pem';

    // Test different key sizes
    for ($x = 1; $x < 4; $x++) {
        // Paths are shell-escaped so a checkout directory containing spaces
        // or shell metacharacters cannot break (or alter) the command.
        $cmd = 'openssl genrsa -out ' . escapeshellarg($privateKeyFile) . ' ' . ($x * 1024);
        system($cmd);
        $cmd = 'openssl rsa -in ' . escapeshellarg($privateKeyFile)
            . ' -outform PEM -pubout -out ' . escapeshellarg($publicKeyFile);
        system($cmd);

        $pc->setOnPublicKeyLoad(function () use ($publicKeyFile) {
            return array(rand(1000, 2000) => file_get_contents($publicKeyFile));
        });
        $pc->setOnPrivateKeyLoad(function ($keyName) use ($privateKeyFile) {
            return array($keyName => file_get_contents($privateKeyFile));
        });

        $utf8Text = file_get_contents($dataDir . '/utf8.txt');

        // Strings: UTF-8 text followed by random binary data
        for ($i = 1; $i < 10; $i++) {
            $payload = $utf8Text . openssl_random_pseudo_bytes(rand(1, 200));
            $cypherText = $pc->encryptString($payload);
            $decrypted = $pc->decryptString($cypherText);
            // expected value first, actual value second
            $this->assertEquals($payload, $decrypted);
        }

        // Objects
        for ($i = 1; $i < 10; $i++) {
            $obj = new stdClass();
            $obj->test0 = 'hallo';
            $obj->test1 = true;
            $obj->test2 = 1.12 * $i;
            $obj->test3 = array(1, 2, 3);
            $obj->utf8 = $utf8Text;
            // Entry to reproduce UTF8 encoding bug like:
            // http://stackoverflow.com/questions/10205722/json-encode-invalid-utf-8-sequence-in-argument
            $obj->invalidUtf8 = Encoding::toUTF8(pack("H*", 'c32e'));
            $cypherText = $pc->objectToEncryptedJson($obj);
            $decrypted = $pc->encryptedJsonToObject($cypherText);
            $this->assertEquals($obj, $decrypted);
        }

        // Arrays
        for ($i = 1; $i < 10; $i++) {
            $arr = array('test1' => 123, 'test2' => 10.1, 'utf8' => file_get_contents($dataDir . '/utf8.txt'));
            $cypherText = $pc->arrayToEncryptedJson($arr);
            $decrypted = $pc->encryptedJsonToArray($cypherText);
            $this->assertEquals($arr, $decrypted);
        }
    }

    unlink($privateKeyFile);
    unlink($publicKeyFile);
}
/**
 * Repairs the known markup problems of ViewVC HTML output and parses the
 * result as XML.
 *
 * @param string $html Raw HTML
 * @return SimpleXMLElement
 */
public function loadXmlString($html)
{
    // log output often uses garbled ISO-8859-1 and UTF-8 encodings
    $markup = \ForceUTF8\Encoding::toUTF8($html);

    // fix invalid markup of outdated ViewVC versions:
    // - help link in footer not terminated
    // - selected branch/tag in CVS "sticky tag" dropdown has no attribute value
    $markup = str_replace(
        array('Help</strong></td>', 'selected>'),
        array('Help</a></strong></td>', 'selected="selected">'),
        $markup
    );

    // - self closing elements with no trailing slash
    $markup = preg_replace('#<((?:input|br|hr|img)[^\\/\\>]*)>#', '<$1 />', $markup);

    // - remove navheader because of its often incomplete form tags
    $markup = preg_replace('#\\<div class\\=\\"vc_navheader\\"\\>.*?\\<\\/div\\>#s', '', $markup);

    // replace named HTML entities with their UTF-8 value
    $entityNames = array_values($this->entities);
    $entityChars = array_keys($this->entities);
    $markup = str_replace($entityNames, $entityChars, $markup);

    // clean up namespace declaration
    $markup = str_replace('xmlns="', 'ns="', $markup);

    return new SimpleXMLElement($markup);
}
/**
 * Replaces typographic quotes, dashes and the ellipsis character with plain
 * ASCII, then returns the value UTF-8 encoded, HTML-escaped (quotes
 * included) and trimmed. Arrays are cleaned element by element, keys kept.
 *
 * @param string|array $s
 * @return string|array
 */
function clean($s)
{
    if (is_array($s)) {
        // array_map() with a single array preserves the keys.
        return array_map(__FUNCTION__, $s);
    }

    // TODO Replace with preg_replace
    // Curly double/single quotes, en dash, em dash and ellipsis.
    $typographic = array('“', '”', '‘', '’', '–', '—', '…');
    $plain = array('"', '"', '\'', '\'', '-', '-', '...');

    $normalized = str_replace($typographic, $plain, $s);
    $escaped = htmlspecialchars(Encoding::toUTF8($normalized), ENT_QUOTES);

    return trim($escaped);
}
/**
 * Assembles the message array that is dispatched to a STOMP queue.
 *
 * The result combines the transaction-level values with the normalized
 * data restricted to the fields listed by the data object, stamps it with
 * the current time and forces everything to UTF-8.
 *
 * @return array Pass this return array to STOMP :)
 *
 * TODO: Stop saying "STOMP".
 */
protected function getStompTransaction()
{
    $definitive = array(
        'gateway_txn_id' => $this->getTransactionGatewayTxnID(),
        'response' => $this->getTransactionMessage(),
        'correlation-id' => $this->getCorrelationID(),
        'php-message-class' => 'SmashPig\\CrmLink\\Messages\\DonationInterfaceMessage',
        'gateway_account' => $this->account_name,
    );

    // Add the rest of the relevant data
    // FIXME: This is "normalized" data. We should refer to it as such,
    // and rename the getData_Unstaged_Escaped function.
    $normalized = $this->getData_Unstaged_Escaped();
    $messageFields = array_flip($this->dataObj->getMessageFields());
    $relevant = array_intersect_key($normalized, $messageFields);

    // The order here is important: on key collisions the later array wins,
    // so the transaction-level values override the normalized data.
    $message = array_merge($relevant, $definitive);

    // FIXME: Note that we're not using any existing date or ts fields. Why is that?
    $message['date'] = time();

    // Force any incorrect encoding to UTF-8.
    // FIXME: Move down to the PHP-Queue library
    return Encoding::toUTF8($message);
}
$html_link = '<a href="%s" rel="nofollow" target="_blank" data-type="%s" data-toggle="tooltip" title="%s">%s</a>'; $link_formated = sprintf($html_link, $link['expanded_url'], $link['type'], $link['expanded_url'], $link['url']); $message_html = str_replace($link['url'], $link_formated, $message_html); } } if (!empty($medias)) { foreach ($medias as $media) { $html_media = '<a href="%s" rel="nofollow" target="_blank" data-type="%s" data-toggle="tooltip" title="%s">%s</a>'; $media_formated = sprintf($html_media, $media['media_url'], $media['type'], $media['media_url'], $media['url']); $message_html = str_replace($media['url'], $media_formated, $message_html); } } $provider = 'TWITTER'; $ref_id = $result->id_str; $author = $result->user->screen_name; $message = \ForceUTF8\Encoding::toUTF8($result->text); $message_html = $message_html; $avatar = $result->user->profile_image_url; $links = !empty($links) ? json_encode($links) : ""; $medias = json_encode($medias); $ctime = date('Y-m-d H:i:s', strtotime($result->created_at)); $ctime_db = date('Y-m-d H:i:s', strtotime($result->created_at) - date("Z")); $visible = $config['modo_type']; try { $db->beginTransaction(); $q = $db->prepare('INSERT INTO messages (provider,ref_id,author,message,message_html,avatar,links,medias,ctime,visible) VALUES(?,?,?,?,?,?,?,?,?,?)'); $q->execute(array($provider, $ref_id, $author, $message, $message_html, $avatar, $links, $medias, $ctime_db, $visible)); $lastId = $db->lastInsertId(); $db->commit(); } catch (PDOException $e) { echo "DB Error : " . $e->errorInfo();
/**
 * Converts the post content to UTF-8 and runs it through wpautop().
 *
 * @param string $value
 * @return string
 */
public function post_content($value)
{
    return wpautop(\ForceUTF8\Encoding::toUTF8($value));
}
/**
 * Apply the action attached to a detected mime type.
 *
 * "exclude" rejects the url with an exception; "link" builds a small HTML
 * stand-in for the resource (download link, image tag, extracted PDF text
 * or preformatted plain text). Any other action yields nothing.
 *
 * @param array  $mimeInfo      From getMimeActionInfo() function
 * @param string $effective_url Current content url
 * @param string $body          Content from the response
 *
 * @return array|null
 */
private function handleMimeAction($mimeInfo, $effective_url, $body = '')
{
    if (!isset($mimeInfo['action'])) {
        return;
    }

    $infos = array(
        'status' => 200,
        'title' => $mimeInfo['name'],
        'language' => '',
        'html' => '',
        'url' => $effective_url,
        'content_type' => $mimeInfo['mime'],
        'open_graph' => array(),
    );

    $action = $mimeInfo['action'];

    if ($action == 'exclude') {
        throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effective_url));
    }

    if ($action != 'link') {
        return;
    }

    // default representation: a plain download link
    $infos['html'] = '<a href="' . $effective_url . '">Download ' . $mimeInfo['name'] . '</a>';

    if ($mimeInfo['type'] == 'image') {
        $infos['html'] = '<a href="' . $effective_url . '"><img src="' . $effective_url . '" alt="' . $mimeInfo['name'] . '" /></a>';
    }

    if ($mimeInfo['mime'] == 'application/pdf') {
        $parser = new PdfParser();
        $pdf = $parser->parseFile($effective_url);

        $text = Encoding::toUTF8(nl2br($pdf->getText()));

        // strip away unwanted chars (that usualy came from PDF extracted content)
        // @see http://www.phpwact.org/php/i18n/charsets#common_problem_areas_with_utf-8
        $infos['html'] = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $text);

        // prefer the title stored in the PDF details when it is not blank;
        // it can be a string or an array with one key
        $details = $pdf->getDetails();
        $pdfTitle = isset($details['Title']) ? $details['Title'] : null;

        if (is_array($pdfTitle) && isset($pdfTitle[0]) && '' !== trim($pdfTitle[0])) {
            $infos['title'] = $pdfTitle[0];
        } elseif (is_string($pdfTitle) && '' !== trim($pdfTitle)) {
            $infos['title'] = $pdfTitle;
        }
    }

    if ($mimeInfo['mime'] == 'text/plain') {
        $infos['html'] = '<pre>' . $body . '</pre>';
    }

    return $infos;
}
require_once 'smswall.inc.php';
include 'func.php';
include 'libs/ForceUTF8/Encoding.php';

date_default_timezone_set('Europe/Paris');

// By default, show the 30 most recent messages stored in the database.
// Both values come from the request and are concatenated into the SQL text,
// so they are forced to non-negative integers to rule out SQL injection.
$offset = isset($_POST['offset']) ? max(0, (int) $_POST['offset']) : 0;
$limit = isset($_POST['limit']) ? max(0, (int) $_POST['limit']) : 30;

$result = $db->query("SELECT * FROM messages ORDER BY ctime DESC LIMIT " . $offset . "," . $limit);
$rowarray = $result->fetchall(PDO::FETCH_ASSOC);

$response = array();
foreach ($rowarray as $row) {
    $msg = array();
    foreach ($row as $key => $value) {
        if ($key == "ctime") {
            // shift the stored time by the current timezone offset
            $timestamp = strtotime($value) + date("Z");
            $value = date("Y-m-d H:i:s", $timestamp);
        }

        // default avatar depends on the provider (SMS, WWW)
        if ($key == "avatar" && !$value && $row['provider'] != 'TWITTER') {
            $value = 'default_' . strtolower($row['provider']) . '.png';
        }

        if ($key == "message" || $key == "message_html") {
            $value = \ForceUTF8\Encoding::toUTF8($value);
        }

        $msg[$key] = $value;
    }
    $response[] = $msg;
}

header('Content-type: application/json');
echo json_encode($response);
/**
 * MSSQL won't handle UTF8 properly, so every string value of the row is
 * converted to UTF-8; other values are left untouched.
 *
 * @param array $row
 * @return array
 */
public function convertEncoding($row)
{
    foreach ($row as $column => $cell) {
        if (!is_string($cell)) {
            continue;
        }

        $row[$column] = Encoding::toUTF8($cell);
    }

    return $row;
}
/**
 * Fills this search entry (id, path, dates, users, type/subtype,
 * properties, published flag and searchable text) from the given element.
 *
 * The searchable text in $this->data depends on the element kind: documents
 * contribute key/title/href or their rendered tags, assets contribute file
 * name, metadata and (where available) extracted text, objects contribute
 * the search-index data of every field definition.
 *
 * @param $element
 * @return $this
 */
public function setDataFromElement($element)
{
    $this->data = null;

    // common attributes copied straight from the element
    $this->id = new Data\Id($element);
    $this->fullPath = $element->getRealFullPath();
    $this->creationDate = $element->getCreationDate();
    $this->modificationDate = $element->getModificationDate();
    $this->userModification = $element->getUserModification();
    $this->userOwner = $element->getUserOwner();

    $this->type = $element->getType();
    if ($element instanceof Object\Concrete) {
        // concrete objects use their class name as subtype
        $this->subtype = $element->getClassName();
    } else {
        $this->subtype = $this->type;
    }

    // properties are flattened into "name:value " pairs
    $this->properties = "";
    $properties = $element->getProperties();
    if (is_array($properties)) {
        foreach ($properties as $nextProperty) {
            $pData = (string) $nextProperty->getData();
            // NOTE(review): this compares the property *name* with "bool";
            // possibly the property type was intended — confirm.
            if ($nextProperty->getName() == "bool") {
                $pData = $pData ? "true" : "false";
            }

            $this->properties .= $nextProperty->getName() . ":" . $pData . " ";
        }
    }

    $this->data = "";

    if ($element instanceof Document) {
        if ($element instanceof Document\Folder) {
            // folders are indexed by key and always count as published
            $this->data = $element->getKey();
            $this->published = true;
        } elseif ($element instanceof Document\Link) {
            $this->published = $element->isPublished();
            $this->data = $element->getTitle() . " " . $element->getHref();
        } elseif ($element instanceof Document\PageSnippet) {
            $this->published = $element->isPublished();
            $elements = $element->getElements();
            if (is_array($elements) && !empty($elements)) {
                foreach ($elements as $tag) {
                    if ($tag instanceof Document\Tag\TagInterface) {
                        // Output buffering captures anything frontend() prints
                        // instead of returning; the captured output is
                        // appended as-is (it is not passed through strip_tags).
                        ob_start();
                        $this->data .= strip_tags($tag->frontend()) . " ";
                        $this->data .= ob_get_clean();
                    }
                }
            }
            if ($element instanceof Document\Page) {
                $this->published = $element->isPublished();
                $this->data .= " " . $element->getTitle() . " " . $element->getDescription() . " " . $element->getPrettyUrl();
            }
        }
    } elseif ($element instanceof Asset) {
        $this->data = $element->getFilename();

        // scalar metadata values are appended as " name:data"
        $elementMetadata = $element->getMetadata();
        if (is_array($elementMetadata)) {
            foreach ($elementMetadata as $md) {
                if (is_scalar($md['data'])) {
                    $this->data .= " " . $md["name"] . ":" . $md["data"];
                }
            }
        }

        if ($element instanceof Asset\Document && \Pimcore\Document::isAvailable()) {
            if (\Pimcore\Document::isFileTypeSupported($element->getFilename())) {
                try {
                    // extracted text: forced to UTF-8, line breaks/tabs/form
                    // feeds turned into spaces, runs of spaces collapsed
                    $contentText = $element->getText();
                    $contentText = Encoding::toUTF8($contentText);
                    $contentText = str_replace(["\r\n", "\r", "\n", "\t", "\f"], " ", $contentText);
                    $contentText = preg_replace("/[ ]+/", " ", $contentText);
                    $this->data .= " " . $contentText;
                } catch (\Exception $e) {
                    // best effort: a failed extraction is logged, not fatal
                    Logger::error($e);
                }
            }
        } elseif ($element instanceof Asset\Text) {
            try {
                $contentText = $element->getData();
                $contentText = Encoding::toUTF8($contentText);
                $this->data .= " " . $contentText;
            } catch (\Exception $e) {
                Logger::error($e);
            }
        } elseif ($element instanceof Asset\Image) {
            try {
                // EXIF and IPTC entries are appended as " key : value"
                // NOTE(review): values are concatenated as strings; assumes
                // none of them is an array — confirm.
                $metaData = array_merge($element->getEXIFData(), $element->getIPTCData());
                foreach ($metaData as $key => $value) {
                    $this->data .= " " . $key . " : " . $value;
                }
            } catch (\Exception $e) {
                Logger::error($e);
            }
        }

        // assets always count as published
        $this->published = true;
    } elseif ($element instanceof Object\AbstractObject) {
        if ($element instanceof Object\Concrete) {
            // switch inherited values on while collecting the index data,
            // then restore the previous setting
            $getInheritedValues = Object\AbstractObject::doGetInheritedValues();
            Object\AbstractObject::setGetInheritedValues(true);

            $this->published = $element->isPublished();
            foreach ($element->getClass()->getFieldDefinitions() as $key => $value) {
                $this->data .= $value->getDataForSearchIndex($element) . " ";
            }

            Object\AbstractObject::setGetInheritedValues($getInheritedValues);
        } elseif ($element instanceof Object\Folder) {
            $this->data = $element->getKey();
            $this->published = true;
        }
    } else {
        Logger::crit("Search\\Backend\\Data received an unknown element!");
    }

    if ($element instanceof Element\ElementInterface) {
        // prefix the cleaned-up text with the element id and full path
        $this->data = "ID: " . $element->getId() . " \nPath: " . $this->getFullPath() . " \n" . $this->cleanupData($this->data);
    }

    return $this;
}
// Detect the encoding of $text, in strict mode, against every encoding
// mbstring knows about. The outcome sets three values:
//   $enc        - detected encoding name, or "UNKWN" when detection failed
//   $textToSend - the text to send on, forced to UTF-8 when it was not UTF-8 already
//   $saveToOpen - false when the encoding is unknown or the UTF-8 version
//                 does not convert back to the original bytes
$enc = mb_detect_encoding($text, mb_list_encodings(), true);
//overwrite ASCII
if ($enc === false) {
    // detection failed: pass the text through untouched
    $enc = "UNKWN";
    $textToSend = $text;
    $saveToOpen = false;
} elseif ($enc !== "UTF-8") {
    //$text = mb_convert_encoding($text, 'UTF-8', $enc);
    /*
    if(function_exists('incov')) {
        $text = incov($enc,'UTF-8',$text);
    } else {
    */
    $textToSend = \ForceUTF8\Encoding::fixUTF8(\ForceUTF8\Encoding::toUTF8($text));
    // round-trip check: convert the UTF-8 result back to the detected
    // encoding and compare it with the original text
    $test = mb_convert_encoding($textToSend, $enc, 'UTF-8');
    if ($test != $text) {
        $saveToOpen = false;
    } else {
        $saveToOpen = true;
    }
    //}
} else {
    $textToSend = $text;
    $saveToOpen = true;
}

// text detected as ASCII that also validates as UTF-8 is reported as UTF-8
if ($enc === 'ASCII' && mb_check_encoding($text, 'UTF-8')) {
    $enc = 'UTF-8';
}
$json = new \stdClass();
<?php
require_once '../smswall.inc.php';
include '../func.php';
require '../libs/Pusher.php';
include '../libs/ForceUTF8/Encoding.php';

// Stores a message posted from the web form and prepares the payload for
// the Pusher 'new_twut' trigger.
$provider = 'WWW';

// Missing form fields fall back to an empty string instead of raising
// "undefined index" notices.
$author = isset($_POST['pseudo']) ? $_POST['pseudo'] : '';
$message = \ForceUTF8\Encoding::toUTF8(isset($_POST['message']) ? $_POST['message'] : '');

$ctime = date('Y-m-d H:i:s', time());
// time stored in the database is shifted back by the timezone offset
$ctime_db = date('Y-m-d H:i:s', time() - date("Z"));
$visible = $config['modo_type'];

// Stays null when the insert fails, so the payload below never reads an
// undefined variable.
$lastId = null;

try {
    $db->beginTransaction();
    $q = $db->prepare('INSERT INTO messages (provider,author,message,ctime,visible) VALUES(?,?,?,?,?)');
    $q->execute(array($provider, $author, $message, $ctime_db, $visible));
    $lastId = $db->lastInsertId();
    $db->commit();
} catch (PDOException $e) {
    // Do not leave a half-open transaction behind.
    if ($db->inTransaction()) {
        $db->rollBack();
    }
    // errorInfo is a property of PDOException, not a method; calling it
    // raised a fatal error. getMessage() gives the readable error text.
    echo "Erreur PDO : " . $e->getMessage();
}

// Build the dict for the Pusher trigger 'new_twut'
$arrayPush['id'] = $lastId;
$arrayPush['provider'] = $provider;
// $arrayPush['message'] = utf8_encode($message);
$arrayPush['message'] = $message;
$arrayPush['message_html'] = make_clickable($arrayPush['message']);
// true when make_clickable() changed the message, i.e. it contains a link
$arrayPush['internallink'] = $arrayPush['message'] !== $arrayPush['message_html'];
$arrayPush['visible'] = $visible;
$arrayPush['author'] = $author;
$arrayPush['avatar'] = 'default_www.png';
/**
 * Converts the price to UTF-8 and strips currency symbols ($, £, €) and
 * thousands separators (commas).
 *
 * @param string $funkyPrice Raw price text
 * @return int|string Integer value when the stripped text is numeric,
 *                    otherwise the stripped text itself
 */
protected function _encodePrice($funkyPrice)
{
    $stripped = str_replace(['$', '£', '€', ','], '', Encoding::toUTF8($funkyPrice));

    return is_numeric($stripped) ? (int) $stripped : $stripped;
}
/**
 * Returns a copy of $params in which every leaf value, at any nesting
 * depth, has been passed through Encoding::toUTF8().
 *
 * @param array $params
 * @return array
 */
private function _encodeUTF8($params)
{
    $toUtf8 = function (&$leaf) {
        $leaf = Encoding::toUTF8($leaf);
    };

    array_walk_recursive($params, $toUtf8);

    return $params;
}
<?php return; //Deprecated //快速的注册帐号,需要至少一个名字和邮箱 //XSS $name = \ForceUTF8\Encoding::toUTF8($_REQUEST['name']); $email = \ForceUTF8\Encoding::toUTF8($_REQUEST['email']); $password = \ForceUTF8\Encoding::toUTF8($_REQUEST['password']); $wpError = new WP_Error(); if (filter_var($email, FILTER_VALIDATE_EMAIL)) { $wpError->add("invalid_email_format", \My\CustomError::getValue("invalid_email_format")); } if (mb_strlen($name, 'UTF-8') < 3 || mb_strlen($name, 'UTF-8') > 20) { $wpError->add("invalid_username_length", \My\CustomError::getValue("invalid_username_length")); } if (mb_strlen($password, 'UTF-8') < 3 || mb_strlen($password, 'UTF-8') > 20) { $wpError->add("invalid_password_length", \My\CustomError::getValue("invalid_password_length")); } // const existing_user_login = "******"; // const existing_user_email = "邮箱已经被占用"; // const invalid_email_format = "无效邮箱地址"; // const invalid_username_length = "名字长度必须在3-20内"; // const invalid_password_length = "密码长度必须在3-20内"; // const invalid_email_length = "邮箱长度必须在3-20内"; //是否有问题 $errorArray = $wpError->get_error_codes(); if (count($errorArray) == 0) { //里面有wp_slash $wpFooResult = wp_create_user($username, $password, $email); //把WP内置的错误放进$wpError
$author = "SMS";

// POST : SMS Enabler
// GET : Android Tasker
// Uncomment the $author lines below to display the sender's phone number.
// $content starts empty so a request carrying no text at all does not read
// an undefined variable further down.
$content = '';
if (!empty($_POST['text'])) {
    $content = $_POST['text'];
    //$author = $_POST['sender'];
} elseif (!empty($_GET['text'])) {
    $content = urldecode($_GET['text']);
    //$author = $_GET['sender'];
}

$provider = 'SMS';
// $message = strip_tags(utf8_decode($content));
$message = \ForceUTF8\Encoding::toUTF8(strip_tags($content));
$ctime = date('Y-m-d H:i:s', time());
// time stored in the database is shifted back by the timezone offset
$ctime_db = date('Y-m-d H:i:s', time() - date("Z"));
$visible = $config['modo_type'];

// Stays null when the insert fails, so the payload below never reads an
// undefined variable.
$lastId = null;

try {
    $db->beginTransaction();
    $q = $db->prepare('INSERT INTO messages (provider,author,message,ctime,visible) VALUES(?,?,?,?,?)');
    $q->execute(array($provider, $author, $message, $ctime_db, $visible));
    $lastId = $db->lastInsertId();
    $db->commit();
} catch (PDOException $e) {
    // Do not leave a half-open transaction behind.
    if ($db->inTransaction()) {
        $db->rollBack();
    }
    // errorInfo is a property of PDOException, not a method; calling it
    // raised a fatal error. getMessage() gives the readable error text.
    echo "Erreur : " . $e->getMessage();
}

// Build the dict for the Pusher trigger 'new_twut'
$arrayPush['id'] = $lastId;
$arrayPush['message'] = $message;
/**
 * Inserts a custom line into the shittalkDB table.
 *
 * The text is cut to 128 bytes, converted and repaired to UTF-8, stripped
 * of double quotes and escaped before it is placed in the INSERT statement
 * together with the current date.
 *
 * @param string $text
 * @return mixed Whatever mySqlQuery() returns for the INSERT
 */
function createShittalkRow($text)
{
    $maxLength = 128; // source game default length

    if (strlen($text) > $maxLength) {
        $text = substr($text, 0, $maxLength);
    }

    $text = Encoding::fixUTF8(Encoding::toUTF8($text));

    $textSql = mysql_escape_mimic(strip_double_quotes($text));
    $createdSql = mysql_escape_mimic(date("Y-m-d H:i:s"));

    $sql = "INSERT INTO `shittalkDB`\n (`text`, `date_created`, `custom`)\n VALUES ('{$textSql}', '{$createdSql}', 1);";

    return mySqlQuery($sql);
}
/**
 * Normalises a single text value through a fixed sequence of replacements:
 * typographic quotes to ASCII, placeholder values ("na", "tbd", dummy
 * dates, ...) to an empty string, line-break handling, removal of control
 * and non-ASCII bytes, whitespace and <br> clean-up, removal of image tags,
 * UTF-8 conversion and a few spelling fixes.
 *
 * The order of the steps matters: each one works on the previous result.
 * Set the local $test flag to true to echo the value after every step.
 *
 * @param string $elem
 * @return string
 */
function sanitizeElement($elem)
{
    // debug switch: when true, the intermediate value is echoed as JSON
    $test = false;

    // backticks and typographic quotes become plain ASCII quotes
    $elem = str_replace("`", "'", $elem);
    $elem = str_replace("‘", "'", $elem);
    if ($test) {
        echo '-3) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace("’", "'", $elem);
    if ($test) {
        echo '-2) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace("“", "\"", $elem);
    if ($test) {
        echo '-1) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace("”", "\"", $elem);
    if ($test) {
        echo '0) ' . json_encode($elem) . "\n";
    }

    // values that consist only of a known placeholder (compared
    // case-insensitively) are blanked
    $elem = strtolower($elem) == 'na' ? '' : $elem;
    if ($test) {
        echo '1) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'n/a' ? '' : $elem;
    if ($test) {
        echo '2) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'test' ? '' : $elem;
    if ($test) {
        echo '3) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'void' ? '' : $elem;
    if ($test) {
        echo '4) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'test - void' ? '' : $elem;
    if ($test) {
        echo '5) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'tba' ? '' : $elem;
    if ($test) {
        echo '6) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'tbd' ? '' : $elem;
    if ($test) {
        echo '7) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == 'unknown' ? '' : $elem;
    if ($test) {
        echo '8) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == '1900-01-01' ? '' : $elem;
    if ($test) {
        echo '9) ' . json_encode($elem) . "\n";
    }
    $elem = strtolower($elem) == '1970-01-01' ? '' : $elem;
    if ($test) {
        echo '10) ' . json_encode($elem) . "\n";
    }

    // drop carriage returns
    $elem = preg_replace("/[\r]+/", "", $elem);
    if ($test) {
        echo '11) ' . json_encode($elem) . "\n";
    }
    // HTML content loses its newlines; in anything else each newline becomes <br>
    $elem = isHTML($elem) ? preg_replace("/[\n]+/", "", $elem) : preg_replace("/[\n]/", "<br>", $elem);
    if ($test) {
        echo '12) ' . json_encode($elem) . "\n";
    }
    // removes control bytes (0x00-0x1F) and every byte in 0x80-0xFF, i.e. all
    // non-ASCII bytes, including those of valid multi-byte UTF-8 characters
    $elem = preg_replace('/[\\x00-\\x1F\\x80-\\xFF]/', '', $elem); // removed non-UTF8 characters
    if ($test) {
        echo '13) ' . json_encode($elem) . "\n";
    }
    // NOTE(review): the search string presumably was an HTML space entity or
    // a non-breaking space originally — confirm against the original file
    $elem = str_replace(' ', ' ', $elem); // removed html space
    if ($test) {
        echo '14) ' . json_encode($elem) . "\n";
    }
    $elem = preg_replace('!\\s+!', ' ', $elem); // removed redundant spaces
    if ($test) {
        echo '15) ' . json_encode($elem) . "\n";
    }
    // three or more consecutive <br> collapse into exactly two
    $elem = preg_replace('/(<br[\\s]?[\\/]?>[\\s]*){3,}/', '<br /><br />', $elem); // replace redundant <br>, space ...
    if ($test) {
        echo '16) ' . json_encode($elem) . "\n";
    }
    $elem = preg_replace('/<br[\\s]?[\\/]?>[\\s]*$/', '', $elem); // removed br from end post -->
    if ($test) {
        echo '17) ' . json_encode($elem) . "\n";
    }
    $elem = preg_replace('/<img[^>]+\\>/i', '', $elem); // remove all image tags
    if ($test) {
        echo '18) ' . json_encode($elem) . "\n";
    }
    // NOTE(review): the search strings of the next two replacements look
    // garbled or empty here — confirm what they were meant to match
    $elem = str_replace('�', ' ', $elem); // removed html placeholder
    if ($test) {
        echo '19) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace('', '', $elem); // removed html placeholder
    if ($test) {
        echo '20) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace('<p></p>', '', $elem); // removed html placeholder
    if ($test) {
        echo '21) ' . json_encode($elem) . "\n";
    }
    $elem = Encoding::toUTF8($elem); // fixes broken UTF8 characters
    if ($test) {
        echo '23) ' . json_encode($elem) . "\n";
    }

    // common missing-apostrophe spellings
    $elem = str_replace(" dont", " don't", $elem); // grammar 1
    if ($test) {
        echo '24) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace(" doesnt ", " doesn't ", $elem); // grammar 2
    if ($test) {
        echo '25) ' . json_encode($elem) . "\n";
    }
    $elem = str_replace(" im ", " i'm ", $elem); // grammar 3
    if ($test) {
        echo '26) ' . json_encode($elem) . "\n";
    }
    $elem = trim(trim($elem));
    if ($test) {
        echo '29) ' . json_encode($elem) . "\n";
    }
    return $elem;
}
/**
 * Do fetch content from an url.
 *
 * Normalises and validates the url, fetches it, hands resources with a
 * configured mime action over to handleMimeAction(), otherwise extracts the
 * main content (following single-page and multi-page variants when
 * configured) and post-processes the resulting HTML.
 *
 * @param string $url
 *
 * @return array With keys status, html, title, language, url, content_type & open_graph
 *
 * @throws \Exception When the url is invalid or not allowed to be parsed
 */
private function doFetchContent($url)
{
    // Check for feed URL
    $url = trim($url);
    if (strtolower(substr($url, 0, 7)) == 'feed://') {
        $url = 'http://' . substr($url, 7);
    }

    if (!preg_match('!^https?://.+!i', $url)) {
        $url = 'http://' . $url;
    }

    if (false === filter_var($url, FILTER_VALIDATE_URL)) {
        throw new \Exception(sprintf('Url "%s" is not valid.', $url));
    }

    $url = filter_var($url, FILTER_SANITIZE_URL);

    if (false === $this->isUrlAllowed($url)) {
        throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $url));
    }

    $response = $this->httpClient->fetch($url);

    // the url can change after redirects, so it is checked again
    $effective_url = $response['effective_url'];
    if (!$this->isUrlAllowed($effective_url)) {
        throw new \Exception(sprintf('Url "%s" is not allowed to be parsed.', $effective_url));
    }

    // check if action defined for returned Content-Type, like image, pdf, audio or video
    $mimeInfo = $this->getMimeActionInfo($response['headers']);
    $infos = $this->handleMimeAction($mimeInfo, $effective_url, $response['body']);
    if (is_array($infos)) {
        return $infos;
    }

    $html = Encoding::toUTF8($response['body']);

    $ogData = $this->extractOpenGraph($html);

    // @TODO: log raw html + headers

    // check site config for single page URL - fetch it if found
    $is_single_page = false;
    if ($this->config['singlepage'] && ($single_page_response = $this->getSinglePage($html, $effective_url))) {
        $is_single_page = true;
        $effective_url = $single_page_response['effective_url'];

        // check if action defined for returned Content-Type
        $mimeInfo = $this->getMimeActionInfo($single_page_response['headers']);
        $infos = $this->handleMimeAction($mimeInfo, $effective_url, $single_page_response['body']);
        if (is_array($infos)) {
            return $infos;
        }

        $html = Encoding::toUTF8($single_page_response['body']);
        $this->logger->log('debug', "Retrieved single-page view from {$effective_url}");

        unset($single_page_response);
    }

    $this->logger->log('debug', '--------');
    $this->logger->log('debug', 'Attempting to extract content');

    $extract_result = $this->extractor->process($html, $effective_url);
    $readability = $this->extractor->readability;

    // if user has asked to see parsed HTML, show it and exit.
    // @TODO: log parsed HTML
    // $readability->dom->saveXML($readability->dom->documentElement)

    $content_block = $this->extractor->getContent();
    $extracted_title = $this->extractor->getTitle();
    $extracted_language = $this->extractor->getLanguage();

    // Deal with multi-page articles
    //die('Next: '.$this->extractor->getNextPageUrl());
    $is_multi_page = !$is_single_page && $extract_result && null !== $this->extractor->getNextPageUrl();
    if ($this->config['multipage'] && $is_multi_page) {
        $this->logger->log('debug', '--------');
        $this->logger->log('debug', 'Attempting to process multi-page article');

        $multi_page_urls = array();
        $multi_page_content = array();

        while ($next_page_url = $this->extractor->getNextPageUrl()) {
            $this->logger->log('debug', '--------');
            $this->logger->log('debug', 'Processing next page: ' . $next_page_url);

            // If we've got URL, resolve against $url
            $next_page_url = $this->makeAbsoluteStr($effective_url, $next_page_url);
            if (!$next_page_url) {
                $this->logger->log('debug', 'Failed to resolve against ' . $effective_url);
                $multi_page_content = array();
                break;
            }

            // check it's not what we have already!
            if (in_array($next_page_url, $multi_page_urls)) {
                $this->logger->log('debug', 'URL already processed');
                $multi_page_content = array();
                break;
            }

            // it's not, so let's attempt to fetch it
            $multi_page_urls[] = $next_page_url;

            // Fetch the *next page*: fetching $url here would download the
            // first page again and append its content instead.
            $response = $this->httpClient->fetch($next_page_url);

            // make sure mime type is not something with a different action associated
            $mimeInfo = $this->getMimeActionInfo($response['headers']);

            if (isset($mimeInfo['action'])) {
                $this->logger->log('debug', 'MIME type requires different action');
                $multi_page_content = array();
                break;
            }

            $extracSuccess = $this->extractor->process(
                Encoding::toUTF8($response['body']),
                $next_page_url
            );

            if (!$extracSuccess) {
                $this->logger->log('debug', 'Failed to extract content');
                $multi_page_content = array();
                break;
            }

            $multi_page_content[] = $this->extractor->getContent();
        }

        // did we successfully deal with this multi-page article?
        if (empty($multi_page_content)) {
            $this->logger->log('debug', 'Failed to extract all parts of multi-page article, so not going to include them');
            $_page = $readability->dom->createElement('p');
            $_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
            $multi_page_content[] = $_page;
        }

        foreach ($multi_page_content as $_page) {
            $_page = $content_block->ownerDocument->importNode($_page, true);
            $content_block->appendChild($_page);
        }

        unset($multi_page_urls, $multi_page_content, $next_page_url, $_page);
    }

    // the mime type is optional in $mimeInfo, so both return paths guard it
    $content_type = isset($mimeInfo['mime']) ? $mimeInfo['mime'] : '';

    // if we failed to extract content...
    if (!$extract_result || null === $content_block) {
        return array(
            'status' => $response['status'],
            'html' => $this->config['error_message'],
            'title' => $extracted_title,
            'language' => $extracted_language,
            'url' => $effective_url,
            'content_type' => $content_type,
            'open_graph' => $ogData,
        );
    }

    $readability->clean($content_block, 'select');

    if ($this->config['rewrite_relative_urls']) {
        $this->makeAbsolute($effective_url, $content_block);
    }

    // footnotes
    if ($this->config['content_links'] == 'footnotes' && strpos($effective_url, 'wikipedia.org') === false) {
        $readability->addFootnotes($content_block);
    }

    // normalise
    $content_block->normalize();

    // remove empty text nodes
    foreach ($content_block->childNodes as $_n) {
        if ($_n->nodeType === XML_TEXT_NODE && trim($_n->textContent) == '') {
            $content_block->removeChild($_n);
        }
    }

    // remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
    while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
        // only follow these tag names
        if (!in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
            break;
        }

        $content_block = $content_block->firstChild;
    }

    // convert content block to HTML string
    // Need to preserve things like body: //img[@id='feature']
    if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer', 'li', 'td'))) {
        $html = $content_block->innerHTML;
    } else {
        $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
    }

    unset($content_block);

    // post-processing cleanup
    $html = preg_replace('!<p>[\\s\\h\\v]*</p>!u', '', $html);
    if ($this->config['content_links'] == 'remove') {
        $html = preg_replace('!</?a[^>]*>!', '', $html);
    }

    $this->logger->log('debug', 'Done!');

    return array(
        'status' => $response['status'],
        'html' => $html,
        'title' => $extracted_title,
        'language' => $extracted_language,
        'url' => $effective_url,
        'content_type' => $content_type,
        'open_graph' => $ogData,
    );
}
/**
 * Converts the given string to UTF-8 using ForceUTF8.
 *
 * @param string $s
 * @return string
 */
protected function CustomEncode($s)
{
    $encoded = \ForceUTF8\Encoding::toUTF8($s);

    return $encoded;
}
/**
 * Handle action related to mime type detection.
 * These action can be exclude or link to handle custom content (like image, video, pdf, etc ..).
 *
 * "exclude" rejects the url with an exception; "link" builds a small HTML
 * stand-in for the resource (download link, image tag, extracted PDF text
 * or preformatted plain text). Any other action yields nothing.
 *
 * @param array  $mimeInfo      From getMimeActionInfo() function
 * @param string $effective_url Current content url
 * @param string $body          Content from the response
 *
 * @return array|null
 *
 * @throws \Exception When the mime action is "exclude"
 */
private function handleMimeAction($mimeInfo, $effective_url, $body = '')
{
    if (!isset($mimeInfo['action'])) {
        return;
    }

    $infos = array(
        'status' => 200,
        'title' => $mimeInfo['name'],
        'language' => '',
        'html' => '',
        'url' => $effective_url,
        'content_type' => $mimeInfo['mime'],
        'open_graph' => array(),
    );

    switch ($mimeInfo['action']) {
        case 'exclude':
            throw new \Exception(sprintf('This is url "%s" is blocked by mime action.', $effective_url));

        case 'link':
            $infos['html'] = '<a href="' . $effective_url . '">Download ' . $mimeInfo['name'] . '</a>';

            if ($mimeInfo['type'] == 'image') {
                $infos['html'] = '<a href="' . $effective_url . '"><img src="' . $effective_url . '" alt="' . $mimeInfo['name'] . '" /></a>';
            }

            if ($mimeInfo['mime'] == 'application/pdf') {
                $parser = new PdfParser();
                $pdf = $parser->parseFile($effective_url);

                $html = Encoding::toUTF8(nl2br($pdf->getText()));

                // Text extracted from PDFs often carries control characters
                // and other code points that are not allowed in XML; replace
                // every run of them with a single space.
                $stripped = preg_replace('/[^\\x{0009}\\x{000a}\\x{000d}\\x{0020}-\\x{D7FF}\\x{E000}-\\x{FFFD}]+/u', ' ', $html);

                // preg_replace() returns null on failure (e.g. invalid
                // UTF-8 with the u modifier); keep the unstripped text then.
                $infos['html'] = null === $stripped ? $html : $stripped;

                // update title in case of details are present
                $details = $pdf->getDetails();

                // Title can be a string or an array with one key
                if (isset($details['Title'])) {
                    if (is_array($details['Title']) && isset($details['Title'][0]) && '' !== trim($details['Title'][0])) {
                        $infos['title'] = $details['Title'][0];
                    } elseif (is_string($details['Title']) && '' !== trim($details['Title'])) {
                        $infos['title'] = $details['Title'];
                    }
                }
            }

            if ($mimeInfo['mime'] == 'text/plain') {
                $infos['html'] = '<pre>' . $body . '</pre>';
            }

            return $infos;
    }

    return;
}
/**
 * Checks that Encoding::toUTF8() converts every element of an array.
 *
 * Builds an array mixing the contents of data/test1Latin.txt and
 * data/test1.txt and expects the conversion result to equal an array made
 * only of the contents of data/test1.txt.
 *
 * @return bool True when the converted array matches the expected one;
 *              false on mismatch or when a fixture file cannot be read
 */
function test_encoding_of_arrays()
{
    $dataDir = dirname(__FILE__) . "/data";
    $latinPath = $dataDir . "/test1Latin.txt";
    $utf8Path = $dataDir . "/test1.txt";

    // Without this guard, missing fixtures make file_get_contents() return
    // false for every entry and the comparison below would pass spuriously.
    if (!is_readable($latinPath) || !is_readable($utf8Path)) {
        return false;
    }

    // Read each fixture once instead of once per array slot.
    $latin = file_get_contents($latinPath);
    $utf8 = file_get_contents($utf8Path);
    if ($latin === false || $utf8 === false) {
        return false;
    }

    $arr1 = array($latin, $utf8, $latin);
    $arr2 = array($utf8, $utf8, $utf8);

    // Strict comparison: avoids PHP's numeric-string juggling under "==".
    return Encoding::toUTF8($arr1) === $arr2;
}
/**
 * Force UTF-8 by delegating to the toUTF8 function from neitanod/forceutf8.
 * This function expects the original charset to be ISO-8859-1, LATIN-1 or
 * WIN-1252.
 *
 * @param mixed | array | string $string Value handed to the converter
 * @return mixed Result of \ForceUTF8\Encoding::toUTF8()
 */
public static function forceUTF8($string)
{
    $converted = \ForceUTF8\Encoding::toUTF8($string);

    return $converted;
}
/**
 * Runs the input through Encoding::toUTF8() and returns the result.
 *
 * @param mixed $input string or array
 *
 * @return mixed
 */
public function filter($input)
{
    // No per-element loop here: the Encoding methods are array-aware, so
    // the input is passed to the conversion method as-is.
    $filtered = Encoding::toUTF8($input);

    return $filtered;
}
/**
 * Reads the IPTC records stored in the APP13 segment of the file at
 * getFileSystemPath() and returns the ones listed in the tag map below,
 * keyed by their readable field name.
 *
 * @return array Field name => value passed through \ForceUTF8\Encoding::toUTF8();
 *               empty when the file is missing, getimagesize() fails, or
 *               no parsable APP13 segment is present
 */
public function getIPTCData()
{
    $iptc = [];

    if (!is_file($this->getFileSystemPath())) {
        return $iptc;
    }

    // getimagesize() fills $segments (by reference) with the file's APPn markers.
    $imageSize = getimagesize($this->getFileSystemPath(), $segments);
    if (!$imageSize) {
        return $iptc;
    }

    if (!$segments || !isset($segments['APP13'])) {
        return $iptc;
    }

    $records = iptcparse($segments['APP13']);
    if (!is_array($records)) {
        return $iptc;
    }

    // IPTC record tag => key used in the returned array.
    $fieldNames = [
        '2#105' => 'headline',
        '2#120' => 'caption',
        '2#092' => 'location',
        '2#090' => 'city',
        '2#095' => 'state',
        '2#101' => 'country',
        '2#100' => 'countryCode',
        '2#080' => 'photographerName',
        '2#110' => 'credit',
        '2#085' => 'photographerTitle',
        '2#115' => 'source',
        '2#116' => 'copyright',
        '2#005' => 'objectName',
        '2#122' => 'captionWriters',
        '2#040' => 'instructions',
        '2#015' => 'category',
        '2#020' => 'supplementalCategories',
        '2#103' => 'transmissionReference',
        '2#010' => 'urgency',
        '2#025' => 'keywords',
        '2#055' => 'date',
        '2#060' => 'time',
    ];

    foreach ($records as $tag => $entry) {
        if (!isset($fieldNames[$tag])) {
            continue;
        }

        // A single-element list is collapsed to its only value; longer lists stay arrays.
        $isSingle = is_array($entry) && count($entry) === 1;
        $iptc[$fieldNames[$tag]] = \ForceUTF8\Encoding::toUTF8($isSingle ? $entry[0] : $entry);
    }

    return $iptc;
}