/**
 * Look up DOM node(s) matching a CSS selector, starting at the root node.
 *
 * The call is delegated to the root node's find(). When that call yields
 * null, a SimpleHtmlDomNodeBlank instance is handed back instead, so the
 * caller never receives null from this method.
 *
 * @param string   $selector CSS selector, forwarded unchanged to the root node
 * @param int|null $idx      forwarded unchanged to the root node's find()
 *
 * @return array|\voku\helper\SimpleHtmlDomNode[]|\voku\helper\SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
 */
public function find($selector, $idx = null)
{
    $matched = $this->root->find($selector, $idx);

    // Substitute a blank node object for a null lookup result.
    return $matched !== null ? $matched : new SimpleHtmlDomNodeBlank();
}
/**
 * Determine the document's character set and store it in $this->_charset.
 *
 * Sources are consulted in this order, stopping at the first that yields a value:
 *  1. the Content-Type header reported by get_last_retrieve_url_contents_content_type(),
 *     when that function exists;
 *  2. the "content" attribute of a <meta http-equiv=Content-Type> element
 *     (falls back to ISO-8859-1 when the attribute carries no charset);
 *  3. mb_detect_encoding() over the document text (UTF-8 or CP1252);
 *  4. a hard-coded 'UTF-8' when detection fails.
 *
 * ISO-8859-1 / Latin1 / Latin-1 are finally mapped to CP1252, its superset.
 *
 * @return string the resolved charset name (also assigned to $this->_charset)
 */
protected function parseCharset()
{
    // Parameter names are case-insensitive and the value may be quoted and/or
    // followed by further parameters ("; boundary=..."), so capture only the
    // charset token itself instead of everything up to the end of the string.
    $charsetPattern = '/charset\s*=\s*["\']?([^\s"\';,]+)/i';

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();

        // Guard against a non-string (e.g. null) header before handing it to PCRE.
        if (is_string($contentTypeHeader)
            && preg_match($charsetPattern, $contentTypeHeader, $matches) === 1
        ) {
            $charset = $matches[1];
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0);

        if (!empty($el)) {
            $fullValue = $el->getAttribute("content");

            if (!empty($fullValue)) {
                if (preg_match($charsetPattern, $fullValue, $matches) === 1) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set,
                    // research says that it's typically ISO-8859-1
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then let's try to detect one based on the text we got...
    if (empty($charset)) {
        // Have PHP try to detect the encoding from the text given to us.
        // NOTE(review): the "ascii" suffix is kept from the original implementation;
        // presumably it guarantees a non-empty input string - confirm.
        $charset = mb_detect_encoding($this->root->text() . "ascii", array("UTF-8", "CP1252"));

        // ...and if this doesn't work, just assume UTF-8 so that we can move on,
        // because this will usually give us most of what we need.
        if ($charset === false) {
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true)) {
        $charset = 'CP1252';
    }

    return $this->_charset = $charset;
}