/**
 * Build a map of profile links found in an HTML document.
 *
 * Every <a> element whose href contains "/people/" followed by at least one
 * character contributes one entry: the key is everything after "/people/"
 * up to the end of the href, the value is the anchor's text content. When
 * the same key occurs more than once, the last anchor wins.
 *
 * @param string $content HTML markup to scan.
 * @return array Map of captured href suffix => anchor text.
 */
function get_username_list($content)
{
    $document = HTML5::loadHTML($content);
    $anchors = $document->getElementsByTagName('a');
    $names = array();

    foreach ($anchors as $anchor) {
        $captured = array();
        $hit = preg_match('%/people/(.+)$%', $anchor->getAttribute('href'), $captured);
        if (!$hit) {
            // Not a profile link (or the regex failed): nothing to record.
            continue;
        }
        $names[$captured[1]] = $anchor->textContent;
    }

    return $names;
}
// Fragment of a per-user crawl step. The `continue` statements below imply an
// enclosing loop, and $code, $content, $username, $url, $base_url and $n are
// all assigned outside this excerpt.

// timer() and get_average() are defined elsewhere; their results are printed
// as millisecond figures next to the HTTP status code.
$t = timer();
$avg = intval(get_average($t, 'user page'));
echo "[{$code}]\t{$t} ms\tAvg: {$avg} ms\n";

// 404: log it, flag the user as fetched-but-failed, and move on.
if ($code == 404) {
    slog("user {$username} fetch fail, code {$code}");
    User::updateByUserName($username, array('has_fetch' => true, 'fetch_fail' => true));
    // Message reads "no such user".
    echo "没有这个用户 {$username}\n";
    continue;
}

// Any other non-200 status is recorded the same way.
if ($code != 200) {
    slog("user {$username} fetch fail, code {$code}");
    User::updateByUserName($username, array('has_fetch' => true, 'fetch_fail' => true));
    // Message reads "strange return code".
    echo "奇奇怪怪的返回码 {$code}\n";
    continue;
}

// Narrow the parsed page to the element with id "zh-pm-page-wrap".
// NOTE(review): if that id is absent the lookup presumably yields null and the
// foreach below would fail — confirm against HTML5::loadHTML's return type.
$dom = HTML5::loadHTML($content);
$dom = $dom->getElementById('zh-pm-page-wrap');

// Pick the avatar URL from the <img> whose class attribute matches exactly.
// NOTE(review): $src is only assigned when such an <img> exists; otherwise the
// update below reads an unset (or stale, from a previous iteration) variable.
foreach ($dom->getElementsByTagName('img') as $key => $node) {
    if (($attr = $node->getAttribute('class')) == 'zm-profile-header-img zg-avatar-big zm-avatar-editor-preview') {
        $src = $node->getAttribute('src');
    }
}
User::updateByUserName($username, array('avatar' => $src));

// Store the answer links found on the first page.
$link_list = get_answer_link_list($content);
$rs = Answer::saveAnswer($base_url, $username, $link_list);

// Walk the remaining pages (2..$num) when the listing is paginated.
$num = get_page_num($content);
if ($num > 1) {
    foreach (range(2, $num) as $i) {
        echo "\nNo. {$n} fetch page {$i}\t";
        $url_page = "{$url}?page={$i}";
        // Called here without using the return value; the request that
        // follows is outside this excerpt.
        timer();
/**
 * Extracts the translatable strings from an HTML document and replaces each
 * of them in the markup with a `{{$N$}}` placeholder, where N is the index of
 * the string in the collected list.
 *
 * Strings are collected, in this order, from:
 *  - `title` / `alt` attributes plus any attribute named in a
 *    `data-swete-translate-attrs` attribute;
 *  - the inner HTML of elements carrying a `translate` attribute, optionally
 *    split on the delimiters listed in `data-swete-delimiters`;
 *  - runs of adjacent text and inline nodes elsewhere in the document
 *    (script, style, `notranslate` and `translate` subtrees are excluded);
 *  - the `content` of `<meta name="keywords">` / `<meta name="description">`.
 *
 * Side effect: `$this->strings` ends up holding the collected strings, each
 * passed through `$this->cleanString()`.
 *
 * @param string $html The HTML to process.
 * @return string The document serialised with saveHtml(), placeholders in place.
 */
public function extractStrings($html)
{
    $dom = null;
    if ($this->useHtml5Parser) {
        $intro = substr($html, 0, 255);
        if (stripos($intro, '<!DOCTYPE html>') !== false) {
            // HTML5 doctype found near the top: use the HTML5 parser.
            require_once 'lib/HTML5.php';
            $options = new StdClass();
            $options->decorateDocument = function (DOMDocument $dom) {
                $dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
            };
            $dom = HTML5::loadHTML($html, $options);

            // noscript contents are treated like text, which causes problems
            // when filters/replacements are run on them, so drop them.
            // The node list is live: iterate over a snapshot, otherwise every
            // removal shifts the list and sibling elements get skipped.
            foreach (iterator_to_array($dom->getElementsByTagName('noscript')) as $noscript) {
                if ($noscript->parentNode) {
                    $noscript->parentNode->removeChild($noscript);
                }
            }
        }
    }

    if (!isset($dom)) {
        $dom = new DOMDocument();
        $dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
        // The XML declaration is a hack to make libxml read the input as UTF-8.
        @$dom->loadHtml('<?xml encoding="UTF-8">' . $html);
        // Remove the hack again (snapshot first; childNodes is a live list).
        foreach (iterator_to_array($dom->childNodes) as $item) {
            if ($item->nodeType == XML_PI_NODE) {
                $dom->removeChild($item);
            }
        }
        // ... and set the encoding properly.
        $dom->encoding = 'UTF-8';
    }

    $strings = array();
    // $this->strings aliases the local list, so helpers called below see the
    // strings collected so far.
    $this->strings =& $strings;
    $stringsIndex = array();
    $xpath = new DOMXPath($dom);
    $this->translateDates($xpath);

    // --- 1. Translatable attributes -------------------------------------
    $translateAttrs = $xpath->query('//*[@data-swete-translate-attrs or @alt or @title]');
    $otherAtts = array('title', 'alt');
    foreach ($translateAttrs as $el) {
        if ($el->hasAttribute('data-swete-translate-attrs')) {
            $attNames = explode(' ', $el->getAttribute('data-swete-translate-attrs'));
        } else {
            $attNames = array();
        }
        foreach ($otherAtts as $attName) {
            if ($el->hasAttribute($attName)) {
                $attNames[] = $attName;
            }
        }
        foreach ($attNames as $attName) {
            $attVal = $el->getAttribute($attName);
            if ($attVal && trim($attVal)) {
                $index = count($strings);
                $normalized = trim(_n($attVal));
                $strings[] = $normalized;
                $stringsIndex[$normalized] = $index;
                $el->setAttribute($attName, '{{$' . $index . '$}}');
            }
        }
    }

    // --- 2. Elements explicitly marked with [translate] -----------------
    $translatables = $xpath->query('//*[@translate]');
    foreach ($translatables as $tr) {
        $index = count($strings);
        // innerHTML is presumably provided by the registered
        // JSLikeHTMLElement node class.
        $trStr = trim(_n($tr->innerHTML));
        if ($tr->hasAttribute('data-swete-delimiters')) {
            $delim = trim($tr->getAttribute('data-swete-delimiters'));
            if ($delim) {
                // The first character of the attribute separates the
                // individual delimiters that follow it.
                $delimSplitter = $delim[0];
                $delimiters = array();
                foreach (explode($delimSplitter, $delim) as $delimiter) {
                    if (!trim($delimiter)) {
                        continue;
                    }
                    $delimiters[] = '(' . preg_quote($delimiter, '/') . ')';
                }
                $pattern = '/' . implode('|', $delimiters) . '/';
                $toks = preg_split($pattern, $trStr, -1, PREG_SPLIT_DELIM_CAPTURE);
                $innerHTML = array();
                foreach ($toks as $tokIdx => $tok) {
                    if (!trim($tok)) {
                        $innerHTML[] = $tok;
                    } elseif ($tokIdx % 2 === 1) {
                        // Odd positions are treated as captured delimiters.
                        $innerHTML[] = $tok;
                    } else {
                        $normalized = trim(_n($tok));
                        $strings[] = $normalized;
                        $stringsIndex[$normalized] = $index;
                        $innerHTML[] = '{{$' . $index . '$}}';
                        $index++;
                        if ($tok[strlen($tok) - 1] === ' ') {
                            $innerHTML[] = ' ';
                        }
                    }
                }
                $tr->innerHTML = implode('', $innerHTML);
                $trStr = '';
            }
        }
        if ($trStr) {
            $normalized = trim(_n($trStr));
            $strings[] = $normalized;
            $stringsIndex[$normalized] = $index;
            $tr->innerHTML = '{{$' . $index . '$}}';
        }
        foreach ($xpath->query('./text()', $tr) as $gchild) {
            $gchild->isCovered = 1;
        }
    }

    // --- 3. Elements marked [notranslate]: flag their direct text -------
    $untranslatables = $xpath->query('//*[@notranslate]');
    foreach ($untranslatables as $tr) {
        foreach ($xpath->query('./text()', $tr) as $gchild) {
            $gchild->isCovered = 1;
        }
    }

    // --- 4. Free text: group adjacent text/inline nodes -----------------
    $textX = $xpath->query('//text()[not(ancestor::script | ancestor::style | ancestor::*[@notranslate] | ancestor::*[@translate])]');
    // Copy into a plain array: the loop below removes and replaces nodes.
    $text = array();
    foreach ($textX as $x) {
        $text[] = $x;
    }

    $inlineTags = self::$inlineTags;

    // True when $node cannot be part of a run of inline content: it is
    // neither text nor inline (by tag name or data-swete-inline), or it is
    // an element flagged notranslate / data-swete-block.
    $endsGroup = function ($node) use ($inlineTags) {
        $isElement = $node instanceof DOMElement;
        if ($node->nodeType != XML_TEXT_NODE) {
            $tagName = $isElement ? strtolower($node->tagName) : '';
            $isInline = in_array($tagName, $inlineTags)
                || ($isElement && $node->hasAttribute('data-swete-inline'));
            if (!$isInline) {
                return true;
            }
        }
        return $isElement
            && ($node->hasAttribute('notranslate') || $node->hasAttribute('data-swete-block'));
    };

    // saveXml() emits <tag/> for every empty element. Keep that form only
    // for the listed tags and expand all others to <tag></tag>.
    // (Replaces create_function(), which no longer exists in PHP 8 and
    // used to be re-created on every loop iteration.)
    $selfClosingTags = array("br", "hr", "input", "frame", "img", "area", "link", "col", "base", "basefont", "param");
    $expandEmptyTags = function ($m) use ($selfClosingTags) {
        return in_array($m[1], $selfClosingTags) ? "<$m[1]$m[2]/>" : "<$m[1]$m[2]></$m[1]>";
    };

    foreach ($text as $tx) {
        if (!$tx instanceof DOMNode) {
            continue;
        }
        // Nodes removed by an earlier group no longer have a parent.
        if (!isset($tx->parentNode)) {
            continue;
        }
        if (!$tx->parentNode instanceof DOMElement) {
            continue;
        }
        $parent = $tx->parentNode;

        // The data-swete-translate is a little different than the notranslate
        // attribute. The notranslate attribute confers block level status to
        // its owner tag. data-swete-translate simply marks a segment of text
        // as not to be translated (or to be translated) within the flow of
        // the document. Therefore we don't use a text node whose parent has
        // data-swete-translate="0" as an anchor to start building a group of
        // text. But we will allow a tag with this to be included in a group
        // of text (that contains content before and/or after).
        // The SweteTools::encode() method will take care of variablizing the
        // content at translation time.
        if ($parent->hasAttribute('data-swete-translate')
            && $parent->getAttribute('data-swete-translate') === '0') {
            continue;
        }
        if (!trim($tx->nodeValue)) {
            continue;
        }
        if (in_array(strtolower($parent->tagName), array('comment', 'script', 'style', 'code'))) {
            continue;
        }
        if ($this->isCovered($tx)) {
            continue;
        }

        $group = array();
        $siblings = $parent->childNodes;
        if ($siblings->length > 0) {
            // Locate this text node among its siblings.
            $pos = -1;
            foreach ($siblings as $idx => $child) {
                if ($child === $tx) {
                    $pos = $idx;
                    break;
                }
            }
            $mypos = $pos;

            // Walk backwards to the first sibling of the inline run.
            for ($i = $pos; $i >= 0; $i--) {
                if ($endsGroup($siblings->item($i))) {
                    break;
                }
                $pos = $i;
            }

            // Only build the group when this node starts the run, or when
            // isFirstText() accepts it for the range found above.
            if ($mypos == $pos || $this->isFirstText($parent, $mypos, $pos)) {
                for ($i = $pos; $i < $siblings->length; $i++) {
                    $node = $siblings->item($i);
                    if (!$node || $endsGroup($node)) {
                        break;
                    }
                    $group[] = $node;
                }
            }
        } else {
            $group[] = $tx;
        }

        // Serialise the group; passing a node to saveXml() keeps its markup.
        $combinedText = array();
        foreach ($group as $item) {
            $combinedText[] = preg_replace_callback(
                '#<(\\w+)([^>]*)\\s*/>#s',
                $expandEmptyTags,
                $dom->saveXml($item)
            );
        }
        $combinedText = implode('', $combinedText);

        // Remember surrounding whitespace so it survives the replacement.
        $leadingWhiteSpace = '';
        $trailingWhiteSpace = '';
        if (preg_match('#^[\\p{Z}\\s]+#', $combinedText, $m1)) {
            $leadingWhiteSpace = $m1[0];
        }
        if (preg_match('#[\\p{Z}\\s]+$#', $combinedText, $m1)) {
            $trailingWhiteSpace = $m1[0];
        }

        $combinedText = _n($this->replaceStrings($combinedText));
        // NOTE(review): the needle is reproduced exactly as found in the
        // source; it may originally have been a non-breaking space — confirm.
        if (!trim(str_ireplace(' ', '', $combinedText))) {
            continue;
        }

        // Reuse the index of an identical string seen earlier.
        if (isset($stringsIndex[$combinedText])) {
            $index = $stringsIndex[$combinedText];
        } else {
            $index = count($strings);
            $strings[] = $combinedText;
            $stringsIndex[$combinedText] = $index;
        }

        // Flag the direct text children of every grouped node.
        foreach ($group as $gnode) {
            $gchildren = @$xpath->query('./text()', $gnode);
            if (!$gchildren) {
                continue;
            }
            foreach ($gchildren as $gchild) {
                $gchild->isCovered = 1;
            }
        }

        // Collapse the group into its first node: drop the rest ...
        $groupSize = count($group);
        for ($i = 1; $i < $groupSize; $i++) {
            if ($group[$i]->parentNode) {
                $group[$i]->parentNode->removeChild($group[$i]);
            }
        }
        if (empty($group[0]) || !$group[0]->parentNode) {
            continue;
        }
        // ... and swap the first node for the placeholder text.
        $textNodeContent = $leadingWhiteSpace . '{{$' . $index . '$}}' . $trailingWhiteSpace;
        $group[0]->parentNode->replaceChild($dom->createTextNode($textNodeContent), $group[0]);
    }

    // --- 5. Meta keywords and description -------------------------------
    foreach ($xpath->query('//meta[@name="keywords" or @name="description"]') as $el) {
        if (!$el->hasAttribute('content')) {
            continue;
        }
        $content = _n($el->getAttribute('content'));
        if (isset($stringsIndex[$content])) {
            $index = $stringsIndex[$content];
        } else {
            $index = count($strings);
            $strings[] = $content;
            $stringsIndex[$content] = $index;
        }
        $el->setAttribute('content', '{{$' . $index . '$}}');
    }

    $this->strings = array_map(array($this, 'cleanString'), $this->strings);

    return $dom->saveHtml();
}
/**
 * Create instance of Readability
 *
 * The markup is pre-processed with the `replaceBrs` and `replaceFonts`
 * patterns from `$this->regexps`, non-ASCII characters are turned into
 * numeric character references, and the result is parsed into `$this->dom`.
 * The `JSLikeHTMLElement` node class is then registered on that document.
 *
 * @param string $html   UTF-8 encoded string
 * @param string $url    (optional) URL associated with HTML (used for footnotes)
 * @param string $parser which parser to use for turning raw HTML into a
 *                       DOMDocument (either 'libxml' or 'html5lib';
 *                       'html5php' is accepted as well)
 */
public function __construct($html, $url = null, $parser = 'libxml')
{
    $this->url = $url;

    /* Turn all double br's into p's */
    // preg_replace() returns null on a PCRE error (e.g. backtrack limit);
    // keep the untouched input in that case instead of discarding the
    // whole document.
    $replaced = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }
    $replaced = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
    if ($replaced !== null) {
        $html = $replaced;
    }

    // Encode every non-ASCII code point as a numeric entity so the parser
    // does not depend on charset detection. This replaces
    // mb_convert_encoding(..., 'HTML-ENTITIES', ...), which is deprecated
    // as of PHP 8.2; ASCII (including & and <) is left untouched by both.
    $html = mb_encode_numericentity((string) $html, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    if (trim($html) == '') {
        $html = '<html></html>';
    }

    if ($parser == 'html5lib' || $parser == 'html5php') {
        if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
            $this->dom = HTML5::loadHTML($html);
        }
    }

    // Fall back to libxml when the HTML5 parser was not requested, is not
    // usable on this PHP version, or produced nothing.
    if ($this->dom === null) {
        $this->dom = new DOMDocument();
        $this->dom->preserveWhiteSpace = false;
        @$this->dom->loadHTML($html);
    }

    $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
}
/**
 * Parses an HTML string into a DOM document.
 *
 * When self::$USE_HTML5_PARSER is enabled and an HTML5 doctype appears in the
 * first 255 bytes, the HTML5 parser is used and all <noscript> elements are
 * removed from the result. Otherwise the markup is loaded with DOMDocument
 * (forced to UTF-8); any doctype is stripped first and, for XHTML input or
 * when SWETE_ENCODE_SCRIPTS is set, script bodies are passed through
 * SweteTools::_encode_scripts.
 *
 * @param string $html The HTML to parse.
 * @return DOMDocument The parsed document (for the HTML5 path, whatever
 *                     HTML5::loadHTML() returns).
 * @throws Exception If DOMDocument fails to load the markup. The original
 *                   input is dumped to a temp file whose path is logged.
 */
public static function loadHtml($html)
{
    // Kept for the failure dump below; $html itself is rewritten on the way.
    $orig = $html;
    $intro = substr($html, 0, 255);

    if (self::$USE_HTML5_PARSER && stripos($intro, '<!DOCTYPE html>') !== false) {
        // this is html5 so we'll use the html5 parser
        require_once 'lib/HTML5.php';
        $out = HTML5::loadHTML($html);

        // noscript contents are treated like text which causes problems when
        // filters/replacements are run on them. Let's just remove them.
        // getElementsByTagName() returns a live list, so iterate a snapshot;
        // removing while iterating the live list skips elements.
        foreach (iterator_to_array($out->getElementsByTagName('noscript')) as $noscript) {
            if ($noscript->parentNode) {
                $noscript->parentNode->removeChild($noscript);
            }
        }
        return $out;
    }

    $doc = new DOMDocument();

    // Remove the doctype tag if it is provided. We are going to output
    // a new doctype tag.
    if (stripos($intro, '<!DOCTYPE') !== false) {
        $html = preg_replace('/^[^<]*<\\!DOCTYPE[^>]+>/i', '', $html, 1);
    }

    // If we are dealing with XHTML then we need to do some special treatment
    // for scripts so that they don't trip us up with CDATA stuff.
    if ((defined('SWETE_ENCODE_SCRIPTS') && SWETE_ENCODE_SCRIPTS) || stripos($intro, 'XHTML') !== false) {
        $html = preg_replace_callback(
            '/(<script[^>]*>)([\\s\\S]*?)(<\\/script>)/',
            array('SweteTools', '_encode_scripts'),
            $html
        );
    }

    // An earlier experiment injected a UTF-8 Content-Type meta tag here to
    // fix encoding issues with the DOM parser; the optional
    // $USE_HTML5_PARSER flag above fixed that issue properly instead.

    // The XML declaration is a hack to make libxml read the input as UTF-8.
    $res = @$doc->loadHtml('<?xml encoding="UTF-8" ?' . '>' . $html);
    if (!$res) {
        // Dump the original input for debugging. tempnam() needs a
        // directory and a prefix, and may itself fail.
        $outfile = tempnam(sys_get_temp_dir(), 'swete_html_');
        if ($outfile !== false) {
            file_put_contents($outfile, $orig);
        }
        error_log("Failed to parse HTML: " . $outfile);
        throw new Exception("Failed to parse HTML");
    }

    // Remove the encoding hack again (snapshot first; childNodes is live).
    foreach (iterator_to_array($doc->childNodes) as $item) {
        if ($item->nodeType == XML_PI_NODE) {
            $doc->removeChild($item);
        }
    }
    // ... and set the encoding properly.
    $doc->encoding = 'UTF-8';

    return $doc;
}
/**
 * @brief Converts HTML from source space to proxy space.  This will convert
 * all URLs.
 *
 * Besides URL-bearing attributes (href, src, action and their
 * `_<proxyLang>` variants, handled by _domCallback()), this rewrites CSS in
 * <style> elements and style attributes via proxifyCss(), appends an
 * `x-swete-translation-<proxyLang>` class to <body>, and replaces the source
 * URL with the proxy URL inside inline scripts (both the raw and the
 * JSON-escaped form).
 *
 * @param string|DOMDocument $html The source HTML, or an already parsed document.
 * @return string The proxified HTML with URLs converted. When a string
 *                without <head or <body was passed, only the contents of the
 *                generated <body> are returned.
 * @throws Exception If the HTML cannot be parsed or $html is neither a
 *                   string nor a DOMDocument.
 */
public function proxifyHtml($html)
{
    $fullDoc = true;
    $doc = null;

    if (is_string($html)) {
        // Neither <head nor <body: treat the input as a fragment and only
        // return the body contents at the end.
        if (stripos($html, '<body') === false && stripos($html, '<head') === false) {
            $fullDoc = false;
        }

        if ($this->useHtml5Parser) {
            $intro = substr($html, 0, 255);
            if (stripos($intro, '<!DOCTYPE html>') !== false) {
                // this is html5 so we'll use the html5 parser
                require_once 'lib/HTML5.php';
                $doc = HTML5::loadHTML($html);

                // noscript contents are treated like text which causes
                // problems when filters/replacements are run on them, so
                // remove them. Iterate a snapshot: the node list is live and
                // removing during iteration would skip elements.
                foreach (iterator_to_array($doc->getElementsByTagName('noscript')) as $noscript) {
                    if ($noscript->parentNode) {
                        $noscript->parentNode->removeChild($noscript);
                    }
                }
            }
        }

        if (!isset($doc)) {
            $doc = new DOMDocument();
            // The XML declaration is a hack to force UTF-8 decoding.
            $res = @$doc->loadHtml('<?xml encoding="UTF-8">' . $html);
            if (!$res) {
                throw new Exception("Failed to convert to HTML. Expecting Object by got something else.");
            }
            // Remove the hack again (snapshot first; childNodes is live).
            foreach (iterator_to_array($doc->childNodes) as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $doc->removeChild($item);
                }
            }
            // ... and set the encoding properly.
            $doc->encoding = 'UTF-8';
        }
    } elseif ($html instanceof DOMDocument) {
        $doc = $html;
    }

    // Previously an unsupported argument reached `new DOMXPath(null)`.
    if (!$doc instanceof DOMDocument) {
        throw new Exception('proxifyHtml() expects an HTML string or a DOMDocument.');
    }

    $xpath = new DOMXPath($doc);

    // URL-bearing attributes.
    $matches = $xpath->query('//*[@href or @src or @action or @src_' . $this->_proxyLang . ' or @href_' . $this->_proxyLang . ']');
    foreach ($matches as $match) {
        $this->_domCallback($match);
    }

    // Stylesheets and inline styles.
    foreach ($xpath->query('//style') as $match) {
        $match->nodeValue = $this->proxifyCss($match->textContent);
    }
    foreach ($xpath->query('//*[@style]') as $match) {
        $match->setAttribute('style', $this->proxifyCss($match->getAttribute('style')));
    }

    // Tag the body with the translation language.
    foreach ($xpath->query('//body') as $b) {
        $class = '';
        if ($b->hasAttribute('class')) {
            $class = $b->getAttribute('class');
        }
        $class .= ' x-swete-translation-' . $this->_proxyLang;
        $b->setAttribute('class', $class);
    }

    // Now for the script tags. The URLs and patterns do not change between
    // scripts, so build them once. json_encode() output minus its
    // surrounding quotes gives the form the URL takes inside JS strings.
    $src = json_encode($this->_srcUrl);
    $src = substr($src, 1, strlen($src) - 2);
    $dest = json_encode($this->_proxyUrl);
    $dest = substr($dest, 1, strlen($dest) - 2);
    $rawPattern = '/\\b(' . preg_quote($this->_srcUrl, '/') . ')/';
    $jsonPattern = '/\\b(' . preg_quote($src, '/') . ')/';

    foreach ($xpath->query('//script/text()') as $txt) {
        if (!trim($txt->nodeValue)) {
            continue;
        }
        $txt->nodeValue = preg_replace($rawPattern, $this->_proxyUrl, $txt->nodeValue);
        $txt->nodeValue = preg_replace($jsonPattern, $dest, $txt->nodeValue);
    }

    if ($fullDoc) {
        return $doc->saveHtml();
    }

    // Fragment: serialise <body> and strip its own opening and closing tag.
    $body = $xpath->query('//body')->item(0);
    if (!$body) {
        // Nothing was parsed into a body (e.g. empty input).
        return '';
    }
    $out = $doc->saveXml($body);
    $start = strpos($out, '>') + 1;
    $end = strrpos($out, '<');
    if ($end === false || $end < $start) {
        // Empty body serialised as a single self-closing tag.
        return '';
    }

    return substr($out, $start, $end - $start);
}