示例#1
0
/**
 * Builds a username => display-text map from the links found in an HTML page.
 *
 * Every <a> element whose href matches "/people/<something>" contributes one
 * entry: the captured tail of the href is the key and the anchor's text
 * content is the value. A later anchor with the same key overwrites an
 * earlier one.
 *
 * @param string $content HTML markup to scan.
 * @return array Map of captured username to the anchor's text content.
 */
function get_username_list($content)
{
    $usernames = array();
    $anchors = HTML5::loadHTML($content)->getElementsByTagName('a');
    foreach ($anchors as $anchor) {
        // Skip links that do not point at a profile page.
        if (!preg_match('%/people/(.+)$%', $anchor->getAttribute('href'), $hit)) {
            continue;
        }
        $usernames[$hit[1]] = $anchor->textContent;
    }
    return $usernames;
}
 // Fragment of a loop body: the enclosing loop header and the code that sets
 // $code, $content, $username, $url, $base_url and $n are outside this excerpt.
 // Print the status code, the value returned by timer() (labelled "ms") and
 // the running average reported by get_average() for the 'user page' bucket.
 $t = timer();
 $avg = intval(get_average($t, 'user page'));
 echo "[{$code}]\t{$t} ms\tAvg: {$avg} ms\n";
 // 404: log it, flag the user as fetched-but-failed, and move on to the next
 // iteration. The echoed message says the user does not exist.
 if ($code == 404) {
     slog("user {$username} fetch fail, code {$code}");
     User::updateByUserName($username, array('has_fetch' => true, 'fetch_fail' => true));
     echo "没有这个用户 {$username}\n";
     continue;
 }
 // Any other non-200 status is handled the same way; only the echoed message
 // differs (it reports an unexpected return code).
 if ($code != 200) {
     slog("user {$username} fetch fail, code {$code}");
     User::updateByUserName($username, array('has_fetch' => true, 'fetch_fail' => true));
     echo "奇奇怪怪的返回码 {$code}\n";
     continue;
 }
 // Parse the page and narrow the search to the element with id 'zh-pm-page-wrap'.
 // NOTE(review): getElementById() may return null when that id is absent, in
 // which case the foreach below would fail - confirm the page always has it.
 $dom = HTML5::loadHTML($content);
 $dom = $dom->getElementById('zh-pm-page-wrap');
 // Look for the avatar image, identified by an exact match on its class attribute.
 foreach ($dom->getElementsByTagName('img') as $key => $node) {
     if (($attr = $node->getAttribute('class')) == 'zm-profile-header-img zg-avatar-big zm-avatar-editor-preview') {
         $src = $node->getAttribute('src');
     }
 }
 // NOTE(review): $src is only assigned when a matching <img> was found; otherwise
 // it is undefined here (or still holds the value from a previous iteration).
 User::updateByUserName($username, array('avatar' => $src));
 // Save the answer links found on the first page.
 $link_list = get_answer_link_list($content);
 $rs = Answer::saveAnswer($base_url, $username, $link_list);
 // If get_page_num() reports more than one page, walk pages 2..$num.
 $num = get_page_num($content);
 if ($num > 1) {
     foreach (range(2, $num) as $i) {
         echo "\nNo. {$n} fetch page {$i}\t";
         $url_page = "{$url}?page={$i}";
         // presumably restarts the timer for the upcoming page request - confirm
         timer();
    /**
     * Extracts the translatable strings of an HTML document and replaces each
     * of them with a numbered placeholder of the form {{$N$}}.
     *
     * Processing order:
     *  1. Parse the markup (HTML5 parser when enabled and the document starts
     *     with an HTML5 doctype, libxml otherwise).
     *  2. Hand the XPath object to translateDates().
     *  3. Replace the values of `title`, `alt` and any attribute named in
     *     `data-swete-translate-attrs`.
     *  4. Replace the inner HTML of elements carrying a `translate` attribute,
     *     optionally split on the delimiters in `data-swete-delimiters`.
     *  5. Group the remaining text nodes with their inline siblings and
     *     replace each group with a single placeholder text node.
     *  6. Replace the content of the keywords/description meta tags.
     *
     * The extracted strings are stored in $this->strings (index N corresponds
     * to placeholder {{$N$}}) after being passed through cleanString().
     *
     * @param string $html The source HTML.
     * @return string The HTML with placeholders, as produced by DOMDocument::saveHtml().
     */
    public function extractStrings($html)
    {
        $dom = null;
        if ($this->useHtml5Parser) {
            $intro = substr($html, 0, 255);
            if (stripos($intro, '<!DOCTYPE html>') !== false) {
                // This is HTML5, so use the HTML5 parser.
                require_once 'lib/HTML5.php';
                $options = new StdClass();
                $options->decorateDocument = function (DOMDocument $dom) {
                    $dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
                };
                $dom = HTML5::loadHTML($html, $options);
                // noscript contents are treated like text, which causes problems when
                // filters/replacements are run on them, so remove them.  The node list
                // is live: take a snapshot first so that removing a node cannot make
                // the iteration skip its neighbour.
                foreach (iterator_to_array($dom->getElementsByTagName('noscript'), false) as $noscript) {
                    $noscript->parentNode->removeChild($noscript);
                }
            }
        }
        if (!isset($dom)) {
            $dom = new DOMDocument();
            $dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
            // The XML declaration is a hack to make libxml read the input as UTF-8.
            @$dom->loadHtml('<?xml encoding="UTF-8">' . $html);
            // Remove the hack again (snapshot first, childNodes is a live list).
            foreach (iterator_to_array($dom->childNodes, false) as $item) {
                if ($item->nodeType == XML_PI_NODE) {
                    $dom->removeChild($item);
                }
            }
            $dom->encoding = 'UTF-8';
        }

        $strings = array();
        // $this->strings is bound by reference to the local array, so every append
        // below is immediately visible through the property.
        $this->strings =& $strings;
        // Reverse lookup: string => index in $strings.
        $stringsIndex = array();
        $xpath = new DOMXPath($dom);
        $this->translateDates($xpath);

        // --- Attributes -------------------------------------------------------
        $translateAttrs = $xpath->query('//*[@data-swete-translate-attrs or @alt or @title]');
        $otherAtts = array('title', 'alt');
        foreach ($translateAttrs as $el) {
            if ($el->hasAttribute('data-swete-translate-attrs')) {
                $attNames = explode(' ', $el->getAttribute('data-swete-translate-attrs'));
            } else {
                $attNames = array();
            }
            foreach ($otherAtts as $attName) {
                if ($el->hasAttribute($attName)) {
                    $attNames[] = $attName;
                }
            }
            // An attribute may be named explicitly AND be title/alt.  Process it once
            // only, otherwise the second pass would register the placeholder written
            // by the first pass as a translatable string.
            foreach (array_unique($attNames) as $attName) {
                $attVal = $el->getAttribute($attName);
                if ($attVal and trim($attVal)) {
                    $normalized = trim(_n($attVal));
                    $index = count($strings);
                    $strings[] = $normalized;
                    $stringsIndex[$normalized] = $index;
                    $el->setAttribute($attName, '{{$' . $index . '$}}');
                }
            }
        }

        // --- Elements explicitly marked with the translate attribute -----------
        $translatables = $xpath->query('//*[@translate]');
        foreach ($translatables as $tr) {
            $index = count($strings);
            $trStr = trim(_n($tr->innerHTML));
            if ($tr->hasAttribute('data-swete-delimiters')) {
                $delim = trim($tr->getAttribute('data-swete-delimiters'));
                if ($delim) {
                    // The first character of the attribute is the separator of the
                    // delimiter list itself, e.g. "|,|;" declares "," and ";".
                    $delimiters = array();
                    foreach (explode($delim[0], $delim) as $delimiter) {
                        if (!trim($delimiter)) {
                            continue;
                        }
                        $delimiters[] = preg_quote($delimiter, '/');
                    }
                    // Without any usable delimiter the element is handled as a whole
                    // below (an empty pattern would split between every character).
                    if ($delimiters) {
                        // A single capture group around the alternation guarantees
                        // exactly one captured entry per match, so the tokens strictly
                        // alternate text, delimiter, text, ...  One group per delimiter
                        // would add empty entries for the groups that did not take
                        // part in a match and break the odd/even test below.
                        $pattern = '/(' . implode('|', $delimiters) . ')/';
                        $toks = preg_split($pattern, $trStr, -1, PREG_SPLIT_DELIM_CAPTURE);
                        $innerHTML = array();
                        foreach ($toks as $tokIdx => $tok) {
                            if (!trim($tok)) {
                                // Whitespace-only token: keep it verbatim.
                                $innerHTML[] = $tok;
                            } elseif ($tokIdx % 2 === 1) {
                                // It is a delimiter.
                                $innerHTML[] = $tok;
                            } else {
                                $normalized = trim(_n($tok));
                                $strings[] = $normalized;
                                $stringsIndex[$normalized] = $index;
                                $innerHTML[] = '{{$' . $index . '$}}';
                                $index++;
                                // The string is stored trimmed; keep a trailing space in
                                // the markup so adjacent tokens do not run together.
                                if ($tok[strlen($tok) - 1] === ' ') {
                                    $innerHTML[] = ' ';
                                }
                            }
                        }
                        $tr->innerHTML = implode('', $innerHTML);
                        $trStr = '';
                    }
                }
            }
            if ($trStr) {
                $normalized = trim(_n($trStr));
                $strings[] = $normalized;
                $stringsIndex[$normalized] = $index;
                $tr->innerHTML = '{{$' . $index . '$}}';
            }
            // Flag the direct text children so the text pass below skips them.
            foreach ($xpath->query('./text()', $tr) as $gchild) {
                $gchild->isCovered = 1;
            }
        }

        // --- Elements explicitly excluded with the notranslate attribute -------
        $untranslatables = $xpath->query('//*[@notranslate]');
        foreach ($untranslatables as $tr) {
            foreach ($xpath->query('./text()', $tr) as $gchild) {
                $gchild->isCovered = 1;
            }
        }

        // --- Remaining text nodes ---------------------------------------------
        $textX = $xpath->query('//text()[not(ancestor::script | ancestor::style | ancestor::*[@notranslate] | ancestor::*[@translate])]');
        // Snapshot the result: the loop below removes and replaces nodes.
        $text = array();
        foreach ($textX as $x) {
            $text[] = $x;
        }

        // A sibling ends a group of inline content when it is neither a text node
        // nor an inline element (inline tag name or data-swete-inline attribute),
        // or when it is an element carrying notranslate or data-swete-block.
        $inlineTags = self::$inlineTags;
        $isGroupBoundary = function ($node) use ($inlineTags) {
            $isElement = $node instanceof DOMElement;
            if ($node->nodeType != XML_TEXT_NODE) {
                $isInline = $isElement
                    and (in_array(strtolower($node->tagName), $inlineTags) or $node->hasAttribute('data-swete-inline'));
                if (!$isInline) {
                    return true;
                }
            }
            return $isElement and ($node->hasAttribute('notranslate') or $node->hasAttribute('data-swete-block'));
        };

        // saveXml() serialises empty elements as <tag/>.  Keep that form only for
        // the tags listed here and expand every other one to <tag></tag>.
        $xhtmlTags = array('br', 'hr', 'input', 'frame', 'img', 'area', 'link', 'col', 'base', 'basefont', 'param');
        $expandSelfClosing = function ($m) use ($xhtmlTags) {
            return in_array($m[1], $xhtmlTags) ? "<{$m[1]}{$m[2]}/>" : "<{$m[1]}{$m[2]}></{$m[1]}>";
        };

        foreach ($text as $tx) {
            if (!$tx instanceof DOMNode) {
                continue;
            }
            if (!isset($tx->parentNode)) {
                continue;
            }
            if (!$tx->parentNode instanceof DOMElement) {
                continue;
            }
            // The data-swete-translate attribute is a little different from the
            // notranslate attribute.  notranslate confers block-level status on its
            // owner tag; data-swete-translate simply marks a segment of text as not
            // to be translated (or to be translated) within the flow of the document.
            // Therefore a text node whose parent has data-swete-translate="0" is not
            // used as an anchor to start building a group of text, but such a tag may
            // be included in a group that contains content before and/or after it.
            // SweteTools::encode() takes care of variablizing the content at
            // translation time.
            if ($tx->parentNode->hasAttribute('data-swete-translate') and $tx->parentNode->getAttribute('data-swete-translate') === '0') {
                continue;
            }
            if (!trim($tx->nodeValue)) {
                continue;
            }
            if (in_array(strtolower($tx->parentNode->tagName), array('comment', 'script', 'style', 'code'))) {
                continue;
            }
            if ($this->isCovered($tx)) {
                continue;
            }

            $group = array();
            $parent = $tx->parentNode;
            $siblings = $parent->childNodes;
            if ($siblings->length > 0) {
                // Locate this text node among its siblings.
                $pos = -1;
                foreach ($siblings as $idx => $child) {
                    if ($child === $tx) {
                        $pos = $idx;
                        break;
                    }
                }
                $mypos = $pos;
                // Walk backwards to the first sibling of the inline run.
                for ($i = $pos; $i >= 0; $i--) {
                    if ($isGroupBoundary($siblings->item($i))) {
                        break;
                    }
                    $pos = $i;
                }
                if ($mypos == $pos or $this->isFirstText($parent, $mypos, $pos)) {
                    // Collect the run forwards from its first sibling.
                    for ($i = $pos; $i < $siblings->length; $i++) {
                        $node = $siblings->item($i);
                        if (!$node or $isGroupBoundary($node)) {
                            break;
                        }
                        $group[] = $node;
                    }
                }
            } else {
                $group[] = $tx;
            }

            // Serialise the group.  Passing a node to saveXml() serialises only
            // that node.
            $combinedText = '';
            foreach ($group as $item) {
                $combinedText .= preg_replace_callback('#<(\\w+)([^>]*)\\s*/>#s', $expandSelfClosing, $dom->saveXml($item));
            }

            // Remember the surrounding white space so it can be put back around
            // the placeholder.
            $leadingWhiteSpace = '';
            $trailingWhiteSpace = '';
            if (preg_match('#^[\\p{Z}\\s]+#', $combinedText, $m1)) {
                $leadingWhiteSpace = $m1[0];
            }
            if (preg_match('#[\\p{Z}\\s]+$#', $combinedText, $m1)) {
                $trailingWhiteSpace = $m1[0];
            }
            $combinedText = _n($this->replaceStrings($combinedText));
            if (!trim(str_ireplace('&nbsp;', '', $combinedText))) {
                continue;
            }
            // Reuse the index of an identical string extracted earlier.
            if (isset($stringsIndex[$combinedText])) {
                $index = $stringsIndex[$combinedText];
            } else {
                $index = count($strings);
                $strings[] = $combinedText;
                $stringsIndex[$combinedText] = $index;
            }
            // Flag the text children of the group members so they are not
            // picked up again by a later iteration.
            foreach ($group as $gnode) {
                $gchildren = @$xpath->query('./text()', $gnode);
                if (!$gchildren) {
                    continue;
                }
                foreach ($gchildren as $gchild) {
                    $gchild->isCovered = 1;
                }
            }
            // Drop every member but the first; the first is replaced by the
            // placeholder text node.
            $groupSize = count($group);
            for ($i = 1; $i < $groupSize; $i++) {
                if (@$group[$i]->parentNode) {
                    $group[$i]->parentNode->removeChild($group[$i]);
                }
            }
            if (!@$group[0]) {
                continue;
            }
            if (!@$group[0]->parentNode) {
                continue;
            }
            $textNodeContent = $leadingWhiteSpace . '{{$' . $index . '$}}' . $trailingWhiteSpace;
            $group[0]->parentNode->replaceChild($dom->createTextNode($textNodeContent), $group[0]);
        }

        // --- Meta keywords and description --------------------------------------
        foreach ($xpath->query('//meta[@name="keywords" or @name="description"]') as $el) {
            if (!$el->hasAttribute('content')) {
                continue;
            }
            $content = _n($el->getAttribute('content'));
            if (isset($stringsIndex[$content])) {
                $index = $stringsIndex[$content];
            } else {
                $index = count($strings);
                $strings[] = $content;
                $stringsIndex[$content] = $index;
            }
            $el->setAttribute('content', '{{$' . $index . '$}}');
        }

        $this->strings = array_map(array($this, 'cleanString'), $this->strings);
        return $dom->saveHtml();
    }
示例#4
0
 /**
  * Builds a Readability instance around a parsed copy of the given markup.
  *
  * Before parsing, the markup is normalised with the 'replaceBrs' and
  * 'replaceFonts' patterns from $this->regexps and converted to HTML
  * entities; blank input is replaced by an empty <html> document.
  *
  * @param string $html   UTF-8 encoded markup.
  * @param string $url    (optional) URL associated with the markup (used for footnotes).
  * @param string $parser Parser used to build the DOMDocument: 'libxml', or
  *                       'html5lib' / 'html5php' for the HTML5 parser.
  */
 function __construct($html, $url = null, $parser = 'libxml')
 {
     $this->url = $url;

     // Normalise the markup: runs of <br> become paragraph breaks and
     // <font> tags become <span> tags.
     $normalised = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
     $normalised = preg_replace($this->regexps['replaceFonts'], '<$1span>', $normalised);
     $normalised = mb_convert_encoding($normalised, 'HTML-ENTITIES', "UTF-8");
     $html = trim($normalised) == '' ? '<html></html>' : $normalised;

     // The HTML5 parser is only attempted on PHP 5.3.0 or newer.
     $wantsHtml5 = in_array($parser, array('html5lib', 'html5php'));
     if ($wantsHtml5 && version_compare(PHP_VERSION, '5.3.0', '>=')) {
         $this->dom = HTML5::loadHTML($html);
     }

     // Default path: libxml, with parser warnings suppressed.
     if (is_null($this->dom)) {
         $this->dom = new DOMDocument();
         $this->dom->preserveWhiteSpace = false;
         @$this->dom->loadHTML($html);
     }

     $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
 }
示例#5
0
 /**
  * Parses an HTML string into a DOM document.
  *
  * When self::$USE_HTML5_PARSER is set and the first 255 bytes contain an
  * HTML5 doctype, the HTML5 parser is used and all <noscript> elements are
  * removed from the result.  Otherwise the markup is parsed with libxml as
  * UTF-8 after stripping any doctype and, for XHTML input or when
  * SWETE_ENCODE_SCRIPTS is defined and truthy, after passing every script
  * block through SweteTools::_encode_scripts().
  *
  * If libxml cannot parse the markup, the original input is written to a
  * temporary file whose path is logged, and an exception is thrown.
  *
  * @param string $html The markup to parse.
  * @return DOMDocument The parsed document.
  * @throws Exception If libxml fails to parse the markup.
  */
 static function loadHtml($html)
 {
     // Keep the untouched input so it can be dumped if parsing fails.
     $orig = $html;
     $intro = substr($html, 0, 255);
     if (self::$USE_HTML5_PARSER and stripos($intro, '<!DOCTYPE html>') !== false) {
         // This is HTML5, so use the HTML5 parser.
         require_once 'lib/HTML5.php';
         $out = HTML5::loadHTML($html);
         // noscript contents are treated like text, which causes problems when
         // filters/replacements are run on them, so remove them.  The node list
         // is live: take a snapshot first so that removing a node cannot make
         // the iteration skip its neighbour.
         foreach (iterator_to_array($out->getElementsByTagName('noscript'), false) as $noscript) {
             $noscript->parentNode->removeChild($noscript);
         }
         return $out;
     }
     $doc = new DOMDocument();
     // Remove the doctype tag if it is provided.  We are going to output
     // a new doctype tag.
     if (stripos($intro, '<!DOCTYPE') !== false) {
         $html = preg_replace('/^[^<]*<\\!DOCTYPE[^>]+>/i', '', $html, 1);
     }
     // If we are dealing with XHTML then scripts need special treatment so that
     // their CDATA sections do not confuse the parser.
     $encodeScripts = (defined('SWETE_ENCODE_SCRIPTS') and SWETE_ENCODE_SCRIPTS);
     if ($encodeScripts or stripos($intro, 'XHTML') !== false) {
         $html = preg_replace_callback('/(<script[^>]*>)([\\s\\S]*?)(<\\/script>)/', array('SweteTools', '_encode_scripts'), $html);
     }
     // The XML declaration is a hack to make libxml read the input as UTF-8.
     $res = @$doc->loadHtml('<?xml encoding="UTF-8" ?' . '>' . $html);
     if (!$res) {
         // Dump the original input for post-mortem inspection.
         $outfile = tempnam(sys_get_temp_dir(), 'swete');
         if ($outfile !== false) {
             file_put_contents($outfile, $orig);
             error_log("Failed to parse HTML: " . $outfile);
         } else {
             error_log("Failed to parse HTML: (could not create a dump file)");
         }
         throw new Exception("Failed to parse HTML");
     }
     // Remove the encoding hack again (snapshot first, childNodes is a live list).
     foreach (iterator_to_array($doc->childNodes, false) as $item) {
         if ($item->nodeType == XML_PI_NODE) {
             $doc->removeChild($item);
         }
     }
     $doc->encoding = 'UTF-8';
     return $doc;
 }
示例#6
0
 /**
  * @brief Converts HTML from source space to proxy space.  This will convert
  * all URLs.
  *
  * The following parts of the document are rewritten:
  *  - every element with an href, src or action attribute (or the
  *    language-specific src_<lang> / href_<lang> variants) is passed to
  *    _domCallback();
  *  - <style> contents and style attributes are passed through proxifyCss();
  *  - every <body> gets the class x-swete-translation-<lang> appended;
  *  - occurrences of the source URL inside script text, in plain and in
  *    JSON-escaped form, are replaced by the proxy URL.
  *
  * @param string|DOMDocument $html The source HTML, or an already parsed document
  *   (which is modified in place).
  * @return string The proxified HTML with URLs converted.  If a string without
  *   <head> and <body> tags was given, only the contents of the body are
  *   returned; otherwise the whole document.
  * @throws Exception If the markup cannot be parsed.
  * @throws InvalidArgumentException If $html is neither a string nor a DOMDocument.
  */
 public function proxifyHtml($html)
 {
     $fullDoc = true;
     $doc = null;
     if (is_string($html)) {
         if (stripos($html, '<body') === false and stripos($html, '<head') === false) {
             $fullDoc = false;
         }
         if ($this->useHtml5Parser) {
             $intro = substr($html, 0, 255);
             if (stripos($intro, '<!DOCTYPE html>') !== false) {
                 // This is HTML5, so use the HTML5 parser.
                 require_once 'lib/HTML5.php';
                 $doc = HTML5::loadHTML($html);
                 // noscript contents are treated like text, which causes problems when
                 // filters/replacements are run on them, so remove them.  The node list
                 // is live: take a snapshot first so that removing a node cannot make
                 // the iteration skip its neighbour.
                 foreach (iterator_to_array($doc->getElementsByTagName('noscript'), false) as $noscript) {
                     $noscript->parentNode->removeChild($noscript);
                 }
             }
         }
         if (!isset($doc)) {
             $doc = new DOMDocument();
             // The XML declaration is a hack to make libxml read the input as UTF-8.
             $res = @$doc->loadHtml('<?xml encoding="UTF-8">' . $html);
             // Fail before touching a document that was never loaded.
             if (!$res) {
                 throw new Exception("Failed to convert to HTML.  Expecting Object by got something else.");
             }
             // Remove the encoding hack again (snapshot first, childNodes is a live list).
             foreach (iterator_to_array($doc->childNodes, false) as $item) {
                 if ($item->nodeType == XML_PI_NODE) {
                     $doc->removeChild($item);
                 }
             }
             $doc->encoding = 'UTF-8';
         }
     } elseif ($html instanceof DOMDocument) {
         $doc = $html;
     }
     if (!$doc instanceof DOMDocument) {
         // Without this guard the DOMXPath constructor below would fail on null.
         throw new InvalidArgumentException('proxifyHtml() expects an HTML string or a DOMDocument.');
     }

     $xpath = new DOMXPath($doc);

     // URL-carrying attributes.
     $matches = $xpath->query('//*[@href or @src or @action or @src_' . $this->_proxyLang . ' or @href_' . $this->_proxyLang . ']');
     foreach ($matches as $match) {
         $this->_domCallback($match);
     }

     // Style sheets and inline styles.
     foreach ($xpath->query('//style') as $match) {
         $match->nodeValue = $this->proxifyCss($match->textContent);
     }
     foreach ($xpath->query('//*[@style]') as $match) {
         $match->setAttribute('style', $this->proxifyCss($match->getAttribute('style')));
     }

     // Tag the body with the proxy language.
     foreach ($xpath->query('//body') as $b) {
         $class = '';
         if ($b->hasAttribute('class')) {
             $class = $b->getAttribute('class');
         }
         $class .= ' x-swete-translation-' . $this->_proxyLang;
         $b->setAttribute('class', $class);
     }

     // Now for the script tags.  URLs can occur verbatim or JSON-escaped
     // (json_encode() escapes the slashes); strip the surrounding quotes that
     // json_encode() adds to obtain the escaped form.
     $src = json_encode($this->_srcUrl);
     $src = substr($src, 1, strlen($src) - 2);
     $dest = json_encode($this->_proxyUrl);
     $dest = substr($dest, 1, strlen($dest) - 2);
     $plainPattern = '/\\b(' . preg_quote($this->_srcUrl, '/') . ')/';
     $jsonPattern = '/\\b(' . preg_quote($src, '/') . ')/';
     foreach ($xpath->query('//script/text()') as $txt) {
         if (!trim($txt->nodeValue)) {
             continue;
         }
         $txt->nodeValue = preg_replace($plainPattern, $this->_proxyUrl, $txt->nodeValue);
         $txt->nodeValue = preg_replace($jsonPattern, $dest, $txt->nodeValue);
     }

     if (!$fullDoc) {
         // The input was a fragment: serialise the body element and strip its
         // own opening and closing tag so only the contents remain.
         $out = $doc->saveXml($xpath->query('//body')->item(0));
         $start = strpos($out, '>') + 1;
         $end = strrpos($out, '<');
         $out = substr($out, $start, $end - $start);
     } else {
         $out = $doc->saveHtml();
     }
     return $out;
 }