コード例 #1
0
/**
 * Converts wiki-style list markup into DOM elements appended to a section node.
 *
 * $beef is split on runs of '#' / '*' found at the start of a line. The markers
 * themselves are kept in the token stream (PREG_SPLIT_DELIM_CAPTURE), so the
 * loop consumes tokens in pairs: first the marker, then the item text.
 *
 * When $aresteps is true and a marker is nested ('#' followed by another '#'
 * or '*'), consecutive nested items are collected inside a single <substeps>
 * element, which is appended to $sec as soon as a non-nested item follows or
 * the input ends. Items whose cleaned-up text is empty are dropped.
 *
 * grabNextToken(), handleImages() and cleanUpText() are defined outside this
 * block. NOTE(review): this code relies on grabNextToken() declaring its
 * parameter by reference (so the token is removed from $toks), and on
 * handleImages() declaring whichever parameters it mutates by reference --
 * confirm against their definitions.
 *
 * @param DOMDocument $dom      Document used to create the new elements
 * @param DOMNode     $sec      Node that receives the generated elements
 * @param string      $beef     Raw list markup
 * @param bool        $aresteps Whether nested markers are grouped into <substeps>
 * @param string      $elem     Tag name created for each list item
 *
 * @return void
 */
function processListSection(&$dom, &$sec, $beef, $aresteps = true, $elem = "step")
{
    $toks = preg_split("@(^[#\\*]+)@im", $beef, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    if ($toks === false) {
        // preg_split() failed (e.g. PCRE limit hit); there is nothing to process.
        return;
    }

    $substeps = null;
    while (sizeof($toks) > 0) {
        // First token of the pair: the list marker.
        // Call-time pass-by-reference (f(&$x)) is a fatal error since PHP 5.4;
        // the reference must be declared by the callee instead.
        $marker = grabNextToken($toks);
        if ($aresteps && preg_match("@^#[#\\*]@", $marker)) {
            if ($substeps === null) {
                $substeps = $dom->createElement("substeps");
            }
        } else {
            // Back at the top level: flush any pending group of sub-steps.
            if ($substeps !== null) {
                $sec->appendChild($substeps);
            }
            $substeps = null;
        }

        // Second token of the pair: the item text.
        $text = grabNextToken($toks);
        $item = $dom->createElement($elem);
        // Runs before cleanUpText(), as in the original ordering.
        handleImages($text, $dom, $item);
        $text = cleanUpText($text);
        if ($text == "") {
            // Nothing left to show for this item; $item is discarded.
            continue;
        }

        $textNode = $dom->createElement("text");
        $textNode->appendChild($dom->createTextNode($text));
        $item->appendChild($textNode);

        if ($substeps !== null) {
            $substeps->appendChild($item);
        } else {
            $sec->appendChild($item);
        }
    }

    // Flush a group of sub-steps that ran up to the end of the input.
    if ($substeps !== null) {
        $sec->appendChild($substeps);
    }
}
コード例 #2
0
/**
 * Creates an issue's HTML
 *
 * Downloads every article URL via economistGetUrl(), extracts the article body
 * and appends it to <dateDirectory>/economist.html, followed by a
 * <mbp:pagebreak/> marker. Progress and errors are echoed to standard output.
 *
 * URLs containing '/comments', '/subscribe' or '/covers' are skipped. Relative
 * URLs are prefixed with 'http://www.economist.com/'; URLs that already start
 * with 'http:' or 'https:' are used as-is.
 *
 * Articles that fail to download, or for which no content can be extracted,
 * are reported and skipped, so no empty page is written for them.
 *
 * economistGetUrl() and handleImages() are defined outside this block.
 *
 * @param string $dateDirectory Issue's directory
 * @param bool   $withImages    Gather images as well
 * @param array  $urls          Article URLs
 * @param array  $ids           Article IDs, indexed like $urls
 *
 * @return void
 *
 */
function createHTML($dateDirectory, $withImages, $urls, $ids)
{
    $indexFile = $dateDirectory . '/economist.html';
    $indexFileH = fopen($indexFile, 'w');
    if ($indexFileH === false) {
        // Without a writable index file nothing below can succeed.
        echo "ERROR: cannot open {$indexFile} for writing!\n";
        return;
    }
    fwrite($indexFileH, '<head><meta http-equiv="content-type" content="text/html; charset=UTF-8"/></head>');
    fwrite($indexFileH, '<body>');
    // write all urls
    echo "Creating articles file: economist.html:\n";
    foreach ($urls as $i => $rawUrl) {
        // filter out some URLs
        if (stristr($rawUrl, '/comments') !== false || stristr($rawUrl, '/subscribe') !== false || stristr($rawUrl, '/covers') !== false) {
            continue;
        }
        // Absolute URLs (http or https) are kept; everything else is site-relative.
        $isAbsolute = strncmp($rawUrl, 'http:', 5) === 0 || strncmp($rawUrl, 'https:', 6) === 0;
        $url = $isAbsolute ? $rawUrl : 'http://www.economist.com/' . $rawUrl;
        $url = str_replace('.com//', '.com/', $url);
        echo "\t{$url}\n";
        // download article
        $article = economistGetUrl($url);
        $articleId = isset($ids[$i]) ? $ids[$i] : '';
        // Debug aid kept from the original: dump the last downloaded page.
        $fh = fopen('/tmp/article', 'w');
        if ($fh !== false) {
            fwrite($fh, (string) $article);
            fclose($fh);
        }
        if (!is_string($article) || $article === '') {
            echo "\t\tERROR in downloading!\n";
            continue;
        }
        // strpos() returns false when the marker is missing, so compare strictly.
        $startPos = strpos($article, '<div id="ec-article-body"');
        $endPos = strpos($article, '<!-- /#ec-article-body -->');
        if ($startPos !== false && $endPos !== false && $endPos > $startPos) {
            // Parse regular content
            $content = substr($article, $startPos, $endPos - $startPos);
            //
            // 2011-07-26:
            // An update to economist.com changed
            //    <div id="ec-article-body">
            // to <div id="ec-article-body" class="clearfix">
            // To be more resilient to changes like this, only match the start of the DIV, then
            //   eat characters up to the first '>'
            //
            $closeTag = strpos($content, '>');
            if ($closeTag !== false) {
                $content = substr($content, $closeTag + 1);
            }
            $headline = '(unknown headline)';
            if (preg_match('#<div class=.headline.>([^\\<]+)#', $content, $matches) > 0) {
                $headline = $matches[1];
            } elseif (preg_match('#<h[1-5] class=.headline.>([^\\<]+)#', $content, $matches) > 0) {
                $headline = $matches[1];
            }
            // add TOC link to article (literal replacement, so the id is never
            // interpreted as a regex back-reference)
            $content = str_replace('class="headline"', 'class="headline" id="' . $articleId . '"', $content);
            echo "\t\t=> " . $headline . "\n";
        } else {
            // Parse other content (e.g. KAL's cartoon)
            $content = '';
            $startPos = strpos($article, '<div id="content">');
            $endPos = strpos($article, '<div id="add-comment-container">');
            if ($startPos !== false && $endPos !== false && $endPos > $startPos) {
                $section = substr($article, $startPos, $endPos - $startPos);
                // Start at the first <h1> when there is one, otherwise keep the whole section.
                $h1Pos = strpos($section, '<h1>');
                if ($h1Pos !== false) {
                    $section = substr($section, $h1Pos);
                }
                // Keep everything up to the first closing div and re-close it.
                $divEnd = strpos($section, '</div>');
                if ($divEnd !== false) {
                    $content = substr($section, 0, $divEnd) . '</div>';
                    // add TOC link to article
                    $content = str_replace('<h1>', '<h1 id="' . $articleId . '">', $content);
                }
            }
        }
        // get rid of this: <p class="info">Feb 28th 2008<br>From <em>The Economist</em> print edition</p>
        $content = preg_replace('/<p class="info">(.*?)<\\/p>/', '', $content);
        // get rid of banner ads
        $content = preg_replace('/<div class="banner">([[:space:]])*<div align="center">(.*?)<\\/div>([[:space:]])*<\\/div>/s', '', $content);
        // fix topics links
        $content = preg_replace('/href="\\/topics\\//s', 'href="http://www.economist.com/topics/', $content);
        // allow inside-links
        $content = preg_replace('/<a href="displaystory.cfm\\?story_id=/s', '<a href="#', $content);
        $content = preg_replace('/<a href="\\/news\\/[a-zA-Z\\-]+\\/([0-9]+)[0-9a-zA-Z\\-]+/s', '<a href="#$1', $content);
        $content = preg_replace('/<a href="http:\\/\\/www.economist.com\\/news\\/[a-zA-Z\\-]+\\/([0-9]+)[0-9a-zA-Z\\-]+/s', '<a href="#$1', $content);
        // add extra line above section headers
        $content = preg_replace('#</a><br \\/>#', '</a><br><br>', $content);
        if ($withImages) {
            $content = handleImages($content, $dateDirectory);
        }
        if ($content === '' || $content === null) {
            // Nothing usable was extracted; do not emit an empty page for it.
            echo "\t\tERROR in processing!\n";
            continue;
        }
        fwrite($indexFileH, "\n\n");
        fwrite($indexFileH, $content);
        //add page break to bottom of each article
        fwrite($indexFileH, "\n<mbp:pagebreak/>");
    }
    fwrite($indexFileH, '</body>');
    fclose($indexFileH);
}