/**
 * Converts a wiki-style list section into DOM elements appended to $sec.
 *
 * $beef is split on list markers found at the start of a line (runs of "#"
 * and/or "*"). The resulting tokens are consumed two at a time through
 * grabNextToken(): the first token of each pair is tested as the marker, the
 * second is used as the item text. When $aresteps is true and the marker is
 * a nested one ("#" followed by "#" or "*"), the item is collected in a
 * <substeps> element; that element is appended to $sec as soon as a
 * non-nested item follows, or at the end of the section.
 *
 * Items whose text is empty after cleanUpText() are skipped.
 *
 * @param DOMDocument $dom      Document used to create the new elements
 * @param DOMNode     $sec      Section node that receives the items
 * @param string      $beef     Raw list markup of the section
 * @param bool        $aresteps Whether nested markers produce <substeps>
 * @param string      $elem     Element name used for each list item
 *
 * @return void
 */
function processListSection(&$dom, &$sec, $beef, $aresteps = true, $elem = "step")
{
    $toks = preg_split("@(^[#\\*]+)@im", $beef, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
    $substeps = null;

    // NOTE(review): the loop only terminates if grabNextToken() consumes
    // tokens from $toks, i.e. if it declares its parameter by reference.
    // Call-time pass-by-reference ("f(&$x)") is a fatal error since PHP 5.4,
    // so the reference must come from the callee's signature - confirm.
    while (sizeof($toks) > 0) {
        // First token of the pair: the list marker.
        $marker = grabNextToken($toks);

        if ($aresteps && preg_match("@^#[#\\*]@", $marker)) {
            // Nested item: open a <substeps> container if none is pending.
            if ($substeps === null) {
                $substeps = $dom->createElement("substeps");
            }
        } else {
            // Back on the top level: flush the pending <substeps>, if any.
            if ($substeps !== null) {
                $sec->appendChild($substeps);
            }
            $substeps = null;
        }

        // Second token of the pair: the item text.
        $x = grabNextToken($toks);

        $s = $dom->createElement($elem);
        // $dom and $s are object handles, so handleImages() can still modify
        // the underlying objects without call-time "&".
        handleImages($x, $dom, $s);

        $x = cleanUpText($x);
        // Loose comparison kept on purpose so that a null result is skipped
        // exactly like an empty string.
        if ($x == "") {
            continue;
        }

        $t = $dom->createElement("text");
        $t->appendChild($dom->createTextNode($x));
        $s->appendChild($t);

        if ($substeps !== null) {
            $substeps->appendChild($s);
        } else {
            $sec->appendChild($s);
        }
    }

    // Flush a <substeps> container that was still open at the end.
    if ($substeps !== null) {
        $sec->appendChild($substeps);
    }
}
/**
 * Creates an issue's HTML
 *
 * Downloads every article with economistGetUrl(), extracts the article body,
 * rewrites links so they work inside one file and writes the result to
 * "<dateDirectory>/economist.html". Each article is followed by a page
 * break. Progress and error messages are echoed.
 *
 * $urls and $ids are read in parallel by numeric index: $ids[$i] becomes the
 * anchor id of the article downloaded from $urls[$i].
 *
 * @param string $dateDirectory Issue's directory
 * @param bool   $withImages    Gather images as well
 * @param array  $urls          Article URLs
 * @param array  $ids           Article IDs
 *
 * @return void
 */
function createHTML($dateDirectory, $withImages, $urls, $ids)
{
    $indexFile = $dateDirectory . '/economist.html';
    $indexFileH = fopen($indexFile, 'w');
    if ($indexFileH === false) {
        // Without the output file there is nothing useful left to do.
        echo "ERROR: cannot open {$indexFile} for writing!\n";
        return;
    }

    fwrite($indexFileH, '<head><meta http-equiv="content-type" content="text/html; charset=UTF-8"/></head>');
    fwrite($indexFileH, '<body>');

    // write all urls
    echo "Creating articles file: economist.html:\n";
    $urlCount = count($urls);
    for ($i = 0; $i < $urlCount; $i++) {
        // filter out some URLs
        if (stripos($urls[$i], '/comments') !== false
            || stripos($urls[$i], '/subscribe') !== false
            || stripos($urls[$i], '/covers') !== false
        ) {
            continue;
        }

        // Absolute URLs (http or https) are used as they are; everything
        // else is treated as a path on www.economist.com.
        $isAbsolute = substr($urls[$i], 0, 5) === 'http:' || substr($urls[$i], 0, 6) === 'https:';
        $url = $isAbsolute ? $urls[$i] : 'http://www.economist.com/' . $urls[$i];
        $url = str_replace('.com//', '.com/', $url);
        echo "\t{$url}\n";

        // download article
        $article = economistGetUrl($url);
        // A missing id yields an empty anchor instead of an undefined-offset notice.
        $articleId = isset($ids[$i]) ? $ids[$i] : '';
        if ($article === '') {
            echo "\t\tERROR in downloading!\n";
        }

        // Keep a copy of the most recently downloaded page in /tmp/article.
        $fh = fopen('/tmp/article', 'w');
        if ($fh !== false) {
            fwrite($fh, $article);
            fclose($fh);
        }

        $startPos = strpos($article, '<div id="ec-article-body"');
        $endPos = strpos($article, '<!-- /#ec-article-body -->');

        // strpos() returns false when nothing is found and may legitimately
        // return 0, so the results are compared against false explicitly.
        if ($startPos !== false && $endPos !== false && $endPos > $startPos) {
            // Parse regular content
            $content = substr($article, $startPos, $endPos - $startPos);

            //
            // 2011-07-26:
            // An update to economist.com changed
            //   <div id="ec-article-body">
            // to <div id="ec-article-body" class="clearfix">
            // To be more resilient to changes like this, only match the start of the DIV, then
            // eat characters up to the first '>'
            //
            $closeTag = strpos($content, '>');
            if ($closeTag !== false) {
                $content = substr($content, $closeTag + 1);
            }

            $headline = '(unknown headline)';
            if (preg_match('#<div class=.headline.>([^\\<]+)#', $content, $matches) > 0) {
                $headline = $matches[1];
            } elseif (preg_match('#<h[1-5] class=.headline.>([^\\<]+)#', $content, $matches) > 0) {
                $headline = $matches[1];
            }

            // add TOC link to article
            $content = preg_replace('/class="headline"/', 'class="headline" id="' . $articleId . '"', $content);
            echo "\t\t=> " . $headline . "\n";
        } else {
            // Parse other content (e.g. KAL's cartoon)
            $content = '';
            $startPos = strpos($article, '<div id="content">');
            $endPos = strpos($article, '<div id="add-comment-container">');

            // If either marker is missing the content stays empty, so the
            // "ERROR in processing!" check below reports the article instead
            // of a stray '</div>' being written to the output.
            if ($startPos !== false && $endPos !== false && $endPos > $startPos) {
                $content = substr($article, $startPos, $endPos - $startPos);

                // Start at the first <h1>, if there is one.
                $headingPos = strpos($content, '<h1>');
                if ($headingPos !== false) {
                    $content = substr($content, $headingPos);
                }

                // Cut after the first closing </div> and put that tag back.
                $divEndPos = strpos($content, '</div>');
                if ($divEndPos !== false) {
                    $content = substr($content, 0, $divEndPos) . '</div>';
                }

                // add TOC link to article
                $content = preg_replace('/<h1>/', '<h1 id="' . $articleId . '">', $content);
            }
        }

        // get rid of this: <p class="info">Feb 28th 2008<br>From <em>The Economist</em> print edition</p>
        $content = preg_replace('/<p class="info">(.*?)<\\/p>/', '', $content);

        // get rid of banner ads
        $content = preg_replace('/<div class="banner">([[:space:]])*<div align="center">(.*?)<\\/div>([[:space:]])*<\\/div>/s', '', $content);

        // fix topics links
        $content = preg_replace('/href="\\/topics\\//s', 'href="http://www.economist.com/topics/', $content);

        // allow inside-links
        $content = preg_replace('/<a href="displaystory.cfm\\?story_id=/s', '<a href="#', $content);
        $content = preg_replace('/<a href="\\/news\\/[a-zA-Z\\-]+\\/([0-9]+)[0-9a-zA-Z\\-]+/s', '<a href="#$1', $content);
        $content = preg_replace('/<a href="http:\\/\\/www.economist.com\\/news\\/[a-zA-Z\\-]+\\/([0-9]+)[0-9a-zA-Z\\-]+/s', '<a href="#$1', $content);

        // add extra line above section headers
        $content = preg_replace('#</a><br \\/>#', '</a><br><br>', $content);

        if ($withImages) {
            $content = handleImages($content, $dateDirectory);
        }

        if ($content === '') {
            echo "\t\tERROR in processing!\n";
        }

        fwrite($indexFileH, "\n\n");
        fwrite($indexFileH, $content);

        //add page break to bottom of each article
        fwrite($indexFileH, "\n<mbp:pagebreak/>");
    }

    fwrite($indexFileH, '</body>');
    fclose($indexFileH);
}