public function test_getNode()
{
    $node = "<node><a>lorem</a><b>ipsum</b></node>";

    $parser = Mockery::mock("\\Prewk\\XmlStringStreamer\\ParserInterface");
    $parser->shouldReceive("getNodeFrom")
           ->with(Mockery::type("\\Prewk\\XmlStringStreamer\\StreamInterface"))
           ->once()
           ->andReturn($node);

    $stream = Mockery::mock("\\Prewk\\XmlStringStreamer\\StreamInterface");

    $streamer = new XmlStringStreamer($parser, $stream);

    $this->assertEquals($node, $streamer->getNode(), "Node received from the parser should be what was expected");
}
/**
 * @param string     $url
 * @param XMLHandler $handler
 * @return int
 */
public function parse($url, XMLHandler $handler)
{
    $stream = new Stream\Guzzle($url, self::CHUNK_SIZE);
    $parser = new Parser\StringWalker();
    $streamer = new XmlStringStreamer($parser, $stream);

    $countOfProducts = 0;
    while ($node = $streamer->getNode()) {
        $simpleXmlNode = simplexml_load_string($node);
        $handler->perform($simpleXmlNode);
        $countOfProducts++;
    }

    return $countOfProducts;
}
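A minimal sketch of how the parse() method above might be driven. EchoHandler, ProductFeedParser, the feed URL and the <name> element are assumptions for illustration, not part of the original code; only the XMLHandler::perform() call is implied by the snippet.

// EchoHandler, ProductFeedParser, the URL and the <name> element are assumptions.
class EchoHandler implements XMLHandler
{
    public function perform($simpleXmlNode)
    {
        echo (string) $simpleXmlNode->name, PHP_EOL;
    }
}

$importer = new ProductFeedParser(); // hypothetical class exposing parse()
$count = $importer->parse('http://example.com/products.xml', new EchoHandler());
echo "Parsed {$count} products", PHP_EOL;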
public function run()
{
    Countrycode::truncate();

    $CHUNK_SIZE = 1024;
    $streamProvider = new Stream\File(dirname(__FILE__) . "/countrycodes.xml", $CHUNK_SIZE);

    $config = array("uniqueNode" => "row");
    $parser = new Parser\UniqueNode($config);

    $streamer = new XmlStringStreamer($parser, $streamProvider);

    while ($node = $streamer->getNode()) {
        $simpleXmlNode = simplexml_load_string($node);
        Countrycode::create([
            'countrycode' => $simpleXmlNode->field[0],
            'country'     => $simpleXmlNode->field[1],
        ]);
    }

    $this->command->info('Countrycode table seeded!');
}
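The seeder above implies a particular layout for countrycodes.xml: a repeated <row> element holding two <field> children. A self-contained sketch of that assumed shape, with a made-up wrapping element, sample values and a temporary file:

// The <resultset> wrapper and the sample values are assumptions; only <row> and <field> are implied by the seeder.
$xml = '<resultset>'
     . '<row><field>CA</field><field>Canada</field></row>'
     . '<row><field>FR</field><field>France</field></row>'
     . '</resultset>';
$tmp = tempnam(sys_get_temp_dir(), 'cc');
file_put_contents($tmp, $xml);

$streamer = new \Prewk\XmlStringStreamer(
    new \Prewk\XmlStringStreamer\Parser\UniqueNode(array("uniqueNode" => "row")),
    new \Prewk\XmlStringStreamer\Stream\File($tmp, 1024)
);
while ($node = $streamer->getNode()) {
    $row = simplexml_load_string($node);
    echo $row->field[0], ' => ', $row->field[1], PHP_EOL; // CA => Canada, FR => France
}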
/**
 * {@inheritdoc}
 */
public function isSpamReferrer(Url $url)
{
    $url = $url->toArray();

    if (!isset($url['registerableDomain'], $url['host'], $url['publicSuffix'])) {
        return false;
    }

    $provider = new File($this->file, 1024);
    $parser = new XmlStringStreamer\Parser\StringWalker();
    $streamer = new XmlStringStreamer($parser, $provider);

    while ($node = $streamer->getNode()) {
        $domain = (string) simplexml_load_string($node);
        if (in_array($domain, [$url['registerableDomain'], $url['host'], $url['publicSuffix']])) {
            return true;
        }
    }

    return false;
}
/**
 * @param string|XmlStringStreamer $xml
 * @return object[]
 */
public function unserialize($xml)
{
    $hydrator = $this->buildHydrator('xml', 'hydrate');
    $class = $this->getOptions()->getClass();
    $classes = array();

    if ($xml instanceof XmlStringStreamer) {
        // Streamed input: hydrate one node at a time.
        while ($node = $xml->getNode()) {
            $node = simplexml_load_string($node);
            $classes[] = $hydrator->hydrate((array) $node, new $class());
        }
    } else {
        // Plain XML string: hydrate every child of the document element.
        $docElement = simplexml_load_string($xml)->children();
        $name = $docElement->getName();
        foreach ($docElement->{$name} as $node) {
            $classes[] = $hydrator->hydrate((array) $node, new $class());
        }
    }

    return $classes;
}
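A short sketch of the two input types unserialize() accepts, assuming a hypothetical $serializer instance exposing the method and a target class configured elsewhere; the file name and the inline XML are placeholders.

// $serializer is a hypothetical instance of the class that defines unserialize().
$streamer = \Prewk\XmlStringStreamer::createStringWalkerParser('books.xml'); // streamed input
$fromStream = $serializer->unserialize($streamer);
$fromString = $serializer->unserialize('<books><book><title>A</title></book><book><title>B</title></book></books>'); // string input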
protected function execute(InputInterface $input, OutputInterface $output) { $this->users = []; $this->filesystem = new Filesystem(); $this->titleFilter = new TitleFilter(); $displayIndex = $input->getOption('indexes'); $displayAuthor = $input->getOption('display-author'); $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through $revMaxHops = (int) $input->getOption('max-revs'); // Maximum number of revisions per page we go through $listMissed = $input->getOption('missed'); $counter = 0; // Increment the number of pages we are going through $redirects = []; $pages = []; $urlParts = []; $missedIndexes = []; $urlsWithContent = []; $moreThanHundredRevs = []; $translations = []; $sanity_redirs = []; $directlyOnRoot = []; $rev_count = []; // So we can know what’s the average // Pages we have to make sure aren’t duplicate on the CMS prior // to the final migration. $temporary_acceptable_duplicates = []; //$temporary_acceptable_duplicates[] = 'css/selectors/pseudo-classes/:lang'; // DONE if ($listMissed === true) { $output->writeln('We are going to try to give you XML indexes to use for --retry=..., we will therefore limit the revision loops to one.'); $missed_file = DATA_DIR . '/missed.yml'; if (realpath($missed_file) === false) { throw new Exception(sprintf('Could not find missed file at %s', $missed_file)); } $missedFileContents = file_get_contents($missed_file); $parser = new Yaml\Parser(); try { $missed = $parser->parse($missedFileContents); } catch (Exception $e) { throw new Exception(sprintf('Could not get file %s contents to be parsed as YAML. Is it in YAML format?', $missed_file), null, $e); } if (!isset($missed['missed'])) { throw new Exception('Please ensure missed.yml has a list of titles under a "missed:" top level key'); } $revMaxHops = 1; $this->missed = $missed['missed']; } /** * Last minute redirects. Order matters. */ $redirects['after'] = 'css/selectors/pseudo-elements/after'; $redirects['tutorials/What_is_CSS'] = 'tutorials/learning_what_css_is'; $redirects['html/attributes/type type (a, link, embed)'] = 'html/attributes/type'; /* -------------------- Author -------------------- * * Author array of MediaWikiContributor objects with $this->users[$uid], * where $uid is MediaWiki user_id. * * You may have to increase memory_limit value, * but we’ll load this only once. **/ $users_file = DATA_DIR . '/users.json'; $users_loop = json_decode(file_get_contents($users_file), 1); foreach ($users_loop as &$u) { $uid = (int) $u['user_id']; $this->users[$uid] = new MediaWikiContributor($u); unset($u); // Dont fill too much memory, if that helps. } /* -------------------- /Author -------------------- **/ /* -------------------- XML source -------------------- **/ $file = DATA_DIR . '/dumps/main_full.xml'; $streamer = XmlStringStreamer::createStringWalkerParser($file); /* -------------------- /XML source -------------------- **/ while ($node = $streamer->getNode()) { if ($maxHops > 0 && $maxHops === $counter) { $output->writeln(sprintf('Reached desired maximum of %d loops', $maxHops) . PHP_EOL . 
PHP_EOL); break; } $pageNode = new SimpleXMLElement($node); if (isset($pageNode->title)) { $wikiDocument = new MediaWikiDocument($pageNode); $persistable = new GitCommitFileRevision($wikiDocument, 'out/content/', '.md'); $title = $wikiDocument->getTitle(); $normalized_location = $wikiDocument->getName(); $file_path = $this->titleFilter->filter($persistable->getName()); $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect()); // False if not a redirect, string if it is $is_translation = $wikiDocument->isTranslation(); $language_code = $wikiDocument->getLanguageCode(); $language_name = $wikiDocument->getLanguageName(); $revs = $wikiDocument->getRevisions()->count(); $output->writeln(sprintf('"%s":', $title)); if ($displayIndex === true) { $output->writeln(sprintf(' - index: %d', $counter)); } $output->writeln(sprintf(' - normalized: %s', $normalized_location)); $output->writeln(sprintf(' - file: %s', $file_path)); if ($wikiDocument->hasRedirect() === true) { $output->writeln(sprintf(' - redirect_to: %s', $redirect_to)); } else { $urlsWithContent[] = $title; foreach (explode('/', $normalized_location) as $urlDepth => $urlPart) { $urlPartKey = strtolower($urlPart); $urlParts[$urlPartKey] = $urlPart; $urlPartsAll[$urlPartKey][] = $urlPart; } } if ($is_translation === true) { $output->writeln(sprintf(' - lang: %s (%s)', $language_code, $language_name)); } if ($listMissed === true && in_array($normalized_location, $this->missed)) { $missedIndexes[$counter] = $title; } $output->writeln(sprintf(' - revs: %d', $revs)); $output->writeln(sprintf(' - revisions:')); $revList = $wikiDocument->getRevisions(); $revLast = $wikiDocument->getLatest(); $revCounter = 0; /* ----------- REVISION --------------- **/ for ($revList->rewind(); $revList->valid(); $revList->next()) { if ($revMaxHops > 0 && $revMaxHops === $revCounter) { $output->writeln(sprintf(' - stop: Reached maximum %d revisions', $revMaxHops) . PHP_EOL . PHP_EOL); break; } $wikiRevision = $revList->current(); $revision_id = $wikiRevision->getId(); /* -------------------- Author -------------------- **/ // An edge case where MediaWiki may give author as user_id 0, even though we dont have it // so we’ll give the first user instead. $contributor_id = $wikiRevision->getContributorId() === 0 ? 1 : $wikiRevision->getContributorId(); if (isset($this->users[$contributor_id])) { $contributor = clone $this->users[$contributor_id]; // We want a copy, because its specific to here only anyway. $wikiRevision->setContributor($contributor, false); } else { // In case we didn’t find data for $this->users[$contributor_id] $contributor = clone $this->users[1]; // We want a copy, because its specific to here only anyway. 
$wikiRevision->setContributor($contributor, false); } /* -------------------- /Author -------------------- **/ $output->writeln(sprintf(' - id: %d', $revision_id)); if ($displayIndex === true) { $output->writeln(sprintf(' index: %d', $revCounter)); } $persistArgs = $persistable->setRevision($wikiRevision)->getArgs(); foreach ($persistArgs as $argKey => $argVal) { if ($argKey === 'message') { $argVal = trim(mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100)); } if ($argKey === 'message' && empty($argVal)) { // Lets not pollute report with empty messages continue; } if ($displayAuthor === false && $argKey === 'author') { continue; } $output->writeln(sprintf(' %s: %s', $argKey, $argVal)); } if ($revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) { $output->writeln(' is_last_and_has_redirect: True'); } ++$revCounter; } /* ----------- REVISION --------------- */ $rev_count[] = $revs; // Which pages are directly on /wiki/foo. Are there some we // should move elsewhere such as the glossary items? if (count(explode('/', $title)) == 1 && $wikiDocument->hasRedirect() === false) { $directlyOnRoot[] = $title; } if ($revs > 99) { $moreThanHundredRevs[] = sprintf('%s (%d)', $title, $revs); } if ($is_translation === true && $wikiDocument->hasRedirect() === false) { $translations[] = $title; } // The ones with invalid URL characters that shouldn’t be part of // a page name because they may confuse with their natural use (:,(,),!,?) if ($title !== $normalized_location && $wikiDocument->hasRedirect() === false) { $sanity_redirs[$title] = $normalized_location; } // We have a number of pages, some of them had been // deleted or erased with a redirect left behind. // // Since we want to write to files all pages that currently // has content into a filesystem, we have to generate a file // name that can be stored into a filesystem. We therefore have // to normalize the names. // // We don’t want to have two entries with the same name. // // If a redirect (i.e. an empty file) exist, let’s set keep it // separate from the pages that still has content. // // Sanity check; // 1. Get list of redirects // 2. Get list of pages // // If we have a page duplicate, throw an exception! if ($wikiDocument->hasRedirect() === true) { // Pages we know are redirects within MediaWiki, we won’t // pass them within the $pages aray because they would be // empty content with only a redirect anyway. if ($normalized_location !== $redirect_to) { $redirects[str_replace('_', ' ', $normalized_location)] = $redirect_to; } } elseif (!in_array($normalized_location, array_keys($pages))) { // Pages we know has content, lets count them! if ($wikiDocument->hasRedirect() === false) { $pages[$normalized_location] = $title; } } elseif (in_array($title, $temporary_acceptable_duplicates)) { // Lets not throw, we got that covered. } else { // Hopefully we should never encounter this. $previous = $pages[$normalized_location]; $duplicatePagesExceptionText = 'We have duplicate entry for %s it ' . 'would be stored in %s which would override content of %s'; throw new Exception(sprintf($duplicatePagesExceptionText, $title, $file_path, $previous)); } $output->writeln(PHP_EOL . 
PHP_EOL); ++$counter; } } /* * Work some numbers on number of edits * * - Average * - Median */ $total_edits = 0; sort($rev_count); $edit_average = array_sum($rev_count) / $counter; // Calculate median $value_in_middle = floor(($counter - 1) / 2); if ($counter % 2) { // odd number, middle is the median $edit_median = $rev_count[$value_in_middle]; } else { // even number, calculate avg of 2 medians $low = $rev_count[$value_in_middle]; $high = $rev_count[$value_in_middle + 1]; $edit_median = ($low + $high) / 2; } $numbers = array('Numbers:'); $numbers[] = sprintf(' - "iterations": %d', $counter); $numbers[] = sprintf(' - "content pages": %d', count($pages)); $numbers[] = sprintf(' - "redirects": %d', count($redirects)); $numbers[] = sprintf(' - "translated": %d', count($translations)); $numbers[] = sprintf(' - "not in a directory": %d', count($directlyOnRoot)); $numbers[] = sprintf(' - "redirects for URL sanity": %d', count($sanity_redirs)); $numbers[] = sprintf(' - "edits average": %d', $edit_average); $numbers[] = sprintf(' - "edits median": %d', $edit_median); $this->filesystem->dumpFile('reports/numbers.txt', implode($numbers, PHP_EOL)); $this->filesystem->dumpFile('reports/hundred_revs.txt', implode($moreThanHundredRevs, PHP_EOL)); natcasesort($translations); $this->filesystem->dumpFile('reports/translations.txt', implode(PHP_EOL, $translations)); natcasesort($directlyOnRoot); $this->filesystem->dumpFile('reports/directly_on_root.txt', implode(PHP_EOL, $directlyOnRoot)); natcasesort($urlsWithContent); $this->filesystem->dumpFile('reports/url_all.txt', implode(PHP_EOL, $urlsWithContent)); natcasesort($urlParts); $this->filesystem->dumpFile('reports/url_parts.txt', implode(PHP_EOL, $urlParts)); // Creating list for https://github.com/webplatform/mediawiki-conversion/issues/2 ksort($urlPartsAll); $urlPartsAllOut = array('All words that exists in an URL, and the different ways they are written (needs harmonizing!):'); foreach ($urlPartsAll as $urlPartsAllKey => $urlPartsAllRow) { $urlPartsAllEntryUnique = array_unique($urlPartsAllRow); if (count($urlPartsAllEntryUnique) > 1) { $urlPartsAllOut[] = sprintf(' - %s', implode(', ', $urlPartsAllEntryUnique)); } } $this->filesystem->dumpFile('reports/url_parts_variants.txt', implode(PHP_EOL, $urlPartsAllOut)); ksort($redirects, SORT_NATURAL | SORT_FLAG_CASE); ksort($sanity_redirs, SORT_NATURAL | SORT_FLAG_CASE); $nginx_redirects = []; $nginx_redirects[] = 'rewrite ^/wiki/((Special|Template|User).*) /disabled?r=$1 permanent;'; $nginx_redirects[] = 'rewrite ^/w/(.*) /disabled?r=$1 permanent;'; $nginx_redirects[] = 'rewrite ^/$ /Main_Page permanent;'; $nginx_redirects[] = 'rewrite ^/wiki/?$ /Main_Page permanent;'; // /wiki/tutorials/canvas/canvas_tutorial //$nginx_redirects[] = 'rewrite ^/wiki/canvas/tutorial(.*)$ /wiki/tutorials/canvas$1 permanent;'; $nginx_redirects[] = 'rewrite ^/wiki/WPD\\:Community$ /community permanent;'; $nginx_redirects[] = 'rewrite ^/wiki/WPD\\:Contributors_Guide$ /contribute permanent;'; $nginx_esc[':'] = '\\:'; $nginx_esc['('] = '\\('; $nginx_esc[')'] = '\\)'; $nginx_esc[','] = '\\,'; $nginx_esc[' '] = '(\\ |_)'; // Ordering matter, otherwise the () will be escaped and we want them here! $prepare_nginx_redirects = array_merge($sanity_redirs, $redirects); foreach ($prepare_nginx_redirects as $url => $redirect_to) { // NGINX Case-insensitive redirect? Its done through (?i)! Should be documented!!! 
$nginx_redirects[] = sprintf('rewrite (?i)^/wiki/%s$ /%s permanent;', str_replace(array_keys($nginx_esc), $nginx_esc, $url), $redirect_to); } $nginx_redirects[] = 'rewrite ^/wiki/(.*) /$1 permanent;'; // Has to be the last! $this->filesystem->dumpFile('reports/nginx_redirects.map', implode(PHP_EOL, $nginx_redirects)); $sanity_redirects_out = array('URLs to return new Location (from => to):'); foreach ($sanity_redirs as $title => $sanitized) { $sanity_redirects_out[] = sprintf(' - "%s": "%s"', $title, $sanitized); } $this->filesystem->dumpFile('reports/sanity_redirects.txt', implode(PHP_EOL, $sanity_redirects_out)); $redirects_out = array('Redirects (from => to):'); foreach ($redirects as $url => $redirect_to) { $redirects_out[] = sprintf(' - "%s": "%s"', $url, $redirect_to); } $this->filesystem->dumpFile('reports/redirects.txt', implode(PHP_EOL, $redirects_out)); if ($listMissed === true) { $yaml = new Yaml\Dumper(); $yaml->setIndentation(2); try { $missed_out = $yaml->dump($missedIndexes, 3, 0, false, false); } catch (Exception $e) { $missed_out = sprintf('Could not create YAML out of missedIndexes array; Error was %s', $e->getMessage()); } $this->filesystem->dumpFile('reports/missed_retry_argument.txt', 'app/console mediawiki:run 3 --retry=' . implode(',', array_keys($missedIndexes))); $this->filesystem->dumpFile('reports/missed_entries.yml', 'Missed:' . PHP_EOL . $missed_out); $output->writeln('Created missed_retry_argument.txt and missed_entries.yml in reports/ you can try to recover!'); } }
/**
 * TEST TEST TEST
 * This method will try to return entities instead of a response
 * @TODO Use XML instead
 * @TODO Maybe use this https://github.com/prewk/xml-string-streamer-guzzle
 * @TODO Or this http://dk2.php.net/manual/en/function.xml-parse.php
 * @TODO Maybe create my own parser: http://php.net/manual/en/example.xml-structure.php
 *
 * @param int $page
 * @param int $pageSize
 * @return \Generator yields \SimpleXMLElement nodes
 */
public function getProductPageAsEntities($page, $pageSize)
{
    $response = $this->getProductPage($page, $pageSize);

    $stream = new Stream\Guzzle('');
    $stream->setGuzzleStream($response->getBody());

    $parser = new Parser\StringWalker();
    $streamer = new XmlStringStreamer($parser, $stream);

    while ($node = $streamer->getNode()) {
        $xml = new \SimpleXMLElement($node, LIBXML_NOERROR);
        //$entity = new Entity\ProductData();
        yield $xml;
    }
}
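Because the method above yields nodes rather than returning a response, a caller would consume it lazily with foreach. A minimal sketch, where $catalog and the <sku> element are assumptions:

// $catalog is a hypothetical instance of the class defining getProductPageAsEntities().
foreach ($catalog->getProductPageAsEntities(1, 50) as $productXml) {
    echo (string) $productXml->sku, PHP_EOL; // <sku> is an assumed element name
}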
/**
 *
 */
public static function convertPendingXMLtoJSON($filename, $logger)
{
    // Number of processed nodes (books)
    $count = 0;
    // Local file name
    $local_file = DATA_PENDING_DIR . $filename;
    // Output file name
    $json_file = DATA_OUTPUT_DIR . "{$filename}.json";
    // Size of the local XML file, used to report progress
    $totalSize = filesize($local_file);
    $start_timestamp = date('Y-m-d H:i:s');

    // Set up streaming and progress monitoring with a 16 KB buffer
    $progress = 0;
    $last_progress = 0;
    $stream = new File($local_file, 16384, function ($chunk, $readBytes) use ($progress, &$last_progress, $totalSize, $logger) {
        $progress = $readBytes / $totalSize;
        // Report every 10%
        if ($progress >= $last_progress + 0.1) {
            $logger->log("Progress: {$progress}");
            $last_progress = $last_progress + 0.1;
        }
    });
    $start_timestamp = date('Y-m-d H:i:s');

    // Configure the parser
    $parser = new StringWalker();
    // Configure the streamer
    $streamer = new XmlStringStreamer($parser, $stream);

    // Create the output file
    $file = fopen($json_file, "w") or die(json_encode("Could not open {$json_file} for writing"));
    $logger->log("Converting {$local_file} to {$json_file}...");

    // Process the nodes
    while ($node = $streamer->getNode()) {
        // Build a JSON string ready for MongoDB insertion
        $json_string = Utils::getBookJSONFromXMLNode($node);
        // Append the string to the output file
        fputs($file, $json_string . PHP_EOL);
        $count++;
    }

    if ($count == 0) {
        $logger->error("0 Records converted");
    } else {
        $logger->log("{$count} Records converted");
    }

    // Close the output file
    fclose($file);
    // Clear the stat cache for this run
    clearstatcache();

    return $count;
}
public function test_StringWalker_parser_with_file_shorter_than_buffer()
{
    $file = __DIR__ . "/../../xml/short.xml";

    $stream = new XmlStringStreamer\Stream\File($file, 1024);
    $parser = new XmlStringStreamer\Parser\StringWalker();
    $streamer = new XmlStringStreamer($parser, $stream);

    $expectedNodes = array("foo", "bar");
    $foundNodes = array();
    while ($node = $streamer->getNode()) {
        $xmlNode = simplexml_load_string($node);
        $foundNodes[] = (string) $xmlNode->node;
    }

    $this->assertEquals($expectedNodes, $foundNodes, "The found nodes should equal the expected nodes");
}
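The short.xml fixture itself is not reproduced here; a stand-in consistent with the test's assertions (only the <node> text matters, the other element names are guesses) could be streamed the same way:

// Stand-in fixture, deliberately smaller than the 1024-byte buffer.
$xml = "<root><child><node>foo</node></child><child><node>bar</node></child></root>";
$tmp = tempnam(sys_get_temp_dir(), 'short');
file_put_contents($tmp, $xml);

$streamer = new \Prewk\XmlStringStreamer(
    new \Prewk\XmlStringStreamer\Parser\StringWalker(),
    new \Prewk\XmlStringStreamer\Stream\File($tmp, 1024)
);
while ($node = $streamer->getNode()) {
    echo (string) simplexml_load_string($node)->node, PHP_EOL; // foo, bar
}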
protected function execute(InputInterface $input, OutputInterface $output) { $this->users = []; $this->filesystem = new Filesystem(); $this->titleFilter = new TitleFilter(); $passNbr = (int) $input->getArgument('pass'); $retries = explode(',', $input->getOption('retry')); $resumeAt = (int) $input->getOption('resume-at'); $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through $revMaxHops = (int) $input->getOption('max-revs'); // Maximum number of revisions per page we go through $listMissed = $input->getOption('missed'); $counter = 0; // Increment the number of pages we are going through $redirects = []; $pages = []; $urlParts = []; if (count($retries) >= 1 && $retries[0] !== '' && $passNbr !== 3) { throw new DomainException('Retry option is only supported at 3rd pass'); } if ($listMissed === true && $passNbr === 3) { $missed_file = DATA_DIR . '/missed.yml'; if (realpath($missed_file) === false) { throw new Exception(sprintf('Could not find missed file at %s', $missed_file)); } $missedFileContents = file_get_contents($missed_file); $parser = new Yaml\Parser(); try { $missed = $parser->parse($missedFileContents); } catch (Exception $e) { throw new Exception(sprintf('Could not get file %s contents to be parsed as YAML. Is it in YAML format?', $missed_file), null, $e); } if (!isset($missed['missed'])) { throw new Exception('Please ensure missed.yml has a list of titles under a "missed:" top level key'); } $this->missed = $missed['missed']; } elseif ($listMissed === true && $passNbr !== 3) { throw new DomainException('Missed option is only supported at 3rd pass'); } $repoInitialized = realpath(GIT_OUTPUT_DIR . '/.git') === false ? false : true; $this->git = new GitRepository(realpath(GIT_OUTPUT_DIR)); if ($repoInitialized === false) { $this->git->init()->execute(); } if ($passNbr === 3) { /* * Your MediaWiki API URL */ $apiUrl = MEDIAWIKI_API_ORIGIN . '/w/api.php?format=json&action=parse&prop=text|links|templates|'; $apiUrl .= 'images|externallinks|categories|sections|headitems|displaytitle|iwlinks|properties&pst=1'; $apiUrl .= '&disabletoc=true&disablepp=true&disableeditsection=true&preview=true&page='; // We are at conversion pass, instantiate our Converter! $this->converter = new MediaWikiToHtml(); $this->converter->setApiUrl($apiUrl); sort($retries); if (count($retries) === 1 && $retries[0] === '') { unset($retries); } } else { unset($retries); } /* -------------------- Author -------------------- * * Author array of MediaWikiContributor objects with $this->users[$uid], * where $uid is MediaWiki user_id. * * You may have to increase memory_limit value, * but we’ll load this only once. **/ $users_file = DATA_DIR . '/users.json'; $users_loop = json_decode(file_get_contents($users_file), 1); foreach ($users_loop as &$u) { $uid = (int) $u['user_id']; $this->users[$uid] = new MediaWikiContributor($u); unset($u); // Dont fill too much memory, if that helps. } /* -------------------- /Author -------------------- **/ /* -------------------- XML source -------------------- **/ $file = DATA_DIR . '/dumps/main_full.xml'; $streamer = XmlStringStreamer::createStringWalkerParser($file); /* -------------------- /XML source -------------------- **/ while ($node = $streamer->getNode()) { /** * 3rd pass, handle retries. * * This is useful if you went through all pages but some pages didn’t work. * We can ask to re-run only specific ones by using --retry= and a coma separated * list of index numbers (i.e. the $counter value we use for each page node). 
* * This set of case handles three situations only at 3rd pass AND when command has * --retry=n,n,n specified. * * 1. If current iteration ($counter) *matches* one of the $retries entries * * We want to let the process be executed through and added as a revision * * 2. Current iteration ($counter) *isn’t listed* in $retries; go to next. * * 3. We have no entries in $retries anymore, exit. * * ... THIS IS BOGUS, USE --missed INSTEAD! */ if (isset($retries) && in_array($indexCorrector, $retries)) { $retryNodeIndex = array_search($indexCorrector, $retries); unset($retries[$retryNodeIndex]); $output->writeln(PHP_EOL . sprintf('Will work on %d', $indexCorrector) . PHP_EOL); } elseif (isset($retries) && count($retries) >= 1) { ++$counter; continue; } elseif (isset($retries) && count($retries) === 0) { $output->writeln('No more retries to work with' . PHP_EOL); break; } /* * 3rd pass, handle interruption by telling where to resume work. * * This is useful if job stopped and you want to resume work back at a specific point. */ if ($counter < $resumeAt) { ++$counter; continue; } /* * Limit the number of pages we’ll work on. * * Useful if you want to test conversion script without going through all the content. */ if ($maxHops > 0 && $maxHops === $counter) { $output->writeln(sprintf('Reached desired maximum of %d documents', $maxHops) . PHP_EOL); break; } $pageNode = new SimpleXMLElement($node); if (isset($pageNode->title)) { $wikiDocument = new MediaWikiDocument($pageNode); $persistable = new GitCommitFileRevision($wikiDocument, 'out/content/', '.md'); $title = $wikiDocument->getTitle(); $normalized_location = $wikiDocument->getName(); $file_path = $this->titleFilter->filter($persistable->getName()); $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect()); // False if not a redirect, string if it is $is_translation = $wikiDocument->isTranslation(); $language_code = $wikiDocument->getLanguageCode(); $language_name = $wikiDocument->getLanguageName(); if ($listMissed === true && !in_array($normalized_location, $this->missed)) { ++$counter; continue; } if ($passNbr === 3 && $wikiDocument->hasRedirect() === false) { $random = rand(5, 10); $output->writeln(PHP_EOL . sprintf('--- sleep for %d to not break production ---', $random)); sleep($random); } $revs = $wikiDocument->getRevisions()->count(); $output->writeln(sprintf('"%s":', $title)); $output->writeln(sprintf(' - index: %d', $counter)); $output->writeln(sprintf(' - normalized: %s', $normalized_location)); $output->writeln(sprintf(' - file: %s', $file_path)); if ($wikiDocument->hasRedirect() === true) { $output->writeln(sprintf(' - redirect_to: %s', $redirect_to)); } if ($is_translation === true) { $output->writeln(sprintf(' - lang: %s (%s)', $language_code, $language_name)); } /* * Merge deleted content history under current content. * * 1st pass: Only those with redirects (i.e. deleted pages). Should leave an empty out/ directory! * 2nd pass: Only those without redirects (i.e. current content). * 3nd pass: Only for those without redirects, they are going to get the latest version passed through the convertor */ if ($wikiDocument->hasRedirect() === false && $passNbr === 1) { // Skip all NON redirects for pass 1 $output->writeln(sprintf(' - skip: Document %s WITHOUT redirect, at pass 1 (handling redirects)', $title) . PHP_EOL . 
PHP_EOL); ++$counter; continue; } elseif ($wikiDocument->hasRedirect() && $passNbr === 2) { // Skip all redirects for pass 2 $output->writeln(sprintf(' - skip: Document %s WITH redirect, at pass 2 (handling non redirects)', $title) . PHP_EOL . PHP_EOL); ++$counter; continue; } elseif ($wikiDocument->hasRedirect() && $passNbr === 3) { // Skip all redirects for pass 2 $output->writeln(sprintf(' - skip: Document %s WITH redirect, at pass 3', $title) . PHP_EOL . PHP_EOL); ++$counter; continue; } if ($passNbr < 1 || $passNbr > 3) { throw new DomainException('This command has only three pases.'); } foreach (explode('/', $normalized_location) as $urlDepth => $urlPart) { $urlParts[strtolower($urlPart)] = $urlPart; } $revList = $wikiDocument->getRevisions(); $revLast = $wikiDocument->getLatest(); $revCounter = 0; if ($passNbr === 3) { // Overwriting $revList for last pass we’ll // use for conversion. $revList = new SplDoublyLinkedList(); // Pass some data we already have so we can // get it in the converted document. if ($is_translation === true) { $revLast->setFrontMatter(array('lang' => $language_code)); } $revList->push($revLast); } else { $output->writeln(sprintf(' - revs: %d', $revs)); $output->writeln(sprintf(' - revisions:')); } /* ----------- REVISIONS --------------- **/ for ($revList->rewind(); $revList->valid(); $revList->next()) { if ($revMaxHops > 0 && $revMaxHops === $revCounter) { $output->writeln(sprintf(' - stop: Reached maximum %d revisions', $revMaxHops) . PHP_EOL . PHP_EOL); break; } $wikiRevision = $revList->current(); /* -------------------- Author -------------------- **/ // An edge case where MediaWiki may give author as user_id 0, even though we dont have it // so we’ll give the first user instead. $contributor_id = $wikiRevision->getContributorId() === 0 ? 1 : $wikiRevision->getContributorId(); /* * Fix duplicates and merge them as only one. * * Please adjust to suit your own. * * Queried using jq; * * cat data/users.json | jq '.[]|select(.user_real_name == "Renoir Boulanger")' */ //if (in_array($contributor_id, [172943, 173060])) { // $contributor_id = 10080; //} if (isset($this->users[$contributor_id])) { $contributor = clone $this->users[$contributor_id]; // We want a copy, because its specific to here only anyway. $wikiRevision->setContributor($contributor, false); } else { // In case we didn’t find data for $this->users[$contributor_id] $contributor = clone $this->users[1]; // We want a copy, because its specific to here only anyway. $wikiRevision->setContributor($contributor, false); } /* -------------------- /Author -------------------- **/ // Lets handle conversion only at 3rd pass. 
if ($passNbr === 3) { try { $revision = $this->converter->apply($wikiRevision); } catch (Exception $e) { $output->writeln(sprintf(' - ERROR: %s, left a note in errors/%d.txt', $e->getMessage(), $counter)); $this->filesystem->dumpFile(sprintf('errors/%d.txt', $counter), $e->getMessage()); ++$counter; continue; } // user_id 10080 is Renoirb (yours truly) $revision->setAuthor($this->users[10080]); $revision_id = $revLast->getId(); } else { $revision = $wikiRevision; $revision_id = $wikiRevision->getId(); $output->writeln(sprintf(' - id: %d', $revision_id)); $output->writeln(sprintf(' index: %d', $revCounter)); } $persistArgs = $persistable->setRevision($revision)->getArgs(); if ($passNbr < 3) { foreach ($persistArgs as $argKey => $argVal) { if ($argKey === 'message') { $argVal = mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100); } $output->writeln(sprintf(' %s: %s', $argKey, $argVal)); } } $removeFile = false; if ($passNbr < 3 && $revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) { $output->writeln(' is_last_and_has_redirect: True'); $removeFile = true; } $persistable->setRevision($revision); $this->filesystem->dumpFile($file_path, (string) $persistable); try { $this->git->add()->execute(preg_replace('/^out\\//', '', $file_path)); } catch (GitException $e) { $message = sprintf('Could not add file "%s" with title "%s" for revision %d', $file_path, $title, $revision_id); throw new Exception($message, null, $e); } if ($passNbr < 3) { // We won’t expose all WebPlatform user emails to the public. Instead, // we’ll create a bogus email alias based on their MediaWiki username. $real_name = $wikiRevision->getContributor()->getRealName(); $username = $wikiRevision->getContributor()->getName(); $email = sprintf('%s@%s', $username, COMMITER_ANONYMOUS_DOMAIN); $author_overload = sprintf('%s <%s>', $real_name, $email); try { $this->git->commit()->message($persistArgs['message'])->author('"' . $author_overload . '"')->date('"' . $persistArgs['date'] . '"')->allowEmpty()->execute(); } catch (GitException $e) { var_dump($this->git); $message = sprintf('Could not commit for revision %d', $revision_id); throw new Exception($message, null, $e); } if ($removeFile === true) { try { $this->git->rm()->execute(preg_replace('/^out\\//', '', $file_path)); } catch (GitException $e) { $message = sprintf('Could remove %s at revision %d', $file_path, $revision_id); throw new Exception($message, null, $e); } $this->git->commit()->message('Remove file; ' . $persistArgs['message'])->author('"' . $author_overload . '"')->date('"' . $persistArgs['date'] . '"')->allowEmpty()->execute(); $this->filesystem->remove($file_path); } } /* End of $passNubr === 3 */ ++$revCounter; } /* ----------- REVISIONS --------------- **/ $output->writeln(PHP_EOL); } ++$counter; } if ($passNbr === 3) { $output->writeln('3rd pass. One. Commit.' . PHP_EOL . PHP_EOL); try { $this->git->commit()->message($revision->getComment())->execute(); } catch (GitException $e) { var_dump($this->git); $message = sprintf('Could not commit for revision %d', $revision_id); throw new Exception($message, null, $e); } } }
public function test_UniqueNode_parser_with_file_with_data_in_last_chunk()
{
    $file = __DIR__ . "/../../xml/short_last_chunk.xml";

    $stream = new XmlStringStreamer\Stream\File($file, 200);
    $parser = new UniqueNode(array("uniqueNode" => 'capture'));
    $streamer = new XmlStringStreamer($parser, $stream);

    $foundNodes = 0;
    while ($node = $streamer->getNode()) {
        $foundNodes++;
    }

    $this->assertEquals(2, $foundNodes, "The found nodes should equal the expected nodes number.");
}
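short_last_chunk.xml is not reproduced here either. A stand-in consistent with the test would hold two <capture> elements sized so that, with the 200-byte buffer above, the second one only completes in the final chunk read; the padding length and root element are assumptions.

// Two <capture> nodes; the padding pushes the second one into the last chunk read.
$xml = "<root>"
     . "<capture>" . str_repeat("a", 150) . "</capture>"
     . "<capture>" . str_repeat("b", 150) . "</capture>"
     . "</root>";
$tmp = tempnam(sys_get_temp_dir(), 'lastchunk');
file_put_contents($tmp, $xml);

$streamer = new \Prewk\XmlStringStreamer(
    new \Prewk\XmlStringStreamer\Parser\UniqueNode(array("uniqueNode" => "capture")),
    new \Prewk\XmlStringStreamer\Stream\File($tmp, 200)
);
$found = 0;
while ($streamer->getNode()) {
    $found++;
}
echo $found, PHP_EOL; // 2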
// Set up streaming and progress monitoring with a 16 KB buffer
$progress = 0;
$last_progress = 0;
$stream = new File($local_file, 16384, function ($chunk, $readBytes) use ($progress, &$last_progress, $totalSize, $logger) {
    $progress = $readBytes / $totalSize;
    // Report every 10%
    if ($progress >= $last_progress + 0.1) {
        $logger->log("Progress: {$progress}");
        $last_progress = $last_progress + 0.1;
    }
});
$start_timestamp = date('Y-m-d H:i:s');

// Configure the parser
$parser = new StringWalker();
// Configure the streamer
$streamer = new XmlStringStreamer($parser, $stream);

// Create the output file
$file = fopen($json_file, "w") or die(json_encode("Could not open {$json_file} for writing"));
$logger->log("Converting {$local_file} to {$json_file}...");

// Process the nodes
while ($node = $streamer->getNode()) {
    // Build a JSON string ready for MongoDB insertion
    $json_string = Utils::getBookJSONFromXMLNode2($node);
    // Append the string to the output file
    fputs($file, $json_string . PHP_EOL);
    $count++;
}

// Close the output file
fclose($file);
// Clear the stat cache for this run
clearstatcache();
/**
 * __construct
 *
 * Builds the Chunk object
 *
 * @param string $file The filename to work with
 * @param array $options The options with which to parse the file
 * @author Dom Hastings
 * @access public
 */
public function __construct($file, $options = array(), $parser_type = false)
{
    // merge the options together
    $this->options = array_merge($this->options, is_array($options) ? $options : array());
    $this->options['chunkSize'] *= PMXI_Plugin::getInstance()->getOption('chunk_size');
    // set the filename
    $this->file = $file;
    $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
    $is_html = false;
    $f = @fopen($file, "rb");
    while (!@feof($f)) {
        $chunk = @fread($f, 1024);
        if (strpos($chunk, "<!DOCTYPE") === 0) {
            $is_html = true;
        }
        break;
    }
    @fclose($f);
    if ($is_html) {
        $path = $this->get_file_path();
        $this->is_404 = true;
        $this->reader = new XMLReader();
        @$this->reader->open($path);
        @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
        return;
    }
    if (PMXI_Plugin::getInstance()->getOption('force_stream_reader')) {
        $this->parser_type = 'xmlstreamer';
    } else {
        $input = new PMXI_Input();
        $import_id = $input->get('id', 0);
        if (empty($import_id)) {
            $import_id = $input->get('import_id', 0);
        }
        if (!empty($import_id)) {
            $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
            $import = new PMXI_Import_Record();
            $import->getById($import_id);
            if (!$import->isEmpty()) {
                $this->parser_type = empty($import->options['xml_reader_engine']) ? 'xmlreader' : 'xmlstreamer';
            }
        } else {
            $this->parser_type = empty($parser_type) ? get_option('wpai_parser_type', 'xmlreader') : $parser_type;
        }
    }
    if (empty($this->options['element']) or $this->options['get_cloud']) {
        $path = $this->get_file_path();
        if ($this->parser_type == 'xmlreader') {
            $reader = new XMLReader();
            $reader->open($path);
            $reader->setParserProperty(XMLReader::VALIDATE, false);
            while (@$reader->read()) {
                switch ($reader->nodeType) {
                    case XMLREADER::ELEMENT:
                        $localName = str_replace("_colon_", ":", $reader->localName);
                        if (array_key_exists(str_replace(":", "_", $localName), $this->cloud)) {
                            $this->cloud[str_replace(":", "_", $localName)]++;
                        } else {
                            $this->cloud[str_replace(":", "_", $localName)] = 1;
                        }
                        break;
                    default:
                        break;
                }
            }
            unset($reader);
        } else {
            $CHUNK_SIZE = 1024;
            $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
            $parseroptions = array("extractContainer" => false);
            // Works like an XmlReader, and walks the XML tree node by node. Captures by node depth setting.
            $parser = new Parser\StringWalker($parseroptions);
            // Create the streamer
            $streamer = new XmlStringStreamer($parser, $streamProvider);
            while ($node = $streamer->getNode()) {
                //$simpleXmlNode = simplexml_load_string($node);
                //echo (string)$simpleXmlNode->firstName;
            }
            $this->cloud = $parser->cloud;
        }
        if (!empty($this->cloud) and empty($this->options['element'])) {
            arsort($this->cloud);
            $main_elements = array('node', 'product', 'job', 'deal', 'entry', 'item', 'property', 'listing', 'hotel', 'record', 'article', 'post', 'book', 'item_0');
            foreach ($this->cloud as $element_name => $value) {
                if (in_array(strtolower($element_name), $main_elements)) {
                    $this->options['element'] = $element_name;
                    break;
                }
            }
            if (empty($this->options['element'])) {
                foreach ($this->cloud as $el => $count) {
                    $this->options['element'] = $el;
                    break;
                }
            }
        }
    }
    $path = $this->get_file_path();
    if ($this->parser_type == 'xmlreader') {
        $this->reader = new XMLReader();
        @$this->reader->open($path);
        @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
    } else {
        $parseroptions = array("uniqueNode" => $this->options['element']);
        $CHUNK_SIZE = 1024;
        $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
        $parser = new Parser\UniqueNode($parseroptions);
        $this->reader = new XmlStringStreamer($parser, $streamProvider);
    }
}
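A sketch of how this constructor might be invoked while forcing the streaming engine; the class name PMXI_Chunk, the file path and the option values are inferred from the PMXI_ prefixes and the $this->options lookups in the body above, so treat them all as assumptions.

// Class name, path and option values are assumptions for illustration only.
$chunk = new PMXI_Chunk('/path/to/feed.xml', array('element' => 'product', 'get_cloud' => false), 'xmlstreamer');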