public function test_getNode()
 {
     $node = "<node><a>lorem</a><b>ipsum</b></node>";
     $parser = Mockery::mock("\\Prewk\\XmlStringStreamer\\ParserInterface");
     $parser->shouldReceive("getNodeFrom")->with(Mockery::type("\\Prewk\\XmlStringStreamer\\StreamInterface"))->once()->andReturn($node);
     $stream = Mockery::mock("\\Prewk\\XmlStringStreamer\\StreamInterface");
     $streamer = new XmlStringStreamer($parser, $stream);
     $this->assertEquals($node, $streamer->getNode(), "Node received from the parser should be what was expected");
 }
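For context on what the mock above simulates: with a real parser and stream, the same loop is tiny. A minimal sketch, assuming a local books.xml file with repeating child elements:

 $streamer = \Prewk\XmlStringStreamer::createStringWalkerParser("books.xml");
 while ($node = $streamer->getNode()) {
     // Each $node is the complete XML string of one top-level child element
     $book = simplexml_load_string($node);
 }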
Example #2
 /**
  * @param string $url
  * @param XMLHandler $handler
  * @return int
  */
 public function parse($url, XMLHandler $handler)
 {
     $stream = new Stream\Guzzle($url, self::CHUNK_SIZE);
     $parser = new Parser\StringWalker();
     $streamer = new XmlStringStreamer($parser, $stream);
     $countOfProducts = 0;
     while ($node = $streamer->getNode()) {
         $simpleXmlNode = simplexml_load_string($node);
         $handler->perform($simpleXmlNode);
         $countOfProducts++;
     }
     return $countOfProducts;
 }
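The XMLHandler collaborator isn't shown in this example. A minimal sketch of what it could look like, assuming each product node carries a <name> child (both the interface shape and the property name are assumptions inferred from how perform() is called above):

 class ProductNameHandler implements XMLHandler
 {
     public function perform($simpleXmlNode)
     {
         // Hypothetical: print the <name> child of each streamed product node
         echo (string) $simpleXmlNode->name, PHP_EOL;
     }
 }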
 public function run()
 {
     Countrycode::truncate();
     $CHUNK_SIZE = 1024;
     $streamProvider = new Stream\File(dirname(__FILE__) . "/countrycodes.xml", $CHUNK_SIZE);
     $config = array("uniqueNode" => "row");
     $parser = new Parser\UniqueNode($config);
     $streamer = new XmlStringStreamer($parser, $streamProvider);
     while ($node = $streamer->getNode()) {
         $simpleXmlNode = simplexml_load_string($node);
         Countrycode::create(['countrycode' => $simpleXmlNode->field[0], 'country' => $simpleXmlNode->field[1]]);
     }
     $this->command->info('Countrycode table seeded!');
 }
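Since the seeder only cares about one repeating element, the UniqueNode parser is a good fit. The same setup can also be written with the library's convenience factory; a minimal sketch:

 $streamer = XmlStringStreamer::createUniqueNodeParser(
     dirname(__FILE__) . "/countrycodes.xml",
     array("uniqueNode" => "row")
 );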
 /**
  * {@inheritdoc}
  */
 public function isSpamReferrer(Url $url)
 {
     $url = $url->toArray();
     if (!isset($url['registerableDomain'], $url['host'], $url['publicSuffix'])) {
         return false;
     }
     $provider = new File($this->file, 1024);
     $parser = new XmlStringStreamer\Parser\StringWalker();
     $streamer = new XmlStringStreamer($parser, $provider);
     while ($node = $streamer->getNode()) {
         $domain = (string) simplexml_load_string($node);
         if (in_array($domain, [$url['registerableDomain'], $url['host'], $url['publicSuffix']])) {
             return true;
         }
     }
     return false;
 }
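A usage sketch for the method above; the Url value object's factory and the wrapping class name are assumptions based on how isSpamReferrer() consumes them:

 // Hypothetical wiring; SpamDetector and Url::createFromUrl are illustrative names
 $detector = new SpamDetector('/path/to/spammers.xml');
 if ($detector->isSpamReferrer(Url::createFromUrl('http://spam.example.com/'))) {
     // Reject or log the referrer
 }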
Example #5
 /**
  * @param string|XmlStringStreamer $xml
  * @return object[]
  */
 public function unserialize($xml)
 {
     $hydrator = $this->buildHydrator('xml', 'hydrate');
     $class = $this->getOptions()->getClass();
     $classes = array();
     if ($xml instanceof XmlStringStreamer) {
         while ($node = $xml->getNode()) {
             $node = simplexml_load_string($node);
             $classes[] = $hydrator->hydrate((array) $node, new $class());
         }
     } else {
         $docElement = simplexml_load_string($xml)->children();
         $name = $docElement->getName();
         foreach ($docElement->{$name} as $node) {
             $classes[] = $hydrator->hydrate((array) $node, new $class());
         }
     }
     return $classes;
 }
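Both branches return the same hydrated object list; the streamer branch just avoids loading the whole document into memory. A usage sketch for it (file name and serializer wiring are assumptions):

 // Hypothetical: stream a large export rather than passing it as one string
 $streamer = XmlStringStreamer::createStringWalkerParser('big-export.xml');
 $objects = $serializer->unserialize($streamer);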
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $this->users = [];
     $this->filesystem = new Filesystem();
     $this->titleFilter = new TitleFilter();
     $displayIndex = $input->getOption('indexes');
     $displayAuthor = $input->getOption('display-author');
     $maxHops = (int) $input->getOption('max-pages');
     // Maximum number of pages we go through
     $revMaxHops = (int) $input->getOption('max-revs');
     // Maximum number of revisions per page we go through
     $listMissed = $input->getOption('missed');
     $counter = 0;
      // Incremented for each page we go through
     $redirects = [];
     $pages = [];
      $urlParts = [];
      $urlPartsAll = [];
     $missedIndexes = [];
     $urlsWithContent = [];
     $moreThanHundredRevs = [];
     $translations = [];
     $sanity_redirs = [];
     $directlyOnRoot = [];
     $rev_count = [];
      // So we can compute the average
      // Pages we have to make sure aren’t duplicated on the CMS prior
      // to the final migration.
     $temporary_acceptable_duplicates = [];
     //$temporary_acceptable_duplicates[] = 'css/selectors/pseudo-classes/:lang'; // DONE
     if ($listMissed === true) {
          $output->writeln('We are going to try to give you XML indexes to use for --retry=...; we will therefore limit the revision loops to one.');
         $missed_file = DATA_DIR . '/missed.yml';
         if (realpath($missed_file) === false) {
             throw new Exception(sprintf('Could not find missed file at %s', $missed_file));
         }
         $missedFileContents = file_get_contents($missed_file);
         $parser = new Yaml\Parser();
         try {
             $missed = $parser->parse($missedFileContents);
         } catch (Exception $e) {
             throw new Exception(sprintf('Could not get file %s contents to be parsed as YAML. Is it in YAML format?', $missed_file), null, $e);
         }
         if (!isset($missed['missed'])) {
             throw new Exception('Please ensure missed.yml has a list of titles under a "missed:" top level key');
         }
         $revMaxHops = 1;
         $this->missed = $missed['missed'];
     }
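      // A minimal missed.yml, as implied by the checks above (the exact titles
      // are placeholders):
      //
      //     missed:
      //       - "css/properties/some-title"
      //       - "html/attributes/another-title"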
     /**
      * Last minute redirects. Order matters.
      */
     $redirects['after'] = 'css/selectors/pseudo-elements/after';
     $redirects['tutorials/What_is_CSS'] = 'tutorials/learning_what_css_is';
     $redirects['html/attributes/type type (a, link, embed)'] = 'html/attributes/type';
     /* -------------------- Author --------------------
      *
      * Author array of MediaWikiContributor objects with $this->users[$uid],
      * where $uid is MediaWiki user_id.
      *
      * You may have to increase memory_limit value,
      * but we’ll load this only once.
      **/
     $users_file = DATA_DIR . '/users.json';
     $users_loop = json_decode(file_get_contents($users_file), 1);
     foreach ($users_loop as &$u) {
         $uid = (int) $u['user_id'];
         $this->users[$uid] = new MediaWikiContributor($u);
         unset($u);
          // Don’t fill too much memory, if that helps.
     }
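      // Each users.json entry is assumed to look roughly like the following
      // (keys inferred from how MediaWikiContributor is used in this command):
      //     {"user_id": "42", "user_name": "Jdoe", "user_real_name": "John Doe"}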
     /* -------------------- /Author -------------------- **/
     /* -------------------- XML source -------------------- **/
     $file = DATA_DIR . '/dumps/main_full.xml';
     $streamer = XmlStringStreamer::createStringWalkerParser($file);
     /* -------------------- /XML source -------------------- **/
     while ($node = $streamer->getNode()) {
         if ($maxHops > 0 && $maxHops === $counter) {
             $output->writeln(sprintf('Reached desired maximum of %d loops', $maxHops) . PHP_EOL . PHP_EOL);
             break;
         }
         $pageNode = new SimpleXMLElement($node);
         if (isset($pageNode->title)) {
             $wikiDocument = new MediaWikiDocument($pageNode);
             $persistable = new GitCommitFileRevision($wikiDocument, 'out/content/', '.md');
             $title = $wikiDocument->getTitle();
             $normalized_location = $wikiDocument->getName();
             $file_path = $this->titleFilter->filter($persistable->getName());
             $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect());
             // False if not a redirect, string if it is
             $is_translation = $wikiDocument->isTranslation();
             $language_code = $wikiDocument->getLanguageCode();
             $language_name = $wikiDocument->getLanguageName();
             $revs = $wikiDocument->getRevisions()->count();
             $output->writeln(sprintf('"%s":', $title));
             if ($displayIndex === true) {
                 $output->writeln(sprintf('  - index: %d', $counter));
             }
             $output->writeln(sprintf('  - normalized: %s', $normalized_location));
             $output->writeln(sprintf('  - file: %s', $file_path));
             if ($wikiDocument->hasRedirect() === true) {
                 $output->writeln(sprintf('  - redirect_to: %s', $redirect_to));
             } else {
                 $urlsWithContent[] = $title;
                 foreach (explode('/', $normalized_location) as $urlDepth => $urlPart) {
                     $urlPartKey = strtolower($urlPart);
                     $urlParts[$urlPartKey] = $urlPart;
                     $urlPartsAll[$urlPartKey][] = $urlPart;
                 }
             }
             if ($is_translation === true) {
                 $output->writeln(sprintf('  - lang: %s (%s)', $language_code, $language_name));
             }
             if ($listMissed === true && in_array($normalized_location, $this->missed)) {
                 $missedIndexes[$counter] = $title;
             }
             $output->writeln(sprintf('  - revs: %d', $revs));
             $output->writeln(sprintf('  - revisions:'));
             $revList = $wikiDocument->getRevisions();
             $revLast = $wikiDocument->getLatest();
             $revCounter = 0;
             /* ----------- REVISION --------------- **/
             for ($revList->rewind(); $revList->valid(); $revList->next()) {
                 if ($revMaxHops > 0 && $revMaxHops === $revCounter) {
                     $output->writeln(sprintf('    - stop: Reached maximum %d revisions', $revMaxHops) . PHP_EOL . PHP_EOL);
                     break;
                 }
                 $wikiRevision = $revList->current();
                 $revision_id = $wikiRevision->getId();
                 /* -------------------- Author -------------------- **/
                  // An edge case where MediaWiki may give the author as user_id 0, even though we don’t have it,
                  // so we’ll use the first user instead.
                 $contributor_id = $wikiRevision->getContributorId() === 0 ? 1 : $wikiRevision->getContributorId();
                 if (isset($this->users[$contributor_id])) {
                     $contributor = clone $this->users[$contributor_id];
                      // We want a copy, because it’s specific to here only anyway.
                     $wikiRevision->setContributor($contributor, false);
                 } else {
                     // In case we didn’t find data for $this->users[$contributor_id]
                     $contributor = clone $this->users[1];
                      // We want a copy, because it’s specific to here only anyway.
                     $wikiRevision->setContributor($contributor, false);
                 }
                 /* -------------------- /Author -------------------- **/
                 $output->writeln(sprintf('    - id: %d', $revision_id));
                 if ($displayIndex === true) {
                     $output->writeln(sprintf('      index: %d', $revCounter));
                 }
                 $persistArgs = $persistable->setRevision($wikiRevision)->getArgs();
                 foreach ($persistArgs as $argKey => $argVal) {
                     if ($argKey === 'message') {
                         $argVal = trim(mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100));
                     }
                     if ($argKey === 'message' && empty($argVal)) {
                          // Let’s not pollute the report with empty messages
                         continue;
                     }
                     if ($displayAuthor === false && $argKey === 'author') {
                         continue;
                     }
                     $output->writeln(sprintf('      %s: %s', $argKey, $argVal));
                 }
                 if ($revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) {
                     $output->writeln('      is_last_and_has_redirect: True');
                 }
                 ++$revCounter;
             }
             /* ----------- REVISION --------------- */
             $rev_count[] = $revs;
             // Which pages are directly on /wiki/foo. Are there some we
             // should move elsewhere such as the glossary items?
             if (count(explode('/', $title)) == 1 && $wikiDocument->hasRedirect() === false) {
                 $directlyOnRoot[] = $title;
             }
             if ($revs > 99) {
                 $moreThanHundredRevs[] = sprintf('%s (%d)', $title, $revs);
             }
             if ($is_translation === true && $wikiDocument->hasRedirect() === false) {
                 $translations[] = $title;
             }
              // The ones with invalid URL characters that shouldn’t be part of
              // a page name because they may conflict with their natural use (:, (, ), !, ?)
             if ($title !== $normalized_location && $wikiDocument->hasRedirect() === false) {
                 $sanity_redirs[$title] = $normalized_location;
             }
              // We have a number of pages; some of them have been
              // deleted or erased, with a redirect left behind.
              //
              // Since we want to write all pages that currently
              // have content into a filesystem, we have to generate file
              // names that can be stored in a filesystem. We therefore have
              // to normalize the names.
              //
              // We don’t want to have two entries with the same name.
              //
              // If a redirect (i.e. an empty file) exists, let’s keep it
              // separate from the pages that still have content.
              //
              // Sanity check;
              // 1. Get list of redirects
              // 2. Get list of pages
              //
              // If we have a page duplicate, throw an exception!
             if ($wikiDocument->hasRedirect() === true) {
                  // Pages we know are redirects within MediaWiki; we won’t
                  // pass them into the $pages array because they would be
                  // empty content with only a redirect anyway.
                 if ($normalized_location !== $redirect_to) {
                     $redirects[str_replace('_', ' ', $normalized_location)] = $redirect_to;
                 }
             } elseif (!in_array($normalized_location, array_keys($pages))) {
                  // Pages we know have content, let’s count them!
                 if ($wikiDocument->hasRedirect() === false) {
                     $pages[$normalized_location] = $title;
                 }
             } elseif (in_array($title, $temporary_acceptable_duplicates)) {
                  // Let’s not throw; we’ve got that covered.
             } else {
                 // Hopefully we should never encounter this.
                 $previous = $pages[$normalized_location];
                  $duplicatePagesExceptionText = 'We have a duplicate entry for %s; it ' . 'would be stored in %s, which would override the content of %s';
                 throw new Exception(sprintf($duplicatePagesExceptionText, $title, $file_path, $previous));
             }
             $output->writeln(PHP_EOL . PHP_EOL);
             ++$counter;
         }
     }
     /*
      * Work some numbers on number of edits
      *
      * - Average
      * - Median
      */
     $total_edits = 0;
     sort($rev_count);
     $edit_average = array_sum($rev_count) / $counter;
     // Calculate median
     $value_in_middle = floor(($counter - 1) / 2);
     if ($counter % 2) {
         // odd number, middle is the median
         $edit_median = $rev_count[$value_in_middle];
     } else {
         // even number, calculate avg of 2 medians
         $low = $rev_count[$value_in_middle];
         $high = $rev_count[$value_in_middle + 1];
         $edit_median = ($low + $high) / 2;
     }
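      // Worked example: with $rev_count = [1, 2, 3, 10] and $counter = 4,
      // $value_in_middle = floor(3 / 2) = 1, so the median is
      // ($rev_count[1] + $rev_count[2]) / 2 = (2 + 3) / 2 = 2.5,
      // while the average is 16 / 4 = 4.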
     $numbers = array('Numbers:');
     $numbers[] = sprintf('  - "iterations": %d', $counter);
     $numbers[] = sprintf('  - "content pages": %d', count($pages));
     $numbers[] = sprintf('  - "redirects": %d', count($redirects));
     $numbers[] = sprintf('  - "translated": %d', count($translations));
     $numbers[] = sprintf('  - "not in a directory": %d', count($directlyOnRoot));
     $numbers[] = sprintf('  - "redirects for URL sanity": %d', count($sanity_redirs));
     $numbers[] = sprintf('  - "edits average": %d', $edit_average);
     $numbers[] = sprintf('  - "edits median": %d', $edit_median);
      $this->filesystem->dumpFile('reports/numbers.txt', implode(PHP_EOL, $numbers));
      $this->filesystem->dumpFile('reports/hundred_revs.txt', implode(PHP_EOL, $moreThanHundredRevs));
     natcasesort($translations);
     $this->filesystem->dumpFile('reports/translations.txt', implode(PHP_EOL, $translations));
     natcasesort($directlyOnRoot);
     $this->filesystem->dumpFile('reports/directly_on_root.txt', implode(PHP_EOL, $directlyOnRoot));
     natcasesort($urlsWithContent);
     $this->filesystem->dumpFile('reports/url_all.txt', implode(PHP_EOL, $urlsWithContent));
     natcasesort($urlParts);
     $this->filesystem->dumpFile('reports/url_parts.txt', implode(PHP_EOL, $urlParts));
     // Creating list for https://github.com/webplatform/mediawiki-conversion/issues/2
     ksort($urlPartsAll);
      $urlPartsAllOut = array('All words that exist in a URL, and the different ways they are written (needs harmonizing!):');
     foreach ($urlPartsAll as $urlPartsAllKey => $urlPartsAllRow) {
         $urlPartsAllEntryUnique = array_unique($urlPartsAllRow);
         if (count($urlPartsAllEntryUnique) > 1) {
             $urlPartsAllOut[] = sprintf(' - %s', implode(', ', $urlPartsAllEntryUnique));
         }
     }
     $this->filesystem->dumpFile('reports/url_parts_variants.txt', implode(PHP_EOL, $urlPartsAllOut));
     ksort($redirects, SORT_NATURAL | SORT_FLAG_CASE);
     ksort($sanity_redirs, SORT_NATURAL | SORT_FLAG_CASE);
     $nginx_redirects = [];
     $nginx_redirects[] = 'rewrite ^/wiki/((Special|Template|User).*) /disabled?r=$1 permanent;';
     $nginx_redirects[] = 'rewrite ^/w/(.*) /disabled?r=$1 permanent;';
     $nginx_redirects[] = 'rewrite ^/$ /Main_Page permanent;';
     $nginx_redirects[] = 'rewrite ^/wiki/?$ /Main_Page permanent;';
     //                             /wiki/tutorials/canvas/canvas_tutorial
     //$nginx_redirects[] = 'rewrite ^/wiki/canvas/tutorial(.*)$ /wiki/tutorials/canvas$1 permanent;';
     $nginx_redirects[] = 'rewrite ^/wiki/WPD\\:Community$ /community permanent;';
     $nginx_redirects[] = 'rewrite ^/wiki/WPD\\:Contributors_Guide$ /contribute permanent;';
     $nginx_esc[':'] = '\\:';
     $nginx_esc['('] = '\\(';
     $nginx_esc[')'] = '\\)';
     $nginx_esc[','] = '\\,';
     $nginx_esc[' '] = '(\\ |_)';
      // Ordering matters; otherwise the () would be escaped, and we want them kept here!
     $prepare_nginx_redirects = array_merge($sanity_redirs, $redirects);
     foreach ($prepare_nginx_redirects as $url => $redirect_to) {
          // NGINX case-insensitive redirect? It’s done through (?i)! Should be documented!!!
         $nginx_redirects[] = sprintf('rewrite (?i)^/wiki/%s$ /%s permanent;', str_replace(array_keys($nginx_esc), $nginx_esc, $url), $redirect_to);
     }
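      // For example, the sanity redirect declared earlier,
      //   'html/attributes/type type (a, link, embed)' => 'html/attributes/type',
      // should come out roughly as:
      //   rewrite (?i)^/wiki/html/attributes/type(\ |_)type(\ |_)\(a\,(\ |_)link\,(\ |_)embed\)$ /html/attributes/type permanent;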
     $nginx_redirects[] = 'rewrite ^/wiki/(.*) /$1 permanent;';
     // Has to be the last!
     $this->filesystem->dumpFile('reports/nginx_redirects.map', implode(PHP_EOL, $nginx_redirects));
     $sanity_redirects_out = array('URLs to return new Location (from => to):');
     foreach ($sanity_redirs as $title => $sanitized) {
         $sanity_redirects_out[] = sprintf(' - "%s": "%s"', $title, $sanitized);
     }
     $this->filesystem->dumpFile('reports/sanity_redirects.txt', implode(PHP_EOL, $sanity_redirects_out));
     $redirects_out = array('Redirects (from => to):');
     foreach ($redirects as $url => $redirect_to) {
         $redirects_out[] = sprintf(' - "%s": "%s"', $url, $redirect_to);
     }
     $this->filesystem->dumpFile('reports/redirects.txt', implode(PHP_EOL, $redirects_out));
     if ($listMissed === true) {
         $yaml = new Yaml\Dumper();
         $yaml->setIndentation(2);
         try {
             $missed_out = $yaml->dump($missedIndexes, 3, 0, false, false);
         } catch (Exception $e) {
             $missed_out = sprintf('Could not create YAML out of missedIndexes array; Error was %s', $e->getMessage());
         }
         $this->filesystem->dumpFile('reports/missed_retry_argument.txt', 'app/console mediawiki:run 3 --retry=' . implode(',', array_keys($missedIndexes)));
         $this->filesystem->dumpFile('reports/missed_entries.yml', 'Missed:' . PHP_EOL . $missed_out);
          $output->writeln('Created missed_retry_argument.txt and missed_entries.yml in reports/; you can try to recover!');
     }
 }
Example #7
 /**
  * TEST TEST TEST
  * This method will try to return entities instead of a response
  * @TODO Use XML instead
  * @TODO Maybe use this https://github.com/prewk/xml-string-streamer-guzzle
  * @TODO Or this http://dk2.php.net/manual/en/function.xml-parse.php
  * @TODO Maybe create my own parser: http://php.net/manual/en/example.xml-structure.php
  *
  * @param int $page
  * @param int $pageSize
   * @return \Generator|\SimpleXMLElement[]
  */
 public function getProductPageAsEntities($page, $pageSize)
 {
     $response = $this->getProductPage($page, $pageSize);
     $stream = new Stream\Guzzle('');
     $stream->setGuzzleStream($response->getBody());
     $parser = new Parser\StringWalker();
     $streamer = new XmlStringStreamer($parser, $stream);
     while ($node = $streamer->getNode()) {
         $xml = new \SimpleXMLElement($node, LIBXML_NOERROR);
         //$entity = new Entity\ProductData();
          yield $xml;
     }
 }
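Because the method body uses yield, calling it returns a Generator of SimpleXMLElement objects. A usage sketch (the $client variable is an assumption):

 foreach ($client->getProductPageAsEntities(1, 100) as $xml) {
     // $xml is a \SimpleXMLElement for one streamed product node
     echo $xml->asXML(), PHP_EOL;
 }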
Example #8
  /**
   * Convert a pending XML file into a JSON-lines file, one node per line.
   *
   * @param string $filename Name of the XML file inside DATA_PENDING_DIR
   * @param mixed $logger Logger used to report progress and errors
   * @return int Number of converted nodes (books)
   */
 public static function convertPendingXMLtoJSON($filename, $logger)
 {
      // Number of processed nodes (books)
      $count = 0;
      // Name of the local file
      $local_file = DATA_PENDING_DIR . $filename;
      // Name of the output file
      $json_file = DATA_OUTPUT_DIR . "{$filename}.json";
      // Get the size of the local XML file
      $totalSize = filesize($local_file);
      $start_timestamp = date('Y-m-d H:i:s');
      // Prepare the stream and progress monitoring with a 16 KB buffer
      $progress = 0;
      $last_progress = 0;
      $stream = new File($local_file, 16384, function ($chunk, $readBytes) use (&$last_progress, $totalSize, $logger) {
         $progress = $readBytes / $totalSize;
          // Report every 10%
         if ($progress >= $last_progress + 0.1) {
             $logger->log("Progress: {$progress}");
             $last_progress = $last_progress + 0.1;
         }
     });
      // Configure the parser
      $parser = new StringWalker();
      // Configure the streamer
      $streamer = new XmlStringStreamer($parser, $stream);
      // Create the output file
      $file = fopen($json_file, "w") or die(json_encode("Could not open {$json_file} for writing"));
      $logger->log("Converting {$local_file} to {$json_file}...");
      // Process the nodes
     while ($node = $streamer->getNode()) {
          // Build a JSON string ready for Mongo insertion
          $json_string = Utils::getBookJSONFromXMLNode($node);
          // Append the string to the output file
         fputs($file, $json_string . PHP_EOL);
         $count++;
     }
     if ($count == 0) {
         $logger->error("0 Records converted");
     } else {
         $logger->log("{$count} Records converted");
     }
      // Close the output file
      fclose($file);
      // Clear the file stat cache
      clearstatcache();
     return $count;
 }
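A usage sketch for the converter above; the owning class name is an assumption:

 // Hypothetical invocation
 $count = PendingConverter::convertPendingXMLtoJSON('books.xml', $logger);
 $logger->log("{$count} records written");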
 public function test_StringWalker_parser_with_file_shorter_than_buffer()
 {
     $file = __DIR__ . "/../../xml/short.xml";
     $stream = new XmlStringStreamer\Stream\File($file, 1024);
     $parser = new XmlStringStreamer\Parser\StringWalker();
     $streamer = new XmlStringStreamer($parser, $stream);
     $expectedNodes = array("foo", "bar");
     $foundNodes = array();
     while ($node = $streamer->getNode()) {
         $xmlNode = simplexml_load_string($node);
         $foundNodes[] = (string) $xmlNode->node;
     }
     $this->assertEquals($expectedNodes, $foundNodes, "The found nodes should equal the expected nodes");
 }
 protected function execute(InputInterface $input, OutputInterface $output)
 {
     $this->users = [];
     $this->filesystem = new Filesystem();
     $this->titleFilter = new TitleFilter();
     $passNbr = (int) $input->getArgument('pass');
     $retries = explode(',', $input->getOption('retry'));
     $resumeAt = (int) $input->getOption('resume-at');
     $maxHops = (int) $input->getOption('max-pages');
     // Maximum number of pages we go through
     $revMaxHops = (int) $input->getOption('max-revs');
     // Maximum number of revisions per page we go through
     $listMissed = $input->getOption('missed');
     $counter = 0;
      // Incremented for each page we go through
     $redirects = [];
     $pages = [];
     $urlParts = [];
     if (count($retries) >= 1 && $retries[0] !== '' && $passNbr !== 3) {
         throw new DomainException('Retry option is only supported at 3rd pass');
     }
     if ($listMissed === true && $passNbr === 3) {
         $missed_file = DATA_DIR . '/missed.yml';
         if (realpath($missed_file) === false) {
             throw new Exception(sprintf('Could not find missed file at %s', $missed_file));
         }
         $missedFileContents = file_get_contents($missed_file);
         $parser = new Yaml\Parser();
         try {
             $missed = $parser->parse($missedFileContents);
         } catch (Exception $e) {
             throw new Exception(sprintf('Could not get file %s contents to be parsed as YAML. Is it in YAML format?', $missed_file), null, $e);
         }
         if (!isset($missed['missed'])) {
             throw new Exception('Please ensure missed.yml has a list of titles under a "missed:" top level key');
         }
         $this->missed = $missed['missed'];
     } elseif ($listMissed === true && $passNbr !== 3) {
         throw new DomainException('Missed option is only supported at 3rd pass');
     }
      $repoInitialized = realpath(GIT_OUTPUT_DIR . '/.git') !== false;
     $this->git = new GitRepository(realpath(GIT_OUTPUT_DIR));
     if ($repoInitialized === false) {
         $this->git->init()->execute();
     }
     if ($passNbr === 3) {
         /*
          * Your MediaWiki API URL
          */
         $apiUrl = MEDIAWIKI_API_ORIGIN . '/w/api.php?format=json&action=parse&prop=text|links|templates|';
         $apiUrl .= 'images|externallinks|categories|sections|headitems|displaytitle|iwlinks|properties&pst=1';
         $apiUrl .= '&disabletoc=true&disablepp=true&disableeditsection=true&preview=true&page=';
         // We are at conversion pass, instantiate our Converter!
         $this->converter = new MediaWikiToHtml();
         $this->converter->setApiUrl($apiUrl);
         sort($retries);
         if (count($retries) === 1 && $retries[0] === '') {
             unset($retries);
         }
     } else {
         unset($retries);
     }
     /* -------------------- Author --------------------
      *
      * Author array of MediaWikiContributor objects with $this->users[$uid],
      * where $uid is MediaWiki user_id.
      *
      * You may have to increase memory_limit value,
      * but we’ll load this only once.
      **/
     $users_file = DATA_DIR . '/users.json';
     $users_loop = json_decode(file_get_contents($users_file), 1);
     foreach ($users_loop as &$u) {
         $uid = (int) $u['user_id'];
         $this->users[$uid] = new MediaWikiContributor($u);
         unset($u);
          // Don’t fill too much memory, if that helps.
     }
     /* -------------------- /Author -------------------- **/
     /* -------------------- XML source -------------------- **/
     $file = DATA_DIR . '/dumps/main_full.xml';
     $streamer = XmlStringStreamer::createStringWalkerParser($file);
     /* -------------------- /XML source -------------------- **/
     while ($node = $streamer->getNode()) {
         /**
          * 3rd pass, handle retries.
          *
           * This is useful if you went through all pages but some pages didn’t work.
           * We can ask to re-run only specific ones by using --retry= and a comma-separated
           * list of index numbers (i.e. the $counter value we use for each page node).
           *
           * This set of cases handles three situations, only at 3rd pass AND when the command has
           * --retry=n,n,n specified.
          *
          * 1. If current iteration ($counter) *matches* one of the $retries entries
          *
          *    We want to let the process be executed through and added as a revision
          *
          * 2. Current iteration ($counter) *isn’t listed* in $retries; go to next.
          *
          * 3. We have no entries in $retries anymore, exit.
          *
          * ... THIS IS BOGUS, USE --missed INSTEAD!
          */
          $indexCorrector = $counter; // assumption: $indexCorrector isn’t defined in this excerpt; it appears to mirror $counter
          if (isset($retries) && in_array($indexCorrector, $retries)) {
             $retryNodeIndex = array_search($indexCorrector, $retries);
             unset($retries[$retryNodeIndex]);
             $output->writeln(PHP_EOL . sprintf('Will work on %d', $indexCorrector) . PHP_EOL);
         } elseif (isset($retries) && count($retries) >= 1) {
             ++$counter;
             continue;
         } elseif (isset($retries) && count($retries) === 0) {
             $output->writeln('No more retries to work with' . PHP_EOL);
             break;
         }
         /*
          * 3rd pass, handle interruption by telling where to resume work.
          *
          * This is useful if job stopped and you want to resume work back at a specific point.
          */
         if ($counter < $resumeAt) {
             ++$counter;
             continue;
         }
         /*
          * Limit the number of pages we’ll work on.
          *
          * Useful if you want to test conversion script without going through all the content.
          */
         if ($maxHops > 0 && $maxHops === $counter) {
             $output->writeln(sprintf('Reached desired maximum of %d documents', $maxHops) . PHP_EOL);
             break;
         }
         $pageNode = new SimpleXMLElement($node);
         if (isset($pageNode->title)) {
             $wikiDocument = new MediaWikiDocument($pageNode);
             $persistable = new GitCommitFileRevision($wikiDocument, 'out/content/', '.md');
             $title = $wikiDocument->getTitle();
             $normalized_location = $wikiDocument->getName();
             $file_path = $this->titleFilter->filter($persistable->getName());
             $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect());
             // False if not a redirect, string if it is
             $is_translation = $wikiDocument->isTranslation();
             $language_code = $wikiDocument->getLanguageCode();
             $language_name = $wikiDocument->getLanguageName();
             if ($listMissed === true && !in_array($normalized_location, $this->missed)) {
                 ++$counter;
                 continue;
             }
             if ($passNbr === 3 && $wikiDocument->hasRedirect() === false) {
                 $random = rand(5, 10);
                 $output->writeln(PHP_EOL . sprintf('--- sleep for %d to not break production ---', $random));
                 sleep($random);
             }
             $revs = $wikiDocument->getRevisions()->count();
             $output->writeln(sprintf('"%s":', $title));
             $output->writeln(sprintf('  - index: %d', $counter));
             $output->writeln(sprintf('  - normalized: %s', $normalized_location));
             $output->writeln(sprintf('  - file: %s', $file_path));
             if ($wikiDocument->hasRedirect() === true) {
                 $output->writeln(sprintf('  - redirect_to: %s', $redirect_to));
             }
             if ($is_translation === true) {
                 $output->writeln(sprintf('  - lang: %s (%s)', $language_code, $language_name));
             }
             /*
              * Merge deleted content history under current content.
              *
               * 1st pass: Only those with redirects (i.e. deleted pages). Should leave an empty out/ directory!
               * 2nd pass: Only those without redirects (i.e. current content).
               * 3rd pass: Only those without redirects; their latest version is passed through the converter.
              */
             if ($wikiDocument->hasRedirect() === false && $passNbr === 1) {
                 // Skip all NON redirects for pass 1
                 $output->writeln(sprintf('  - skip: Document %s WITHOUT redirect, at pass 1 (handling redirects)', $title) . PHP_EOL . PHP_EOL);
                 ++$counter;
                 continue;
             } elseif ($wikiDocument->hasRedirect() && $passNbr === 2) {
                 // Skip all redirects for pass 2
                 $output->writeln(sprintf('  - skip: Document %s WITH redirect, at pass 2 (handling non redirects)', $title) . PHP_EOL . PHP_EOL);
                 ++$counter;
                 continue;
             } elseif ($wikiDocument->hasRedirect() && $passNbr === 3) {
                  // Skip all redirects for pass 3
                 $output->writeln(sprintf('  - skip: Document %s WITH redirect, at pass 3', $title) . PHP_EOL . PHP_EOL);
                 ++$counter;
                 continue;
             }
             if ($passNbr < 1 || $passNbr > 3) {
                  throw new DomainException('This command has only three passes.');
             }
             foreach (explode('/', $normalized_location) as $urlDepth => $urlPart) {
                 $urlParts[strtolower($urlPart)] = $urlPart;
             }
             $revList = $wikiDocument->getRevisions();
             $revLast = $wikiDocument->getLatest();
             $revCounter = 0;
             if ($passNbr === 3) {
                  // Overwrite $revList for the last pass; we’ll
                  // use it for conversion.
                 $revList = new SplDoublyLinkedList();
                 // Pass some data we already have so we can
                 // get it in the converted document.
                 if ($is_translation === true) {
                     $revLast->setFrontMatter(array('lang' => $language_code));
                 }
                 $revList->push($revLast);
             } else {
                 $output->writeln(sprintf('  - revs: %d', $revs));
                 $output->writeln(sprintf('  - revisions:'));
             }
             /* ----------- REVISIONS --------------- **/
             for ($revList->rewind(); $revList->valid(); $revList->next()) {
                 if ($revMaxHops > 0 && $revMaxHops === $revCounter) {
                     $output->writeln(sprintf('    - stop: Reached maximum %d revisions', $revMaxHops) . PHP_EOL . PHP_EOL);
                     break;
                 }
                 $wikiRevision = $revList->current();
                 /* -------------------- Author -------------------- **/
                  // An edge case where MediaWiki may give the author as user_id 0, even though we don’t have it,
                  // so we’ll use the first user instead.
                 $contributor_id = $wikiRevision->getContributorId() === 0 ? 1 : $wikiRevision->getContributorId();
                 /*
                  * Fix duplicates and merge them as only one.
                  *
                  * Please adjust to suit your own.
                  *
                  * Queried using jq;
                  *
                  *     cat data/users.json | jq '.[]|select(.user_real_name == "Renoir Boulanger")'
                  */
                 //if (in_array($contributor_id, [172943, 173060])) {
                 //    $contributor_id = 10080;
                 //}
                 if (isset($this->users[$contributor_id])) {
                     $contributor = clone $this->users[$contributor_id];
                      // We want a copy, because it’s specific to here only anyway.
                     $wikiRevision->setContributor($contributor, false);
                 } else {
                     // In case we didn’t find data for $this->users[$contributor_id]
                     $contributor = clone $this->users[1];
                      // We want a copy, because it’s specific to here only anyway.
                     $wikiRevision->setContributor($contributor, false);
                 }
                 /* -------------------- /Author -------------------- **/
                  // Let’s handle conversion only at the 3rd pass.
                 if ($passNbr === 3) {
                     try {
                         $revision = $this->converter->apply($wikiRevision);
                     } catch (Exception $e) {
                         $output->writeln(sprintf('    - ERROR: %s, left a note in errors/%d.txt', $e->getMessage(), $counter));
                         $this->filesystem->dumpFile(sprintf('errors/%d.txt', $counter), $e->getMessage());
                         ++$counter;
                         continue;
                     }
                     // user_id 10080 is Renoirb (yours truly)
                     $revision->setAuthor($this->users[10080]);
                     $revision_id = $revLast->getId();
                 } else {
                     $revision = $wikiRevision;
                     $revision_id = $wikiRevision->getId();
                     $output->writeln(sprintf('    - id: %d', $revision_id));
                     $output->writeln(sprintf('      index: %d', $revCounter));
                 }
                 $persistArgs = $persistable->setRevision($revision)->getArgs();
                 if ($passNbr < 3) {
                     foreach ($persistArgs as $argKey => $argVal) {
                         if ($argKey === 'message') {
                             $argVal = mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100);
                         }
                         $output->writeln(sprintf('      %s: %s', $argKey, $argVal));
                     }
                 }
                 $removeFile = false;
                 if ($passNbr < 3 && $revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) {
                     $output->writeln('      is_last_and_has_redirect: True');
                     $removeFile = true;
                 }
                 $persistable->setRevision($revision);
                 $this->filesystem->dumpFile($file_path, (string) $persistable);
                 try {
                     $this->git->add()->execute(preg_replace('/^out\\//', '', $file_path));
                 } catch (GitException $e) {
                     $message = sprintf('Could not add file "%s" with title "%s" for revision %d', $file_path, $title, $revision_id);
                     throw new Exception($message, null, $e);
                 }
                 if ($passNbr < 3) {
                     // We won’t expose all WebPlatform user emails to the public. Instead,
                     // we’ll create a bogus email alias based on their MediaWiki username.
                     $real_name = $wikiRevision->getContributor()->getRealName();
                     $username = $wikiRevision->getContributor()->getName();
                     $email = sprintf('%s@%s', $username, COMMITER_ANONYMOUS_DOMAIN);
                     $author_overload = sprintf('%s <%s>', $real_name, $email);
                     try {
                         $this->git->commit()->message($persistArgs['message'])->author('"' . $author_overload . '"')->date('"' . $persistArgs['date'] . '"')->allowEmpty()->execute();
                     } catch (GitException $e) {
                         var_dump($this->git);
                         $message = sprintf('Could not commit for revision %d', $revision_id);
                         throw new Exception($message, null, $e);
                     }
                     if ($removeFile === true) {
                         try {
                             $this->git->rm()->execute(preg_replace('/^out\\//', '', $file_path));
                         } catch (GitException $e) {
                              $message = sprintf('Could not remove %s at revision %d', $file_path, $revision_id);
                             throw new Exception($message, null, $e);
                         }
                         $this->git->commit()->message('Remove file; ' . $persistArgs['message'])->author('"' . $author_overload . '"')->date('"' . $persistArgs['date'] . '"')->allowEmpty()->execute();
                         $this->filesystem->remove($file_path);
                     }
                 }
                  /* End of $passNbr === 3 */
                 ++$revCounter;
             }
             /* ----------- REVISIONS --------------- **/
             $output->writeln(PHP_EOL);
         }
         ++$counter;
     }
     if ($passNbr === 3) {
         $output->writeln('3rd pass. One. Commit.' . PHP_EOL . PHP_EOL);
         try {
             $this->git->commit()->message($revision->getComment())->execute();
         } catch (GitException $e) {
             var_dump($this->git);
             $message = sprintf('Could not commit for revision %d', $revision_id);
             throw new Exception($message, null, $e);
         }
     }
 }
 public function test_UniqueNode_parser_with_file_with_data_in_last_chunk()
 {
     $file = __DIR__ . "/../../xml/short_last_chunk.xml";
     $stream = new XmlStringStreamer\Stream\File($file, 200);
      $parser = new UniqueNode(array("uniqueNode" => 'capture'));
     $streamer = new XmlStringStreamer($parser, $stream);
     $foundNodes = 0;
     while ($node = $streamer->getNode()) {
         $foundNodes++;
     }
      $this->assertEquals(2, $foundNodes, "The number of found nodes should match the expected count.");
 }
Example #12
 // Prepare the stream and progress monitoring with a 16 KB buffer
 $progress = 0;
 $last_progress = 0;
 $stream = new File($local_file, 16384, function ($chunk, $readBytes) use (&$last_progress, $totalSize, $logger) {
     $progress = $readBytes / $totalSize;
     // Report every 10%
     if ($progress >= $last_progress + 0.1) {
         $logger->log("Progress: {$progress}");
         $last_progress = $last_progress + 0.1;
     }
 });
 $start_timestamp = date('Y-m-d H:i:s');
 // Configure the parser
 $parser = new StringWalker();
 // Configure the streamer
 $streamer = new XmlStringStreamer($parser, $stream);
 // Create the output file
 $file = fopen($json_file, "w") or die(json_encode("Could not open {$json_file} for writing"));
 $logger->log("Converting {$local_file} to {$json_file}...");
 // Process the nodes
 while ($node = $streamer->getNode()) {
     // Build a JSON string ready for Mongo insertion
     $json_string = Utils::getBookJSONFromXMLNode2($node);
     // Append the string to the output file
     fputs($file, $json_string . PHP_EOL);
     $count++;
 }
 // Close the output file
 fclose($file);
 // Clear the file stat cache
 clearstatcache();
Example #13
 /**
  * __construct
  * 
  * Builds the Chunk object
  *
  * @param string $file The filename to work with
   * @param array $options The options with which to parse the file
   * @param string|bool $parser_type Parser engine to use ('xmlreader' or 'xmlstreamer'); false falls back to the defaults below
  * @author Dom Hastings
  * @access public
  */
 public function __construct($file, $options = array(), $parser_type = false)
 {
     // merge the options together
     $this->options = array_merge($this->options, is_array($options) ? $options : array());
     $this->options['chunkSize'] *= PMXI_Plugin::getInstance()->getOption('chunk_size');
     // set the filename
     $this->file = $file;
     $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
     $is_html = false;
      // Peek at the first chunk to detect an HTML document served instead of XML
      $f = @fopen($file, "rb");
      if ($f !== false) {
          $chunk = @fread($f, 1024);
          if (strpos($chunk, "<!DOCTYPE") === 0) {
              $is_html = true;
          }
          @fclose($f);
      }
     if ($is_html) {
         $path = $this->get_file_path();
         $this->is_404 = true;
         $this->reader = new XMLReader();
         @$this->reader->open($path);
         @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
         return;
     }
     if (PMXI_Plugin::getInstance()->getOption('force_stream_reader')) {
         $this->parser_type = 'xmlstreamer';
     } else {
         $input = new PMXI_Input();
         $import_id = $input->get('id', 0);
         if (empty($import_id)) {
             $import_id = $input->get('import_id', 0);
         }
         if (!empty($import_id)) {
             $this->parser_type = empty($parser_type) ? 'xmlreader' : $parser_type;
             $import = new PMXI_Import_Record();
             $import->getById($import_id);
             if (!$import->isEmpty()) {
                 $this->parser_type = empty($import->options['xml_reader_engine']) ? 'xmlreader' : 'xmlstreamer';
             }
         } else {
             $this->parser_type = empty($parser_type) ? get_option('wpai_parser_type', 'xmlreader') : $parser_type;
         }
     }
     if (empty($this->options['element']) or $this->options['get_cloud']) {
         $path = $this->get_file_path();
         if ($this->parser_type == 'xmlreader') {
             $reader = new XMLReader();
             $reader->open($path);
             $reader->setParserProperty(XMLReader::VALIDATE, false);
             while (@$reader->read()) {
                 switch ($reader->nodeType) {
                      case XMLReader::ELEMENT:
                         $localName = str_replace("_colon_", ":", $reader->localName);
                         if (array_key_exists(str_replace(":", "_", $localName), $this->cloud)) {
                             $this->cloud[str_replace(":", "_", $localName)]++;
                         } else {
                             $this->cloud[str_replace(":", "_", $localName)] = 1;
                         }
                         break;
                     default:
                         break;
                 }
             }
             unset($reader);
         } else {
             $CHUNK_SIZE = 1024;
             $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
             $parseroptions = array("extractContainer" => false);
             // Works like an XmlReader, and walks the XML tree node by node. Captures by node depth setting.
             $parser = new Parser\StringWalker($parseroptions);
             // Create the streamer
             $streamer = new XmlStringStreamer($parser, $streamProvider);
             while ($node = $streamer->getNode()) {
                 // $simpleXmlNode = simplexml_load_string($node);
                 // echo (string)$simpleXmlNode->firstName;
             }
             $this->cloud = $parser->cloud;
         }
         if (!empty($this->cloud) and empty($this->options['element'])) {
             arsort($this->cloud);
             $main_elements = array('node', 'product', 'job', 'deal', 'entry', 'item', 'property', 'listing', 'hotel', 'record', 'article', 'post', 'book', 'item_0');
             foreach ($this->cloud as $element_name => $value) {
                 if (in_array(strtolower($element_name), $main_elements)) {
                     $this->options['element'] = $element_name;
                     break;
                 }
             }
             if (empty($this->options['element'])) {
                 foreach ($this->cloud as $el => $count) {
                     $this->options['element'] = $el;
                     break;
                 }
             }
         }
     }
     $path = $this->get_file_path();
     if ($this->parser_type == 'xmlreader') {
         $this->reader = new XMLReader();
         @$this->reader->open($path);
         @$this->reader->setParserProperty(XMLReader::VALIDATE, false);
     } else {
         $parseroptions = array("uniqueNode" => $this->options['element']);
         $CHUNK_SIZE = 1024;
         $streamProvider = new Prewk\XmlStringStreamer\Stream\File($path, $CHUNK_SIZE);
         $parser = new Parser\UniqueNode($parseroptions);
         $this->reader = new XmlStringStreamer($parser, $streamProvider);
     }
 }
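A usage sketch for the constructor above; the class name PMXI_Chunk is inferred from the PMXI_* references, and the option keys besides 'element' are assumptions:

 // Hypothetical: target <product> nodes and force the streaming engine
 $chunk = new PMXI_Chunk('feed.xml', array('element' => 'product'), 'xmlstreamer');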