/**
 * GI that generates data in clusters, using a specified distribution for each
 * cluster.
 *
 * This GI requires the following template arguments:
 *  - 'n' or 0
 *      The number of tuples to generate. Note: this value is per task.
 *      The total number of tuples generated will be n_tasks * n
 *  - 'centers' or 1
 *      A list of configuration for the centers. There is one entry per output
 *      column; each entry is either a single functor or a non-empty list of
 *      functors.
 *
 * The following template arguments are optional:
 *  - 'outputs'
 *      If the outputs of the GI are not given implicitly, they can be
 *      specified in this template argument. The number of dimensions will
 *      be determined by the number of outputs.
 *
 *      All output types must be numeric real types. The default type for
 *      outputs is DOUBLE.
 *  - 'dist.lib' = 'std'
 *      Which library to use for generating distributions.
 *      Valid options are:
 *          - std
 *          - boost
 *  - 'seed' = null
 *      The seed to be used for the random number generator. This seed will
 *      be used to generate the seed for each task, and different runs with
 *      the same seed will produce the same data.
 *  - 'compute.sets' = (derived from 'n')
 *      The number of sets of tuples to compute at once. When omitted, it is
 *      chosen so that roughly min(n, 10000) tuples are cached at a time.
 *
 * Each center configuration is a functor with the form:
 *      dist_name(args)
 *
 * The following distributions are supported:
 *  { Uniform Distributions }
 *  - uniform(a = 0, b = 1)
 *
 *  { Normal Distributions }
 *  - normal(mean = 0.0, std_dev = 1.0) [ synonyms: gaussian ]
 *  - inverse_gaussian(mean = 1, shape = 1) [ synonyms: inverse_normal ]
 *
 *  { Bernoulli Distributions }
 *  - binomial(t = 1, p = 0.5)
 *  - negative_binomial(k = 1, p = 0.5)
 *
 *  { Poisson Distributions }
 *  - exponential( lambda = 1 )
 *  - gamma(alpha = 1, beta = 1) [ synonyms: Gamma ]
 *
 * @param array $t_args  Template arguments as described above.
 * @param array $outputs Map of output name => output type (null entries
 *                       default to base::DOUBLE). May be empty, in which case
 *                       $t_args['outputs'] is required.
 *
 * @return array Description of the generated GI: 'kind', 'name', 'output',
 *               'system_headers', 'user_headers' and 'libraries'. The C++
 *               class itself is emitted as output while this function runs.
 */
function ClusterGen(array $t_args, array $outputs)
{
    // <tuple> is listed explicitly because the generated class uses std::tuple.
    $sys_headers = ['array', 'cinttypes', 'tuple'];
    $user_headers = [];
    $libraries = [];

    // No implicit outputs: build them from the 'outputs' template argument.
    if (\count($outputs) == 0) {
        grokit_assert(array_key_exists('outputs', $t_args), 'ClusterGen: No outputs specified');

        $count = 0;
        foreach ($t_args['outputs'] as $type) {
            if (is_identifier($type)) {
                $type = lookupType($type);
            }

            grokit_assert(is_datatype($type), 'ClusterGen: Non data-type ' . $type . ' given as output');

            $name = 'output' . $count++;
            $outputs[$name] = $type;
        }
    }

    foreach ($outputs as $name => &$type) {
        if (is_null($type)) {
            $type = lookupType('base::DOUBLE');
        } else {
            grokit_assert($type->is('real'), 'ClusterGen: Non-real datatype ' . $type . ' given as output');
        }
    }
    // Break the reference. Without this, $type stays bound to the last element
    // of $outputs and every later by-value loop that reuses the name $type
    // (there are several below) silently overwrites that element.
    unset($type);

    $tSize = \count($outputs);

    $seed = get_default($t_args, 'seed', null);
    if ($seed !== null) {
        grokit_assert(is_int($seed), 'ClusterGen: Seed must be an integer or null.');
    }

    // The generated constructor calls CongruentHash() only when a seed IS
    // given, but this header used to be requested only when NO seed was given.
    // It is now requested unconditionally so the seeded path gets it while the
    // unseeded path keeps the include it always had.
    // NOTE(review): presumably HashFunctions.h declares CongruentHash - confirm.
    $user_headers[] = 'HashFunctions.h';

    $distLib = get_default($t_args, 'dist.lib', 'std');
    $distNS = '';

    switch ($distLib) {
        case 'std':
            $sys_headers[] = 'random';
            $distNS = 'std';
            break;
        case 'boost':
            $sys_headers[] = 'boost/random.hpp';
            $distNS = 'boost::random';
            $libraries[] = 'boost_random-mt';
            if ($seed === null) {
                // Need random_device
                $sys_headers[] = 'boost/random/random_device.hpp';
                $libraries[] = 'boost_system-mt';
            }
            break;
        default:
            grokit_error('ClusterGen: Unknown RNG library ' . $distLib);
    }

    $distRNG = 'mt19937';
    $RNGtype = $distNS . '::' . $distRNG;

    $nTuples = get_first_key($t_args, ['n', '0']);
    grokit_assert(is_int($nTuples), 'ClusterGen: the number of tuples to be produced must be an integer.');
    // N is emitted as a size_t constant, so a negative count is meaningless.
    grokit_assert($nTuples >= 0, 'ClusterGen: the number of tuples to be produced must not be negative.');

    $centers = get_first_key($t_args, ['centers', 1]);
    grokit_assert(is_array($centers), 'ClusterGen: centers must be an array of functors');
    // Each center entry fills one tuple column; extra entries would index past
    // the end of the generated std::tuple.
    grokit_assert(
        \count($centers) <= $tSize,
        'ClusterGen: ' . \count($centers) . ' center descriptions given for ' . $tSize . ' outputs'
    );

    // Maps a distribution functor (name + args) to the C++ distribution type
    // name and the list of constructor arguments for it.
    $handleDist = function ($name, $args, $oType) use ($distNS) {
        $distName = '';
        $distArgs = [];

        switch ($name) {
            case 'gaussian':
            case 'normal':
                $distName = $distNS . '::' . 'normal_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 2, 'ClusterGen: Normal distribution takes at most 2 arguments, ' . \count($args) . ' given');

                $mean = get_default($args, ['mean', 0], 0.0);
                $sigma = get_default($args, ['std_dev', 'sigma', 1], 1.0);

                grokit_assert(is_numeric($mean), 'ClusterGen: mean parameter of normal distribution must be a real number.');
                grokit_assert(is_numeric($sigma), 'ClusterGen: sigma parameter of normal distribution must be a real number.');

                $mean = floatval($mean);
                $sigma = floatval($sigma);

                $distArgs = [$mean, $sigma];
                break;

            case 'binomial':
                // NOTE(review): the outputs are asserted to be real types above,
                // while the C++ binomial distributions are specified for integer
                // result types - confirm this instantiation is intended.
                $distName = $distNS . '::' . 'binomial_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 2, 'ClusterGen: Binomial distribution takes at most 2 arguments, ' . \count($args) . ' given');

                $t = get_default($args, ['t', 0], 1);
                $p = get_default($args, ['p', 1], 0.5);

                grokit_assert(is_int($t), 'ClusterGen: t parameter of binomial distribution must be an integer.');
                grokit_assert(is_numeric($p), 'ClusterGen: p parameter of binomial distribution must be a real number.');

                $p = floatval($p);

                grokit_assert($p >= 0 && $p <= 1, 'ClusterGen: p parameter of binomial distribution must be in the range [0, 1]');
                grokit_assert($t >= 0, 'ClusterGen: t parameter of binomial distribution must be in the range [0, +inf)');

                $distArgs = [$t, $p];
                break;

            case 'negative_binomial':
                $distName = $distNS . '::' . 'negative_binomial_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 2, 'ClusterGen: Negative Binomial distribution takes at most 2 arguments, ' . \count($args) . ' given');

                $k = get_default($args, ['k', 0], 1);
                $p = get_default($args, ['p', 1], 0.5);

                grokit_assert(is_int($k), 'ClusterGen: k parameter of negative binomial distribution must be an integer.');
                grokit_assert(is_numeric($p), 'ClusterGen: p parameter of negative binomial distribution must be a real number.');

                $p = floatval($p);

                grokit_assert($p > 0 && $p <= 1, 'ClusterGen: p parameter of negative binomial distribution must be in the range (0, 1]');
                grokit_assert($k > 0, 'ClusterGen: k parameter of negative binomial distribution must be in the range (0, +inf)');

                $distArgs = [$k, $p];
                break;

            case 'inverse_gaussian':
            case 'inverse_normal':
                grokit_assert(\count($args) <= 2, 'ClusterGen: Inverse Gaussian distribution takes at most 2 arguments, ' . \count($args) . ' given');

                $mean = get_default($args, ['mean', 0], 1);
                $shape = get_default($args, ['shape', 1], 1);

                grokit_assert(is_numeric($mean), 'ClusterGen: mean parameter of inverse gaussian distribution must be a real number.');
                grokit_assert(is_numeric($shape), 'ClusterGen: shape parameter of inverse gaussian distribution must be a real number.');

                $mean = floatval($mean);
                $shape = floatval($shape);

                grokit_assert($mean > 0, 'ClusterGen: mean of inverse gaussian distribution must be in range (0, inf)');
                grokit_assert($shape > 0, 'ClusterGen: shape of inverse gaussian distribution must be in range (0, inf)');

                // Not a stock distribution: it is provided by a separate
                // generated resource, parameterised by output type and namespace.
                $gen_args = ['output' => $oType, 'ns' => $distNS];
                $distName = strval(lookupResource('datagen::InverseGaussianGen', $gen_args));
                $distArgs = [$mean, $shape];
                break;

            case 'uniform':
                $distName = $distNS . '::' . 'uniform_real_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 2, 'ClusterGen: Uniform distribution takes at most 2 arguments, ' . \count($args) . ' given');

                $a = get_default($args, ['a', 0], 0.0);
                $b = get_default($args, ['b', 1], 1.0);

                grokit_assert(is_numeric($a), 'ClusterGen: `a` parameter of uniform distribution must be a real number.');
                grokit_assert(is_numeric($b), 'ClusterGen: `b` parameter of uniform distribution must be a real number.');

                $a = floatval($a);
                $b = floatval($b);

                grokit_assert($b >= $a, 'ClusterGen: `b` parameter of uniform distribution must be >= the `a` parameter.');

                $distArgs = [$a, $b];
                break;

            case 'exponential':
                $distName = $distNS . '::' . 'exponential_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 1, 'ClusterGen: Exponential distribution takes at most 1 argument.');

                $lambda = get_default($args, ['lambda', 0], 1.0);

                grokit_assert(is_numeric($lambda), 'ClusterGen: `lambda` parameter of exponential distribution must be a real number.');

                $lambda = floatval($lambda);

                grokit_assert($lambda > 0, 'ClusterGen: `lambda` parameter of exponential distribution must be in range (0, +inf).');

                $distArgs = [$lambda];
                break;

            case 'gamma':
            case 'Gamma':
                $distName = $distNS . '::' . 'gamma_distribution<' . $oType . '>';
                grokit_assert(\count($args) <= 2, 'ClusterGen: Gamma distribution takes at most 2 arguments.');

                $alpha = get_default($args, ['alpha', 0], 1.0);
                // Was misspelled `det_default`, an undefined function.
                $beta = get_default($args, ['beta', 1], 1.0);

                grokit_assert(is_numeric($alpha), 'ClusterGen: `alpha` parameter of gamma distribution must be a real number.');
                grokit_assert(is_numeric($beta), 'ClusterGen: `beta` parameter of gamma distribution must be a real number.');

                $alpha = floatval($alpha);
                $beta = floatval($beta);

                // Both parameters must be strictly positive for the C++
                // gamma distributions.
                grokit_assert($alpha > 0, 'ClusterGen: `alpha` parameter of gamma distribution must be in range (0, +inf).');
                grokit_assert($beta > 0, 'ClusterGen: `beta` parameter of gamma distribution must be in range (0, +inf).');

                $distArgs = [$alpha, $beta];
                break;

            default:
                grokit_error('ClusterGen: Unknown distribution ' . $name . ' given for center');
        }

        return [$distName, $distArgs];
    };

    // $dists:    column name => [ member name => C++ distribution type ]
    // $distArgs: column name => [ member name => constructor arguments ]
    $dists = [];
    $distArgs = [];
    $nCenters = 1;

    // Output types by column position; the i-th center entry describes the
    // i-th output column.
    $outputTypes = array_values($outputs);
    $column = 0;

    foreach ($centers as $val) {
        $cluster = $val;
        if (is_functor($val)) {
            $cluster = [$val];
        } elseif (is_array($val)) {
            // An empty list would make the LCM zero and divide by zero below.
            grokit_assert(\count($val) > 0, 'ClusterGen: a list of center functors must not be empty');
            $nCenters = lcm($nCenters, \count($val));
        } else {
            grokit_error('ClusterGen: center descriptions must be functors or list of functors');
        }

        $curDist = [];
        $curDistArgs = [];
        $curDistName = 'distribution' . $column;
        $oType = strval($outputTypes[$column]);
        $column++;

        $iCount = 0;
        foreach ($cluster as $functor) {
            grokit_assert(is_functor($functor), 'ClusterGen: center description must be a functor');

            $vName = $curDistName . '_' . $iCount++;
            $ret = $handleDist($functor->name(), $functor->args(), $oType);

            $curDist[$vName] = $ret[0];
            $curDistArgs[$vName] = $ret[1];
        }

        $dists[$curDistName] = $curDist;
        $distArgs[$curDistName] = $curDistArgs;
    }

    // Determine the default number of sets to compute at a time.
    // We want to generate either $nTuples or 10,000 tuples, depending on which
    // is less. At least one set is always cached so that n = 0 still yields a
    // valid default.
    $defaultSetsTarget = min($nTuples, 10000);
    $setsToTarget = max(1, intval(ceil($defaultSetsTarget / $nCenters)));

    $computeSets = get_default($t_args, 'compute.sets', $setsToTarget);
    grokit_assert(is_int($computeSets) && $computeSets > 0, 'ClusterGen: compute.sets must be a positive integer, ' . $computeSets . ' given');

    $className = generate_name('ClusterGen');
?>

class <?php echo $className; ?> {

    // The number of tuples to produce per task
    static constexpr size_t N = <?php echo $nTuples; ?> ;
    static constexpr size_t CacheSize = <?php echo $computeSets * $nCenters; ?> ;

    // Typedefs
    typedef std::tuple<<?php echo array_template('{val}', ', ', $outputs); ?> > Tuple;
    typedef std::array<Tuple, CacheSize> TupleArray;
    typedef TupleArray::const_iterator TupleIterator;
    typedef <?php echo $RNGtype; ?> RandGen;

    // Number of tuples produced.
    uintmax_t count;

    // Cache a number of outputs for efficiency
    TupleArray cache;
    TupleIterator cacheIt;

    // Random number generator
    RandGen rng;

    // Distributions
<?php
    foreach ($dists as $name => $list) {
        foreach ($list as $vName => $type) {
?>
    <?php echo $type; ?> <?php echo $vName; ?> ;
<?php
        } // foreach distribution
    } // foreach cluster set
?>

    // Helper function to generate tuples.
    void GenerateTuples(void) {
<?php
    $tIndex = 0;
    foreach ($dists as $name => $list) {
        $lCenters = \count($list);
        // $nCenters has been defined to be the LCM of the number of centers in
        // any column, so $lCenters is guaranteed to divide evenly into
        // CacheSize
?>
        for( size_t index = 0; CacheSize > index; index += <?php echo $lCenters; ?> ) {
<?php
        $index = 0;
        foreach ($list as $vName => $type) {
?>
            std::get<<?php echo $tIndex; ?> >(cache[index + <?php echo $index; ?> ]) = <?php echo $vName; ?> (rng);
<?php
            $index++;
        } // foreach value in tuple
?>
        }
<?php
        $tIndex++;
    } // foreach distribution
?>
        cacheIt = cache.cbegin();
    }

public:
    // Constructor
    // (initializers are listed in member declaration order)
    <?php echo $className; ?> ( GIStreamProxy & _stream ) :
        count(0)
        , cache()
        , cacheIt()
        , rng()
<?php
    foreach ($dists as $name => $list) {
        foreach ($list as $vName => $type) {
?>
        , <?php echo $vName; ?> (<?php echo implode(', ', $distArgs[$name][$vName]); ?> )
<?php
        } // foreach distribution
    } // foreach cluster set
?>
    {
<?php
    if (is_null($seed)) {
?>
        <?php echo $distNS; ?> ::random_device rd;
<?php
    } // if seed is null
?>
        RandGen::result_type seed = <?php echo is_null($seed) ? 'rd()' : "CongruentHash({$seed}, _stream.get_id() )"; ?> ;
        rng.seed(seed);

        // Start with an exhausted cache so the first ProduceTuple refills it.
        cacheIt = cache.cend();
    }

    // Destructor
    ~<?php echo $className; ?> (void) { }

    bool ProduceTuple(<?php echo typed_ref_args($outputs); ?> ) {
        if( N > count ) {
            if( cacheIt == cache.cend() ) {
                GenerateTuples();
            }
<?php
    $tIndex = 0;
    foreach ($outputs as $name => $type) {
?>
            <?php echo $name; ?> = std::get<<?php echo $tIndex; ?> >(*cacheIt);
<?php
        $tIndex++;
    } // foreach output
?>

            ++cacheIt;
            ++count;

            return true;
        }
        else {
            return false;
        }
    }
};

<?php
    return [
        'kind' => 'GI',
        'name' => $className,
        'output' => $outputs,
        'system_headers' => $sys_headers,
        'user_headers' => $user_headers,
        'libraries' => $libraries,
    ];
}
/**
 * Computes a SHA-256 hex digest for a (possibly nested) value.
 *
 * Arrays are hashed in key-sorted order, recursing into each element.
 * GLA / GF / GT / GIST / GI / datatype / functor / identifier values
 * contribute a kind tag followed by their own hash() result. Attributes
 * contribute their string form, scalars their type name and value, and null
 * a fixed marker. Any other value is reported through grokit_logic_error().
 *
 * @param mixed $val The value to hash.
 *
 * @return string The hex digest.
 */
function hashComplex($val)
{
    $ctx = hash_init('sha256');

    // Arrays: bracketed sequence of "key=>hash(value)" in sorted key order.
    if (is_array($val)) {
        // $val is a by-value copy, so sorting does not affect the caller.
        ksort($val);

        hash_update($ctx, '[');
        foreach ($val as $key => $element) {
            hash_update($ctx, $key);
            hash_update($ctx, '=>');
            hash_update($ctx, hashComplex($element));
        }
        hash_update($ctx, ']');

        return hash_final($ctx);
    }

    // Values that know how to hash themselves; the first matching kind wins.
    $kindTag = null;
    if (is_gla($val)) {
        $kindTag = 'gla';
    } elseif (is_gf($val)) {
        $kindTag = 'gf';
    } elseif (is_gt($val)) {
        $kindTag = 'gt';
    } elseif (is_gist($val)) {
        $kindTag = 'gist';
    } elseif (is_gi($val)) {
        $kindTag = 'gi';
    } elseif (is_datatype($val)) {
        $kindTag = 'datatype';
    } elseif (is_functor($val)) {
        $kindTag = 'functor';
    } elseif (is_identifier($val)) {
        $kindTag = 'identifier';
    }

    if ($kindTag !== null) {
        hash_update($ctx, $kindTag);
        hash_update($ctx, $val->hash());

        return hash_final($ctx);
    }

    if (is_attribute($val)) {
        hash_update($ctx, 'attribute');
        hash_update($ctx, strval($val));

        return hash_final($ctx);
    }

    $isScalar = is_int($val) || is_float($val) || is_string($val) || is_bool($val);
    if ($isScalar) {
        // The type name is mixed in so equal-looking values of different
        // scalar types produce different digests.
        hash_update($ctx, gettype($val));
        hash_update($ctx, $val);

        return hash_final($ctx);
    }

    if (is_null($val)) {
        hash_update($ctx, 'null');

        return hash_final($ctx);
    }

    // Nothing matched: report the offending type.
    if (is_object($val)) {
        $valType = get_class($val);
    } else {
        $valType = gettype($val);
    }
    grokit_logic_error('Unable to hash unknown type ' . $valType);

    return hash_final($ctx);
}