/**
 * A GLA that estimates the cardinality of a dataset using the HyperLogLog
 * algorithm, with a configurable number of bins.
 *
 * Template arguments read from $t_args:
 *  - 'bins.exponent'   (int, default 4): log2 of the number of registers;
 *                      must lie in [4, 24].
 *  - 'use.builtin.ctz' (bool, default true): count trailing zeros with
 *                      __builtin_ctzl instead of the generated ctz() method.
 *  - 'debug'           (int, default 0): when positive, <iostream> is added to
 *                      the system headers of the generated code.
 *
 * @param array $t_args template arguments (see above)
 * @param array $input  input names mapped to their types; every input is
 *                      folded into a single hash value per tuple
 * @param array $output exactly one output; a null type defaults to BASE::BIGINT
 *
 * @return array description of the generated GLA
 */
function HyperLogLog(array $t_args, array $input, array $output)
{
    $debug = get_default($t_args, 'debug', 0);

    grokit_assert(\count($output) == 1, 'HyperLogLog produces only 1 value, ' . \count($output) . ' outputs given.');

    $outputName = array_keys($output)[0];
    $outputType = array_get_index($output, 0);
    if (is_null($outputType)) {
        $outputType = lookupType('BASE::BIGINT');
    }
    $output[$outputName] = $outputType;

    grokit_assert($outputType->is('numeric'), 'HyperLogLog output must be numeric!');

    $exp = get_first_key_default($t_args, ['bins.exponent'], 4);
    grokit_assert(is_integer($exp), 'HyperLogLog bins.exponent must be an integer');
    // Set limit of 2^24 bins (one byte each, i.e. 16MB), because states past
    // 16MB start to get silly. The bound is inclusive, as the message states.
    grokit_assert($exp >= 4 && $exp <= 24, 'HyperLogLog bins.exponent must be in range [4, 24]');

    $useBuiltinCtz = get_default($t_args, 'use.builtin.ctz', true);
    $ctzFunc = $useBuiltinCtz ? '__builtin_ctzl' : 'ctz';

    $bins = pow(2, $exp);

    // Determine the value of alpha based on $exp
    switch ($exp) {
        case 4:
            $alpha = 0.673;
            break;
        case 5:
            $alpha = 0.697;
            break;
        case 6:
            $alpha = 0.709;
            break;
        default:
            $alpha = 0.7213 / (1 + 1.079 / $bins);
    }

    $className = generate_name('HyperLogLog');
?>
class <?php echo $className; ?> {
    // Number of bins for registers
    static constexpr const size_t NUM_BINS = <?php echo $bins; ?>;

    // Number of bits used to index into registers, log2(NUM_BINS)
    static constexpr const size_t INDEX_BITS = <?php echo $exp; ?>;

    // Number of hash bits that remain once the register index is removed
    static constexpr const size_t VALUE_BITS = 64 - INDEX_BITS;

    // Mask used to obtain register index from hash value
    static constexpr const size_t INDEX_MASK = NUM_BINS - 1;

    // Alpha coefficient used to correct cardinality estimate. Based on NUM_BINS.
    static constexpr const long double ALPHA = <?php echo $alpha; ?>;

    // Value of cardinality estimate after which we must apply the
    // large range correction
    static constexpr const long double LARGE_BREAKPOINT = (1.0 / 30.0) * <?php echo pow(2, 32); ?>;

    // Constants for population count
    static constexpr const uint64_t m1  = 0x5555555555555555;
    static constexpr const uint64_t m2  = 0x3333333333333333;
    static constexpr const uint64_t m4  = 0x0f0f0f0f0f0f0f0f;
    static constexpr const uint64_t h01 = 0x0101010101010101;

    // The registers
    std::array<unsigned char, NUM_BINS> registers;

    // A count used to remember how many tuples were processed, mostly for debugging.
    size_t count;

public:

    <?php echo $className; ?>(void) : registers(), count(0) {
        for( auto & elem : registers ) {
            elem = 0;
        }
    }

    ~<?php echo $className; ?>() { }

    int popcount(uint64_t x) {
        // Put count of each 2 bits into those 2 bits
        x -= (x >> 1) & m1;
        // Put count of each 4 bits into those 4 bits
        x = (x & m2) + ((x >> 2) & m2);
        // Put count of each 8 bits into those 8 bits
        x = (x + (x >> 4)) & m4;
        // Returns left 8 bits of x + (x << 8) + (x << 16) + ...
        return (x * h01) >> 56;
    }

    int ctz(int64_t x) {
        return popcount((x & -x) - 1);
    }

    void AddItem( <?php echo const_typed_ref_args($input); ?> ) {
        count++;

        uint64_t hashVal = H_b;
<?php foreach ($input as $name => $type) { ?>
        hashVal = CongruentHash(Hash(<?php echo $name; ?>), hashVal);
<?php } // for each input ?>

        const size_t registerIndex = hashVal & INDEX_MASK;
        const uint64_t value = hashVal >> INDEX_BITS;

        // HyperLogLog stores the rank of the hash: the 1-based position of its
        // first set bit, i.e. the number of trailing zeros plus one. Keeping
        // the "+ 1" means a register value of 0 can only mean "never touched",
        // which is what the small range correction in GetResult relies on.
        // An all-zero value has no set bit (and the builtin is undefined for
        // 0), so it is given the largest possible rank explicitly.
        const unsigned char rank = (value == 0)
            ? static_cast<unsigned char>(VALUE_BITS + 1)
            : static_cast<unsigned char>(<?php echo $ctzFunc; ?>(value) + 1);

        unsigned char & registerValue = registers[registerIndex];
        registerValue = registerValue > rank ? registerValue : rank;
    }

    void AddState( <?php echo $className; ?> & other ) {
        count += other.count;

        // The union of two sketches is the element-wise maximum of registers.
        for( size_t i = 0; i < NUM_BINS; i++ ) {
            unsigned char & rVal = registers[i];
            unsigned char & oVal = other.registers[i];
            rVal = rVal > oVal ? rVal : oVal;
        }
    }

    void GetResult( <?php echo $outputType; ?> & <?php echo $outputName; ?> ) {
        // Compute harmonic sum of registers and correct by alpha
        long double cardEst = 0;
        size_t nZeroRegisters = 0;
        for( auto elem : registers ) {
            long double power = - static_cast<long double>(elem);
            cardEst += std::pow(2.0, power);

            if( elem == 0 )
                nZeroRegisters++;
        }

        const long double nBins = static_cast<long double>(NUM_BINS);
        const long double zeroBins = static_cast<long double>(nZeroRegisters);

        cardEst = 1 / cardEst;
        cardEst *= ALPHA * nBins * nBins;

        long double cardinality = cardEst;

        if( cardEst < 2.5 * NUM_BINS ) {
            // Possible small range correction
            if( nZeroRegisters > 0 ) {
                // Small range correction
                cardinality = nBins * std::log(nBins / zeroBins);
            }
        }
        // TODO: Figure out if the large range correction is needed for 64-bit
        // hashes.

        <?php echo $outputName; ?> = cardinality;
    }
};
<?php
    $system_headers = ['cmath', 'array', 'cinttypes'];
    if ($debug > 0) {
        $system_headers[] = 'iostream';
    }

    return [
        'kind'           => 'GLA',
        'name'           => $className,
        'input'          => $input,
        'output'         => $output,
        'result_type'    => 'single',
        'user_headers'   => ['HashFunctions.h'],
        'system_headers' => $system_headers,
    ];
}
/**
 * A GLA that estimates the cardinality of a dataset using a bloom filter of
 * a configurable size.
 *
 * Note: This filter has very high performance, so long as all of the states
 * fit into cache, preferably L1 or L2, but L3 is also fine. Once the states
 * are large enough that all of them cannot fit inside L3 cache at the same
 * time, performance takes a nose dive (4x loss minimum).
 *
 * Template arguments read from $t_args:
 *  - 'exponent'   (int, default 16): log2 of the number of bits in the filter;
 *                 must lie in (0, 64).
 *  - 'null.check' (bool or list of input names, default false): inputs for
 *                 which a tuple is ignored when the input is null. A boolean
 *                 applies to every input.
 *  - 'debug'      (int, default 0): when positive, GetResult prints its
 *                 intermediate values to std::cout.
 *
 * @param array $t_args template arguments (see above)
 * @param array $input  input names mapped to their types
 * @param array $output exactly one output; a null type defaults to BASE::BIGINT
 *
 * @return array description of the generated GLA
 */
function BloomFilter(array $t_args, array $input, array $output)
{
    grokit_assert(\count($output) == 1, 'BloomFilter produces only 1 value, ' . \count($output) . ' outputs given.');

    $outputName = array_keys($output)[0];
    $outputType = array_get_index($output, 0);
    if (is_null($outputType)) {
        $outputType = lookupType('BASE::BIGINT');
    }
    $output[$outputName] = $outputType;

    grokit_assert($outputType->is('numeric'), 'BloomFilter output must be numeric!');

    $exp = get_first_key_default($t_args, ['exponent'], 16);
    grokit_assert(is_integer($exp), 'BloomFilter exponent must be an integer.');
    grokit_assert($exp > 0 && $exp < 64, 'BloomFilter exponent must be in range (0,64), ' . $exp . ' given.');

    // Work out, per input, whether AddItem has to test it for null.
    $nullCheck = get_default($t_args, 'null.check', false);
    $nullable = [];
    if (is_bool($nullCheck)) {
        foreach ($input as $name => $type) {
            $nullable[$name] = $nullCheck;
        }
    } elseif (is_array($nullCheck)) {
        foreach ($input as $name => $type) {
            $nullable[$name] = false;
        }

        foreach ($nullCheck as $index => $n) {
            grokit_assert(is_string($n), 'BloomFilter null.check has invalid value at position ' . $index);
            grokit_assert(array_key_exists($n, $nullable), 'BloomFilter null.check has unknown input ' . $n . ' at position ' . $index);

            $nullable[$n] = true;
        }
    } else {
        grokit_error('BloomFilter null.check must be boolean or list of inputs to check for nulls');
    }

    $debug = get_default($t_args, 'debug', 0);

    $bits = pow(2, $exp);
    // Cast to int so the value is always echoed as a plain integer literal.
    $bytes = (int) ceil($bits / 8.0);

    // Calculate the number of bits set for every possible value of a byte
    $nBits = [];
    for ($i = 0; $i < 256; $i++) {
        $n = $i;
        $b = 0;
        while ($n > 0) {
            // Clears the lowest set bit, so the loop runs once per set bit.
            $n &= $n - 1;
            $b++;
        }
        $nBits[$i] = $b;
    }

    $className = generate_name('BloomFilter');
?>
class <?php echo $className; ?> {
    static constexpr size_t BITS = <?php echo $bits; ?>;
    static constexpr size_t BYTES = <?php echo $bytes; ?>;
    static constexpr size_t MASK = BITS - 1;
    static constexpr std::array<unsigned char, 256> BITS_SET = { <?php echo implode(', ', $nBits); ?> };
    static constexpr std::array<unsigned char, 8> BIT_MASKS = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    };

    // Number of tuples passed to AddItem (including those skipped as null).
    size_t count;

    std::array<unsigned char, BYTES> set;

public:
    <?php echo $className; ?>() : count(0), set() {
        for( size_t i = 0; i < BYTES; i++ ) {
            set[i] = 0;
        }
    }

    ~<?php echo $className; ?>() { }

    void AddItem( <?php echo const_typed_ref_args($input); ?> ) {
        count++;
<?php
    foreach ($nullable as $name => $check) {
        if ($check) {
?>
        if( IsNull( <?php echo $name; ?> ) ) return;
<?php
        } // if checking for nulls
    } // foreach input
?>
        size_t hashVal = H_b;
<?php foreach ($input as $name => $type) { ?>
        hashVal = CongruentHash(Hash(<?php echo $name; ?>), hashVal);
<?php } // foreach input ?>
        hashVal = hashVal & MASK;
        const size_t bucket = hashVal >> 3;
        const size_t bucket_index = hashVal & 0x07;
        const unsigned char mask = BIT_MASKS[bucket_index];
        set[bucket] |= mask;
    }

    void AddState( <?php echo $className; ?> & o ) {
        count += o.count;
        for( size_t i = 0; i < BYTES; i++ ) {
            set[i] |= o.set[i];
        }
    }

    void GetResult( <?php echo $outputType; ?> & <?php echo $outputName; ?> ) {
        size_t nBitsSet = 0;
        constexpr long double bits = static_cast<long double>(BITS);
        for( size_t i = 0; i < BYTES; i++ ) {
            nBitsSet += BITS_SET[set[i]];
        }
        long double bitsSet = static_cast<long double>(nBitsSet);

        // Declared at function scope so that it is visible to the optional
        // debug output below, whichever branch is taken.
        long double estimate = static_cast<long double>(count);

        if( nBitsSet == BITS ) {
            // All Bits set, just give the cardinality as an estimate.
            <?php echo $outputName; ?> = count;
        } else {
            estimate = - bits * std::log(1 - (bitsSet / bits));
            <?php echo $outputName; ?> = estimate;
        }

<?php if ($debug > 0) { ?>
        std::cout << "BloomFilter:"
            << " bitsSet(" << bitsSet << ")"
            << " bits(" << bits << ")"
            << " cardinality(" << estimate << ")"
            << " output(" << <?php echo $outputName; ?> << ")"
            << std::endl;
<?php } // if debugging enabled ?>
    }
};

// Storage for static members
constexpr std::array<unsigned char, 256> <?php echo $className; ?>::BITS_SET;
constexpr std::array<unsigned char, 8> <?php echo $className; ?>::BIT_MASKS;
<?php
    $system_headers = ['cmath', 'array'];
    if ($debug > 0) {
        $system_headers[] = 'iostream';
    }

    return [
        'kind'           => 'GLA',
        'name'           => $className,
        'input'          => $input,
        'output'         => $output,
        'result_type'    => 'single',
        'user_headers'   => ['HashFunctions.h'],
        'system_headers' => $system_headers,
    ];
}
/**
 * Generates a categorical type whose set of levels is fixed at generation
 * time and compiled into the type as two static lookup tables.
 *
 * Template arguments read from $t_args:
 *  - 'dict': a map of id => name used as-is, OR
 *  - 'values' (or positional 0): a list of names that are numbered
 *    consecutively, beginning at 'start.at' (default 0).
 *
 * The largest value of the chosen storage type is reserved as InvalidID, so
 * the storage type is picked such that every level id is strictly smaller
 * than that value.
 *
 * @param array $t_args template arguments (see above)
 *
 * @return array description of the generated TYPE
 */
function CATEGORY(array $t_args)
{
    // $nextID is always one past the largest id in use, in both branches.
    if (array_key_exists('dict', $t_args)) {
        $values = $t_args['dict'];
        $nextID = 0;
        foreach ($values as $id => $val) {
            $nextID = \max($id + 1, $nextID);
        }
    } else {
        $old_vals = get_first_key($t_args, ['values', 0]);
        $startAt = get_first_key_default($t_args, ['start.at'], 0);

        $values = [];
        $nextID = $startAt;
        foreach ($old_vals as $ind => $val) {
            $values[$nextID++] = $val;
        }
    }

    $cardinality = \count($values);

    // Ids 0 .. $nextID - 1 must all be smaller than InvalidID, which is the
    // maximum of the storage type; hence $nextID itself must fit in the type.
    // Integer thresholds are used instead of log() to stay exact at the
    // boundaries.
    if ($nextID <= 0xFF) {
        $storageType = 'uint8_t';
        $storageBytes = 1;
    } elseif ($nextID <= 0xFFFF) {
        $storageType = 'uint16_t';
        $storageBytes = 2;
    } elseif ($nextID <= 0xFFFFFFFF) {
        $storageType = 'uint32_t';
        $storageBytes = 4;
    } else {
        $storageType = 'uint64_t';
        $storageBytes = 8;
    }

    $className = generate_name('CATEGORY');
    $stringType = lookupType('base::STRING');

    // Constructors, methods and functions exposed to the type system. Each
    // entry corresponds to a declaration in the generated code below.
    $constructors = [
        [['base::STRING_LITERAL'], true],
        [['base::STRING'], true],
        [['BASE::NULL'], true],
    ];
    $methods = [
        ['ToString', [], 'base::STRING_LITERAL', true],
        ['Invalid', [], 'base::bool', true],
        ['Valid', [], 'base::bool', true],
    ];
    $functions = [
        ['Hash', ['@type'], 'base::BIGINT', true, true],
        ['IsNull', ['@type'], 'BASE::BOOL', true, true],
    ];
?>
class <?php echo $className; ?> {
public:
    typedef <?php echo $storageType; ?> StorageType;
    typedef std::unordered_map<StorageType, std::string> IDToNameMap;
    typedef std::unordered_map<std::string, StorageType> NameToIDMap;

    static const StorageType InvalidID __attribute__((weak));

private:
    static const IDToNameMap idToName __attribute__((weak));
    static const NameToIDMap nameToID __attribute__((weak));

    // The ID of this categorical variable
    StorageType myID;

public:

    /* ----- Constructors / Destructor ----- */
    <?php echo $className; ?>( void );

    <?php echo $className; ?>( const char * );
    <?php echo $className; ?>( const <?php echo $stringType; ?> & );
    <?php echo $className; ?>( const <?php echo $storageType; ?> );
    <?php echo $className; ?>( const <?php echo $className; ?> & );
    <?php echo $className; ?>( const GrokitNull & );

    <?php echo $className; ?> & operator =( const <?php echo $className; ?> & ) = default;

    ~<?php echo $className; ?>(void) {}

    /* ----- Methods ----- */
    void FromString( const char * );

    const char * ToString( void ) const;

    StorageType GetID( void ) const;
    void SetID( StorageType id );

    // Determines whether or not the category is valid.
    bool Invalid(void) const;
    bool Valid(void) const;

    /* ----- Operators ----- */
    bool operator ==( const <?php echo $className; ?> & ) const;
    bool operator !=( const <?php echo $className; ?> & ) const;
    bool operator <( const <?php echo $className; ?> & ) const;
    bool operator <=( const <?php echo $className; ?> & ) const;
    bool operator >( const <?php echo $className; ?> & ) const;
    bool operator >=( const <?php echo $className; ?> & ) const;

    // Implicit conversion to storage type
    operator <?php echo $storageType; ?>() const;

    // To/From Json
    void toJson( Json::Value & dest ) const;
    void fromJson( const Json::Value & src );
};

/* ----- Constructors ----- */

inline
<?php echo $className; ?> :: <?php echo $className; ?>( void ) :
    myID(InvalidID)
{ }

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const char * str ) {
    FromString(str);
}

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const <?php echo $stringType; ?> & str ) {
    FromString(str.ToString());
}

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const <?php echo $storageType; ?> val ) :
    myID(val)
{ }

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const <?php echo $className; ?> & other ) :
    myID(other.myID)
{ }

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const GrokitNull & nullval ) :
    myID(InvalidID)
{ }

/* ----- Methods ----- */

inline
void <?php echo $className; ?> :: FromString( const char * str ) {
    auto it = nameToID.find(str);
    if( it != nameToID.end() ) {
        myID = it->second;
    } else {
        myID = InvalidID;
    }
}

inline
const char * <?php echo $className; ?> :: ToString( void ) const {
    auto it = idToName.find(myID);
    if( it != idToName.end() ) {
        return it->second.c_str();
    } else {
        return "NULL";
    }
}

inline
auto <?php echo $className; ?> :: GetID( void ) const -> StorageType {
    return myID;
}

inline
void <?php echo $className; ?> :: SetID( StorageType id ) {
    myID = id;
}

inline
bool <?php echo $className; ?> :: Valid(void) const {
    return idToName.count(myID) > 0;
}

inline
bool <?php echo $className; ?> :: Invalid(void) const {
    return ! Valid();
}

/* ----- Operators ----- */

inline
bool <?php echo $className; ?> :: operator ==( const <?php echo $className; ?> & other ) const {
    return myID == other.myID;
}

inline
bool <?php echo $className; ?> :: operator !=( const <?php echo $className; ?> & other ) const {
    return myID != other.myID;
}

inline
bool <?php echo $className; ?> :: operator <( const <?php echo $className; ?> & other ) const {
    return myID < other.myID;
}

inline
bool <?php echo $className; ?> :: operator >( const <?php echo $className; ?> & other ) const {
    return myID > other.myID;
}

inline
bool <?php echo $className; ?> :: operator <=( const <?php echo $className; ?> & other ) const {
    return myID <= other.myID;
}

inline
bool <?php echo $className; ?> :: operator >=( const <?php echo $className; ?> & other ) const {
    return myID >= other.myID;
}

// To/From Json
inline
void <?php echo $className; ?> :: toJson( Json::Value & dest ) const {
    dest = (Json::Int64) myID;
}

inline
void <?php echo $className; ?> :: fromJson( const Json::Value & src ) {
    myID = (StorageType) src.asInt64();
}

inline
<?php echo $className; ?> :: operator <?php echo $storageType; ?>() const {
    return myID;
}

<?php
    // Everything emitted until ob_get_clean() is captured and returned as the
    // type's global content rather than written with the class itself.
    ob_start();
?>
template<>
inline
uint64_t Hash(const @type & thing) {
    return thing.GetID();
}

inline
void FromString( @type & c, const char * str ) {
    c.FromString(str);
}

inline
int ToString( const @type & c, char * buffer ) {
    const char * str = c.ToString();
    strcpy( buffer, str);
    int len = strlen(buffer);
    return len + 1;
}

inline
void ToJson( const @type & src, Json::Value & dest ) {
    src.toJson(dest);
}

inline
void FromJson( const Json::Value & src, @type & dest ) {
    dest.fromJson(src);
}

inline
bool IsNull( const @type c ) {
    return c.Invalid();
}
<?php
    $globalContents = ob_get_clean();
?>

// Initialize static values
const <?php echo $className; ?>::IDToNameMap <?php echo $className; ?> :: idToName = { <?php echo array_template('{{key},"{val}"}', ',', $values); ?> };
const <?php echo $className; ?>::NameToIDMap <?php echo $className; ?> :: nameToID = { <?php echo array_template('{"{val}",{key}}', ',', $values); ?> };
const <?php echo $className; ?>::StorageType <?php echo $className; ?> :: InvalidID = std::numeric_limits<<?php echo $className; ?>::StorageType>::max();

<?php
    return [
        'kind'             => 'TYPE',
        'name'             => $className,
        'properties'       => ['categorical'],
        'extras'           => ['cardinality' => $cardinality, 'size.bytes' => $storageBytes],
        'binary_operators' => ['==', '!=', '<', '>', '<=', '>='],
        'system_headers'   => ['cinttypes', 'unordered_map', 'string', 'cstring', 'limits'],
        'global_content'   => $globalContents,
        'complex'          => false,
        'methods'          => $methods,
        'constructors'     => $constructors,
        'functions'        => $functions,
        'describe_json'    => DescribeJson('factor', DescribeJsonStatic(['levels' => $values])),
    ];
}
/**
 * Generates a filter (GF) that lets each tuple through independently with
 * probability p.
 *
 * Template arguments read from $t_args:
 *  - 'p' (or positional 0; number, default 0.5): pass probability in [0, 1].
 *  - 'rng' (string, default 'mt19937_64'): name of the random engine type,
 *    looked up in the std namespace.
 *
 * @param array $t_args template arguments (see above)
 * @param array $inputs inputs of the filter; they are accepted by Filter()
 *                      but do not influence its result
 *
 * @return array description of the generated GF
 */
function BernoulliSample($t_args, $inputs)
{
    // The generator is assumed to come from the standard library, so only the
    // <random> header is needed and no extra libraries have to be linked.
    $namespace = 'std';
    $headers = ['random'];
    $libraries = [];

    $probability = get_first_key_default($t_args, ['p', 0], 0.5);
    $isNumber = is_float($probability) || is_integer($probability);
    grokit_assert($isNumber, "BernoulliSample: p must be a number in the range [0, 1]");
    $inRange = $probability >= 0 && $probability <= 1;
    grokit_assert($inRange, "BernoulliSample: p must be in the range [0, 1]");

    $engine = get_first_key_default($t_args, ['rng'], 'mt19937_64');

    $resourceArgs = [
        'sys_headers' => $headers,
        'libs'        => $libraries,
        'namespace'   => $namespace,
    ];
    $stateType = lookupResource('base::BernoulliSampleState', $resourceArgs);

    $filterName = generate_name('BernoulliSample');
?>

class <?php echo $filterName; ?> {
public:
    using state_type = <?php echo $stateType; ?>;
    using rng_type = <?php echo $namespace; ?>::<?php echo $engine; ?>;
    using dist_type = <?php echo $namespace; ?>::bernoulli_distribution;

private:
    rng_type rng;
    dist_type bernoulli;

public:
    const constexpr static double P = <?php echo $probability; ?>;

    <?php echo $filterName; ?>(state_type & _state):
        rng(_state()),
        bernoulli(P)
    { }

    bool Filter( <?php echo const_typed_ref_args($inputs); ?> ) {
        return bernoulli(rng);
    }
};

<?php
    $description = [];
    $description['kind'] = 'GF';
    $description['name'] = $filterName;
    $description['input'] = $inputs;
    $description['system_headers'] = $headers;
    $description['libraries'] = $libraries;
    $description['generated_state'] = $stateType;

    return $description;
}
/**
 * Generates a categorical type whose levels live in a named, run-time
 * Dictionary instead of being compiled into the type.
 *
 * Template arguments read from $t_args:
 *  - 'dictionary' / 'dict' / positional 0: name of the dictionary.
 *  - 'bytes' / positional 1 (default 2): requested storage width in bytes.
 *    The width actually used is the larger of this and the width required by
 *    the dictionary's current size; it must end up being 1, 2, 4 or 8.
 *
 * The largest value of the storage type is reserved as InvalidID and the one
 * below it is MaxID, the largest id a dictionary entry may receive.
 *
 * @param array $t_args template arguments (see above)
 *
 * @return array description of the generated TYPE
 */
function FACTOR(array $t_args)
{
    $rawDict = get_first_key($t_args, ['dictionary', 'dict', 0]);
    // Double the quotes so that we escape them in SQLite, and add backslashes
    // to them so that we escape them in C++.
    $dict = addcslashes(\grokit\doubleChars($rawDict, '"'), '"\\');

    $cardinality = \grokit\dictionarySize($rawDict);

    $storageBytes = get_first_key_default($t_args, ['bytes', 1], 2);

    // Smallest supported width able to hold the dictionary as it stands.
    // With b bytes the usable ids are 0 .. 2^(8b) - 2 because the top value is
    // InvalidID, so at most 2^(8b) - 1 entries fit (assuming entries are
    // numbered densely from 0 - TODO confirm against Dictionary).
    // Only supported widths are produced here, so a large dictionary never
    // leads to the "unsupported" error below on its own.
    if ($cardinality <= 0xFF) {
        $cardBytes = 1;
    } elseif ($cardinality <= 0xFFFF) {
        $cardBytes = 2;
    } elseif ($cardinality <= 0xFFFFFFFF) {
        $cardBytes = 4;
    } else {
        $cardBytes = 8;
    }
    $storageBytes = $cardBytes > $storageBytes ? $cardBytes : $storageBytes;

    switch ($storageBytes) {
        case 1:
            $storageType = 'uint8_t';
            break;
        case 2:
            $storageType = 'uint16_t';
            break;
        case 4:
            $storageType = 'uint32_t';
            break;
        case 8:
            $storageType = 'uint64_t';
            break;
        default:
            grokit_error('Unsupported # of bytes (' . $storageBytes . ') given for FACTOR, only 1, 2, 4, and 8 supported.');
    }

    $className = generate_name('FACTOR_' . ensure_identifier($dict));
    $stringType = lookupType('base::STRING');

    $globalContent = '';

    // Constructors, methods and functions exposed to the type system. Each
    // entry corresponds to a declaration in the generated code below.
    $constructors = [
        [['base::STRING_LITERAL'], true],
        [['base::STRING'], true],
    ];
    $methods = [
        ['ToString', [], 'base::STRING_LITERAL', true],
        ['Valid', [], 'base::bool', true],
        ['Invalid', [], 'base::bool', true],
    ];
    $functions = [
        ['Hash', ['@type'], 'base::BIGINT', true, true],
        ['IsNull', ['@type'], 'BASE::BOOL', true, true],
    ];
?>

class <?php echo $className; ?> {
public:
    typedef <?php echo $storageType; ?> StorageType;

    static const char * DictionaryName __attribute__((weak));

    static const StorageType InvalidID __attribute__((weak));

    static const StorageType MaxID __attribute__((weak));

    static const Dictionary & globalDictionary __attribute__((weak));

public:
    /* ----- Members ----- */

    // The ID of this Factor;
    StorageType myID;

    /* ----- Constructors / Destructors ----- */

    // Default constructor
    <?php echo $className; ?>( void );

    // Constructor from null (same as default)
    <?php echo $className; ?>( const GrokitNull & );

    // Constructor from C strings / string literals
    <?php echo $className; ?>( const char * );

    // Constructor from Grokit STRING type.
    <?php echo $className; ?>( const <?php echo $stringType; ?> & );

    // Constructor from storage type
    <?php echo $className; ?>( const StorageType );

    // Copy constructor and copy assignment
    // These can both be default
    <?php echo $className; ?>( const <?php echo $className; ?> & ) = default;
    <?php echo $className; ?> & operator =( const <?php echo $className; ?> & ) = default;

    // Destructor
    ~<?php echo $className; ?>() { }

    /* ----- Methods ----- */

    // Standard FromString method
    void FromString( const char * );

    // FromString method used when building the dictionaries.
    void FromString( const char *, Dictionary & );

    // Looks up the factor in the global dictionary and returns the string
    const char * ToString( void ) const;

    // Returns the ID of the Factor.
    StorageType GetID( void ) const;

    // Returns whether or not the Factor is valid.
    bool Valid( void ) const;
    bool Invalid( void ) const;

    // Translate the content
    void Translate( const Dictionary::TranslationTable& );

    void toJson( Json::Value & dest ) const;
    void fromJson( const Json::Value & src );

    /* ----- Operators ----- */

    // The dictionary keeps track of what the sorted order of the strings is.
    // These methods are based on the lexicographical ordering of the strings
    // the factors represent
    bool operator ==( const <?php echo $className; ?> & ) const;
    bool operator !=( const <?php echo $className; ?> & ) const;
    bool operator <( const <?php echo $className; ?> & ) const;
    bool operator <=( const <?php echo $className; ?> & ) const;
    bool operator >( const <?php echo $className; ?> & ) const;
    bool operator >=( const <?php echo $className; ?> & ) const;

    // Implicit conversion to storage type
    operator StorageType () const;
};

// Static member initialization
const <?php echo $className; ?>::StorageType <?php echo $className; ?>::InvalidID = std::numeric_limits<StorageType>::max();
const <?php echo $className; ?>::StorageType <?php echo $className; ?>::MaxID = <?php echo $className; ?>::InvalidID - 1;
const char * <?php echo $className; ?>::DictionaryName = "<?php echo $dict; ?>";
const Dictionary & <?php echo $className; ?>::globalDictionary = Dictionary::GetDictionary(<?php echo $className; ?>::DictionaryName);

/* ----- Constructors ----- */

// Default constructor
inline
<?php echo $className; ?> :: <?php echo $className; ?>( void ):
    myID(InvalidID)
{}

inline
<?php echo $className; ?> :: <?php echo $className; ?>( const GrokitNull & nullval ):
    myID(InvalidID)
{ }

// Constructor from C strings / string literals
inline
<?php echo $className; ?> :: <?php echo $className; ?>( const char * str ) {
    FromString(str);
}

// Constructor from Grokit STRING type
inline
<?php echo $className; ?> :: <?php echo $className; ?>( const <?php echo $stringType; ?> & str ) {
    FromString(str.ToString());
}

// Constructor from storage type
inline
<?php echo $className; ?> :: <?php echo $className; ?>( const <?php echo $storageType; ?> id ):
    myID(id)
{ }

/* ----- Methods ----- */

inline
auto <?php echo $className; ?> :: GetID(void) const -> StorageType {
    return myID;
}

// Standard FromString method
inline
void <?php echo $className; ?> :: FromString( const char * str ) {
    // Global dictionary will return InvalidID if not found
    myID = globalDictionary.Lookup(str, InvalidID );
}

// FromString method used when building the dictionaries
inline
void <?php echo $className; ?> :: FromString( const char * str, Dictionary & localDict ) {
    // First check if we are in the local dictionary
    myID = localDict.Lookup(str, InvalidID );
    if( myID != InvalidID )
        return;

    // Next check if we are in the global dictionary
    myID = globalDictionary.Lookup(str, InvalidID );
    if( myID != InvalidID )
        return;

    // Add a new entry to the local dictionary.
    // The dictionary should throw an error if the new ID is greater than
    // MaxID.
    myID = localDict.Insert( str, MaxID );
}

// Looks up the factor in the global dictionary and returns the string
inline
const char * <?php echo $className; ?> :: ToString( void ) const {
    return globalDictionary.Dereference(myID);
}

// Determine whether or not the factor is valid
inline
bool <?php echo $className; ?> :: Valid( void ) const {
    return myID != InvalidID;
}

inline
bool <?php echo $className; ?> :: Invalid(void) const {
    return myID == InvalidID;
}

// Translate the content
inline
void <?php echo $className; ?> :: Translate( const Dictionary::TranslationTable & tbl ) {
    auto it = tbl.find(myID);
    if( it != tbl.end() ) {
        myID = it->second;
    }
}

inline
void <?php echo $className; ?> :: toJson( Json::Value & dest ) const {
    dest = (Json::Int64) myID;
}

inline
void <?php echo $className; ?> :: fromJson( const Json::Value & src ) {
    myID = (StorageType) src.asInt64();
}

/* ----- Operators ----- */

inline
bool <?php echo $className; ?> :: operator ==( const <?php echo $className; ?> & o ) const {
    return myID == o.myID;
}

inline
bool <?php echo $className; ?> :: operator !=( const <?php echo $className; ?> & o ) const {
    return myID != o.myID;
}

inline
bool <?php echo $className; ?> :: operator <( const <?php echo $className; ?> & o ) const {
    return Valid() && o.Valid() && globalDictionary.Compare(myID, o.myID) < 0;
}

inline
bool <?php echo $className; ?> :: operator <=( const <?php echo $className; ?> & o ) const {
    return Valid() && o.Valid() && globalDictionary.Compare(myID, o.myID) <= 0;
}

inline
bool <?php echo $className; ?> :: operator >( const <?php echo $className; ?> & o ) const {
    return Valid() && o.Valid() && globalDictionary.Compare(myID, o.myID) > 0;
}

inline
bool <?php echo $className; ?> :: operator >=( const <?php echo $className; ?> & o ) const {
    return Valid() && o.Valid() && globalDictionary.Compare(myID, o.myID) >= 0;
}

// Implicit conversion to storage type
inline
<?php echo $className; ?> :: operator StorageType() const {
    return myID;
}

<?php
    // Global functions: everything emitted until ob_get_clean() is captured
    // and returned as the type's global content.
    ob_start();
?>

inline
void FromString( @type & f, const char * str ) {
    f.FromString(str);
}

inline
void FromString( @type & f, const char * str, Dictionary & localDict ) {
    f.FromString(str, localDict);
}

inline
int ToString( const @type & f, char * buffer ) {
    const char * str = f.ToString();
    strcpy(buffer, str);
    return strlen(buffer) + 1;
}

template<>
inline
uint64_t Hash( const @type & x ) {
    return x.GetID();
}

inline
void ToJson( const @type & src, Json::Value & dest ) {
    src.toJson(dest);
}

inline
void FromJson( const Json::Value & src, @type & dest ) {
    dest.fromJson(src);
}

inline
bool IsNull( const @type f ) {
    return f.Invalid();
}

<?php
    $globalContent .= ob_get_clean();

    // Function to get the dictionary at runtime: when invoked it emits code
    // that copies every entry of the type's global dictionary into
    // $var["levels"].
    $describeInfoJson = function ($var, $myType) {
?>
    <?php echo $var; ?>["levels"] = Json::Value(Json::arrayValue);
    for( auto it = <?php echo $myType; ?>::globalDictionary.cbegin(); it != <?php echo $myType; ?>::globalDictionary.cend(); it++ ) {
        <?php echo $var; ?>["levels"][it->first] = it->second;
    }
<?php
    };

    return [
        'kind'             => 'TYPE',
        'name'             => $className,
        'dictionary'       => $dict,
        'system_headers'   => ['limits', 'cstring', 'cinttypes'],
        'user_headers'     => ['Dictionary.h', 'DictionaryManager.h', 'ColumnIteratorDict.h'],
        'properties'       => ['categorical'],
        'extras'           => ['cardinality' => $cardinality, 'size.bytes' => $storageBytes],
        'binary_operators' => ['==', '!=', '<', '>', '<=', '>='],
        'global_content'   => $globalContent,
        'complex'          => 'ColumnIteratorDict< @type >',
        'methods'          => $methods,
        'constructors'     => $constructors,
        'functions'        => $functions,
        'describe_json'    => DescribeJson('factor', $describeInfoJson),
    ];
}
/**
 * Builds the description of a datatype from the attribute array returned by
 * its generator and registers everything the type brings with it.
 *
 * Attributes of $args that are honoured when present:
 *  - 'complex':          iterator template ('@type' is replaced by the type's
 *                        value); false or absent keeps ColumnIterator<value>.
 *  - 'destroy':          boolean, otherwise an error is raised.
 *  - 'dictionary', 'fixed_size', 'describe_json': stored as given.
 *  - 'unary_operators', 'binary_operators': registered through
 *                        LibraryManager::AlphaOperator with arity 1 / 2.
 *  - 'constructors':     entries of (args, deterministic = true, c_name).
 *  - 'functions':        entries of (name, args, result,
 *                        deterministic = false, global = false, c_name = null).
 *  - 'methods':          entries of (name, args, result, deterministic = false).
 *
 * @param mixed $hash     passed through to the parent constructor
 * @param mixed $name     passed through to the parent constructor
 * @param mixed $value    passed through to the parent constructor; also used
 *                        to build the default iterator type
 * @param array $args     attributes of the type (see above)
 * @param array $origArgs only element 0 is used; it is passed to the parent
 */
public function __construct($hash, $name, $value, array $args, array $origArgs)
{
    parent::__construct(InfoKind::T_TYPE, $hash, $name, $value, $args, $origArgs[0]);

    $iterator = 'ColumnIterator<' . $value . '>';
    if (array_key_exists('complex', $args)) {
        $iter = $args['complex'];
        if ($iter !== false) {
            $iterator = str_replace('@type', $this->value(), $iter);
        }
    }
    $this->iterator = $iterator;

    if (array_key_exists('destroy', $args)) {
        $destroy = $args['destroy'];
        if (is_bool($destroy)) {
            $this->destroy = $destroy;
        } else {
            grokit_error("Expected boolean value for 'destroy' attribute of datatype " . $this->value());
        }
    }

    if (array_key_exists('dictionary', $args)) {
        $this->dictionary = $args['dictionary'];
    }

    if (array_key_exists('fixed_size', $args)) {
        $this->fixedSize = $args['fixed_size'];
    }

    // Deal with operators defined by the type.
    if (array_key_exists('unary_operators', $args)) {
        foreach ($args['unary_operators'] as $op) {
            LibraryManager::AlphaOperator($op, 1, $this);
        }
    }

    if (array_key_exists('binary_operators', $args)) {
        foreach ($args['binary_operators'] as $op) {
            LibraryManager::AlphaOperator($op, 2, $this);
        }
    }

    if (array_key_exists('constructors', $args)) {
        $myType = $this->value();
        // end() takes its argument by reference, so the exploded parts have
        // to be held in a variable first.
        $nameParts = explode('::', $myType);
        $myName = end($nameParts);

        foreach ($args['constructors'] as $cstr) {
            $cArgs = get_first_key($cstr, ['args', 0]);
            $cDet = get_first_key_default($cstr, ['deterministic', 1], true);
            $cName = get_first_key_default($cstr, ['c_name', 2], $myName);

            $callback = function ($args, $t_args = []) use ($cArgs, $cDet, $myType, $cName) {
                $myArgs = array_map('lookupType', $cArgs);
                $myRet = lookupType($myType);
                $retVal = [
                    'kind'          => 'FUNCTION',
                    'input'         => $myArgs,
                    'result'        => $myRet,
                    'deterministic' => $cDet,
                ];
                if ($cName !== null) {
                    $retVal['name'] = $cName;
                }

                return $retVal;
            };

            declareFunctionGlobal('', $myType, $cArgs, $callback);
        }
    }

    if (array_key_exists('functions', $args)) {
        foreach ($args['functions'] as $fnct) {
            $fName = get_first_key($fnct, ['name', 0]);
            $fArgs = get_first_key($fnct, ['args', 1]);
            $fRet = get_first_key($fnct, ['result', 2]);
            $fDet = get_first_key_default($fnct, ['deterministic', 3], false);
            $fGlobal = get_first_key_default($fnct, ['global', 4], false);
            $cName = get_first_key_default($fnct, ['c_name', 5], null);

            foreach ($fArgs as &$arg) {
                $arg = str_replace('@type', $this->value(), $arg);
            }
            // Break the reference so the last element is not left aliased to
            // $arg once the array has been captured by the closure below.
            unset($arg);

            $fRet = str_replace('@type', $this->value(), $fRet);

            $callback = function ($args, $t_args = []) use ($fArgs, $fRet, $fDet, $cName, $fGlobal) {
                $myArgs = array_map('lookupType', $fArgs);
                $myRet = lookupType($fRet);
                $retVal = [
                    'kind'          => 'FUNCTION',
                    'input'         => $myArgs,
                    'result'        => $myRet,
                    'deterministic' => $fDet,
                    'global'        => $fGlobal,
                ];
                if ($cName !== null) {
                    $retVal['name'] = $cName;
                }

                return $retVal;
            };

            // Global functions are declared in BASE, the others in the
            // namespace of the type itself.
            $fNS = $fGlobal ? 'BASE' : LibraryManager::ExtractNamespace($this->value());
            $fName = LibraryManager::JoinNamespace($fNS, $fName);

            declareFunctionGlobal('', $fName, $fArgs, $callback);
        }
    }

    if (array_key_exists('methods', $args)) {
        foreach ($args['methods'] as $method) {
            $mName = get_first_key($method, ['name', 0]);
            $mArgs = get_first_key($method, ['args', 1]);
            $mRet = get_first_key($method, ['result', 2]);
            $mDet = get_first_key_default($method, ['deterministic', 3], false);

            if ($mRet == '@type') {
                $mRet = $this->value();
            }

            foreach ($mArgs as &$mArg) {
                if ($mArg == '@type') {
                    $mArg = $this->value();
                }
            }
            // Break the reference before the array is handed to MethodInfo.
            unset($mArg);

            $info = new MethodInfo($mName, $mArgs, $mRet, $mDet);

            if (!array_key_exists($mName, $this->methods)) {
                $this->methods[$mName] = [$info];
            } else {
                // Overloads are allowed, exact duplicates are not.
                foreach ($this->methods[$mName] as $cm) {
                    if ($info == $cm) {
                        grokit_error('Attempting to multiply define method ' . $this->value() . '->' . $mName . '(' . implode(', ', $mArgs) . ')');
                    }
                }

                $this->methods[$mName][] = $info;
            }
        }
    }

    if (array_key_exists('describe_json', $args)) {
        $this->describers['json'] = $args['describe_json'];
    }
}
/**
 * Generates a C++ reader class (kind 'GI') that parses delimited text into
 * tuples, one line per tuple.
 *
 * The C++ source is written to the output as a side effect; the returned
 * array describes the generated class.
 *
 * Recognised template arguments ($t_args):
 *  - 'output':            list of types; required when $output is empty. The
 *                         attributes are then named val_1, val_2, ...
 *  - 'debug':             integer, default 0. At >= 1 tokenizer failures are
 *                         caught and reported with file name and line; at >= 2
 *                         nullability of each output is written to STDERR.
 *  - 'simple':            default false. When true the line is split on the
 *                         delimiter only; 'quote' and 'escape' are rejected.
 *  - 'trim.cr':           default false. When true a trailing '\r' is removed
 *                         from each line.
 *  - 'sep' / 'separator': single character, or 'tab' (case-insensitive).
 *                         Default ','.
 *  - 'quote':             single character, default '"'.
 *  - 'escape':            single character, default '\'.
 *  - 'skip':              non-negative int, number of header lines to skip.
 *  - 'n':                 int, maximum number of lines to read (default -1).
 *  - 'nullable':          true (all attributes, null string 'NULL'), a string
 *                         (all attributes, that null string), false (none), or
 *                         an array naming individual attributes.
 *
 * NOTE(review): the generated code does not check that a line holds as many
 * fields as there are outputs; confirm inputs are well-formed or add a check.
 *
 * @param array $t_args Template arguments, see above.
 * @param array $output Output attributes (name => type); may be empty when
 *                      't_args["output"]' is supplied instead.
 *
 * @return array Keys 'name', 'kind', 'output', 'system_headers' and
 *               'user_headers' describing the generated class.
 */
function CSVReader(array $t_args, array $output)
{
    $my_output = [];

    // Handle case where outputs are given as template arguments
    // and not implied.
    if (\count($output) == 0) {
        grokit_assert(array_key_exists('output', $t_args), 'Did not receive any description of my output!');

        $output_list = $t_args['output'];
        grokit_assert(is_array($output_list), 'Expected list of types for template argument "output"');

        $i = 1;
        foreach ($output_list as $name => $out_type) {
            grokit_assert(is_datatype($out_type) || is_identifier($out_type), 'Expected only types in the "output" list');

            if (is_identifier($out_type)) {
                $out_type = lookupType($out_type->value());
            }

            // Keys of the list are ignored; attributes are numbered from 1.
            $name = 'val_' . $i;
            $my_output[$name] = $out_type;
            $i += 1;
        }
    } else {
        foreach ($output as $key => $out) {
            $name = $key;
            $my_output[$name] = $out;
        }
    }

    $debug = get_default($t_args, 'debug', 0);
    $simple = get_default($t_args, 'simple', false);
    $trimCR = get_default($t_args, 'trim.cr', false);

    // Handle separator
    $separator = ',';
    if (array_key_exists('sep', $t_args) || array_key_exists('separator', $t_args)) {
        $sep = get_first_key($t_args, ['sep', 'separator']);
        grokit_assert(is_string($sep), "Got " . gettype($sep) . " instead of string for separator.");

        if (strtolower($sep) === 'tab') {
            // Two characters (backslash, t): becomes the C++ escape '\t'.
            $sep = '\\t';
        }

        grokit_assert($sep != "\n", 'CSV column delimiter cannot be new line');

        // Scream if separator is longer than one character
        grokit_assert(\strlen($sep) == 1 || $sep == '\\t', 'Expected string of length 1 for separator, got string <' . $sep . '> instead');

        $separator = $sep;
    }

    // Handle quote character
    $quotechar = '"';
    if (array_key_exists('quote', $t_args) && !is_null($t_args['quote'])) {
        grokit_assert(!$simple, 'Quote option not available for simple CSVReader');
        $quote = $t_args['quote'];
        grokit_assert(is_string($quote), "Got " . gettype($quote) . " instead of string for quote.");

        // Scream if quote is longer than one character
        grokit_assert(\strlen($quote) == 1, 'Expected string of length 1 for quote character, got string <' . $quote . '> instead');

        $quotechar = $quote;
    }
    // Escape backslash and single quote so the value fits a C++ char literal.
    $quotechar = addcslashes($quotechar, '\\\'');

    // Handle escape character
    $escapeChar = '\\';
    if (array_key_exists('escape', $t_args) && !is_null($t_args['escape'])) {
        grokit_assert(!$simple, 'Escape option not available for simple CSVReader');
        $escape = $t_args['escape'];
        grokit_assert(is_string($escape), 'Got ' . gettype($escape) . ' instead of string for escape character.');
        grokit_assert(\strlen($escape) == 1, 'Expected string of length 1 for escape character, got string <' . $escape . '> instead');

        $escapeChar = $escape;
    }
    $escapeChar = addcslashes($escapeChar, '\\\'');

    // Handle header lines
    $headerLines = 0;
    if (array_key_exists('skip', $t_args)) {
        $headerLines = $t_args['skip'];
        grokit_assert(is_int($headerLines), 'Got ' . gettype($headerLines) . ' instead of int for number of lines to skip.');
        grokit_assert($headerLines >= 0, 'Cannot skip a negative number of lines.');
    }

    // Maximum number of lines to read
    $maxLines = get_default($t_args, 'n', -1);
    grokit_assert(is_int($maxLines), 'Got ' . gettype($maxLines) . ' instead of int for template argument "n"');

    // Work out which attributes are nullable and which string marks a null.
    $nullArg = get_first_key_default($t_args, ['nullable'], false);

    $nullable = [];
    $nullStr = [];
    foreach ($my_output as $name => $type) {
        $nullable[$name] = false;
    }

    if ($nullArg === true) {
        foreach ($my_output as $name => $type) {
            $nullable[$name] = true;
            $nullStr[$name] = 'NULL';
        }
    } elseif (is_array($nullArg)) {
        foreach ($nullArg as $n => $v) {
            // If nullable value is an associative mapping, the value is either true/false
            // or the value of the null string
            if (is_string($n)) {
                grokit_assert(is_string($v) || is_bool($v), 'CSVReader: nullable associative mapping must have string or boolean values');
                grokit_assert(array_key_exists($n, $nullable), 'CSVReader: cannot make unknown attribute ' . $n . ' nullable');

                if (is_bool($v)) {
                    $nullable[$n] = $v;
                    $nullStr[$n] = 'NULL';
                } else {
                    $nullable[$n] = true;
                    $nullStr[$n] = $v;
                }
            } elseif (is_array($v)) {
                grokit_assert(array_key_exists('attr', $v), 'CSVReader: Name of nullable attribute not specified');
                $attrName = $v['attr']->name();

                $nullable[$attrName] = true;
                $nullStr[$attrName] = array_key_exists('null', $v) ? $v['null'] : 'NULL';
            } else {
                // Otherwise, it's just nullable
                $attrName = $v->name();
                // The message is built even when the assertion holds, so use
                // the plain attribute name instead of converting $v itself.
                grokit_assert(array_key_exists($attrName, $nullable), 'CSVReader: cannot make unknown attribute ' . $attrName . ' nullable');

                $nullable[$attrName] = true;
                $nullStr[$attrName] = 'NULL';
            }
        }
    } elseif ($nullArg === false) {
        // Nothing
    } elseif (is_string($nullArg)) {
        foreach ($my_output as $name => $type) {
            $nullable[$name] = true;
            $nullStr[$name] = $nullArg;
        }
    } else {
        grokit_error('Template argument "nullable" must be boolean or array, ' . gettype($nullArg) . ' given');
    }

    // Come up with a name for ourselves
    $className = generate_name('CSVReader');

    if ($debug >= 2) {
        foreach ($my_output as $name => $type) {
            fwrite(STDERR, "CSVReader: {$name} is nullable: " . ($nullable[$name] ? 'true' : 'false') . PHP_EOL);
        }
    }

    // NOTE(review): fromStringNullable is called without echo in the
    // tokenizer branch below but with echo in the simple branch; left as
    // found - confirm which form the helper expects.
?>

class <?php echo $className; ?> {
    std::istream& my_stream;
    std::string fileName;

    // Template parameters
    static constexpr size_t MAX_LINES = <?php echo $maxLines; ?>;
    static constexpr size_t HEADER_LINES = <?php echo $headerLines; ?>;
    static constexpr char DELIMITER = '<?php echo $separator; ?>';
<?php if (!$simple) { ?>
    static constexpr char QUOTE_CHAR = '<?php echo $quotechar; ?>';
    static constexpr char ESCAPE_CHAR = '<?php echo $escapeChar; ?>';

    typedef boost::escaped_list_separator<char> separator;
    typedef boost::tokenizer< separator > Tokenizer;
    separator my_separator;
    Tokenizer my_tokenizer;
<?php } ?>

    // Prevent having to allocate this every time.
    std::string line;
    std::vector<std::string> tokens;

    size_t count;

    <?php \grokit\declareDictionaries($my_output); ?>


public:

    <?php echo $className; ?> ( GIStreamProxy& _stream ) :
        my_stream(_stream.get_stream())
        , fileName(_stream.get_file_name())
<?php if (!$simple) { ?>
        , my_separator(ESCAPE_CHAR, DELIMITER, QUOTE_CHAR)
        , my_tokenizer(std::string(""))
<?php } ?>
        , count(0)
    {
<?php if ($headerLines > 0) { ?>
        for( size_t i = 0; i < HEADER_LINES; ++i ) {
            FATALIF( !getline( my_stream, line ), "CSV Reader reached end of file before finishing header.\n" );
        }
<?php } // If headerLines > 0 ?>
    } // >

    bool ProduceTuple( <?php echo typed_ref_args($my_output); ?> ) {
        if (count < MAX_LINES) { //>
            count++;
        } else {
            return false;
        }

        if( getline( my_stream, line ) ) {
<?php if ($trimCR) { ?>
            // back() on an empty string is undefined, so check emptiness first.
            if( !line.empty() && line.back() == '\r' ) {
                line.pop_back();
            }
<?php } // if trimCR

    if (!$simple) {
        if ($debug >= 1) { ?>
            try {
<?php   } // if debug >= 1 ?>
            my_tokenizer.assign( line, my_separator );
<?php   if ($debug >= 1) { ?>
            } catch(...) {
                FATAL("CSVReader for file %s failed on line: %s", fileName.c_str(), line.c_str());
            }
<?php   } // if debug >= 1 ?>
            Tokenizer::iterator it = my_tokenizer.begin();

<?php
        foreach ($my_output as $name => $type) {
            if ($nullable[$name]) { // nullable ?>
            <?php \grokit\fromStringNullable($name, $type, 'it->c_str()', true, $nullStr[$name]); ?>

<?php       } else { // not nullable ?>
            <?php echo \grokit\fromStringDict($name, $type, 'it->c_str()'); ?>;
<?php       } // end nullable check ?>
            ++it;
<?php   } // foreach output
    } else { ?>
            for( char & c : line ) {
                if( c == DELIMITER )
                    c = '\0';
            }

            const char * ptr = line.c_str();
<?php
        $first = true;
        foreach ($my_output as $name => $type) {
            if ($first) {
                $first = false;
            } else { ?>
            while( *(ptr++) != '\0' )
                ; // Advance past next delimiter
<?php       } // not first output

            if ($nullable[$name]) { ?>
            <?php echo \grokit\fromStringNullable($name, $type, 'ptr', true, $nullStr[$name]); ?>

<?php       } else { // not nullable ?>
            <?php echo \grokit\fromStringDict($name, $type, 'ptr'); ?>;
<?php       } // if nullable
        } // foreach output
    } // if simple reader ?>

            return true;
        } else {
            return false;
        }
    }

    <?php \grokit\declareDictionaryGetters($my_output); ?>

};

<?php
    $sys_headers = ['vector', 'string', 'iostream', 'cstdint'];
    if (!$simple) {
        $sys_headers[] = 'boost/tokenizer.hpp';
    }

    return [
        'name' => $className,
        'kind' => 'GI',
        'output' => $my_output,
        'system_headers' => $sys_headers,
        'user_headers' => ['GIStreamInfo.h', 'Dictionary.h', 'DictionaryManager.h'],
    ];
}