/**
 * @brief Collects the select-list nodes that belong to a given table name or alias
 *
 * @param sqlSelect   either a parse tree carrying a 'SELECT' key or a plain list of nodes (e.g. a sub_tree)
 * @param tblDb       database name; only handed on to the recursive calls
 * @param tblName     table name that column references are matched against
 * @param tblAlias    table alias that column references are matched against (also compared as "db.table")
 * @param returnArray (by reference) receives the nodes that were attributed to the table
 * @param tableList   (by reference) the sorted table list; entries are read through their 'node' key
 * @param recLevel    current level in the table list
 * @param startBranch true on the initial call, false while descending into a sub_tree
 * @return number of column references that point to a table other than the given one and
 *         that are not matched by a table list entry at key >= recLevel (0 if there are none)
 *
 * Column references (and, on the initial call, '*') that match the table are appended to
 * returnArray; reserved keywords are always appended. Nodes that are neither column
 * references, operators nor reserved words (functions, expressions, ...) are inspected
 * recursively and, on the initial call, appended as a whole once none of their columns
 * refers to a different table. If some do, only the matching columns of that node are
 * appended, each with a freshly generated alias.
 */
function PHPSQLcollectColumns($sqlSelect, $tblDb, $tblName, $tblAlias, &$returnArray, &$tableList, $recLevel, $startBranch = true)
{
    $foreignCount = 0;
    $nodes = array_key_exists('SELECT', $sqlSelect) ? $sqlSelect['SELECT'] : $sqlSelect;

    foreach ($nodes as $node) {
        // ---- leaf-like nodes: column references, operators, reserved words ----
        if (isColref($node) || isOperator($node) || isReserved($node)) {
            if (isColref($node) || ($node['base_expr'] === '*' && $startBranch === true)) {
                // the column name is extracted but not used further; the call is kept so
                // this branch performs the same helper calls as before
                $refColumn = extractColumnName($node);
                $refTable = extractTableName($node);
                $refDb = extractDbName($node);

                $belongsToTable = $refTable === $tblAlias
                    || $refTable === false
                    || $refTable === $tblName
                    || $tblAlias === $refDb . "." . $refTable;

                if ($belongsToTable) {
                    array_push($returnArray, $node);
                    continue;
                }

                // check whether the referenced table matches the alias of a table list
                // entry at this level or beyond; only references without such a match
                // are counted as belonging to a different table
                $matchedInList = false;
                foreach ($tableList as $key => $listEntry) {
                    if ($key < $recLevel) {
                        continue;
                    }

                    if ($refTable === extractTableAlias($listEntry['node'])) {
                        $matchedInList = true;
                        break;
                    }
                }

                if ($matchedInList === false) {
                    $foreignCount += 1;
                }
            } elseif (isReserved($node)) {
                // reserved keywords are added to every select statement that is issued
                array_push($returnArray, $node);
            }

            continue;
        }

        // ---- composite nodes: functions, expressions, ... ----
        $subColumns = array();
        $subCount = false;

        if (hasSubtree($node)) {
            $subCount = PHPSQLcollectColumns($node['sub_tree'], $tblDb, $tblName, $tblAlias, $subColumns, $tableList, $recLevel, false);
            $foreignCount += $subCount;
        } elseif ($recLevel === 0) {
            // a node without subtree (e.g. a function without parameters like foo()) is
            // added at the outermost possible place, i.e. recLevel = 0
            $subCount = 0;
        }

        if ($startBranch === true && $subCount === 0) {
            if (!hasAlias($node)) {
                $node['alias'] = createAliasNode(array(buildEscapedString(array($node))));
            }

            // point the columns inside the subtree to the table of the next level
            if (hasSubtree($node) && count($tableList) > 0 && $recLevel !== max(array_keys($tableList))) {
                $nextTableNode = $tableList[$recLevel + 1]['node'];

                $targetName = extractTableAlias($nextTableNode);
                if ($targetName === false) {
                    $targetName = extractTableName($nextTableNode);
                }

                // NOTE(review): hasAlias() receives the table list entry itself, while the
                // extract*() helpers receive its 'node' - confirm that this is intended
                if (hasAlias($tableList[$recLevel])) {
                    $sourceName = extractTableAlias($tableList[$recLevel]['node']);
                } else {
                    $sourceName = extractTableName($tableList[$recLevel]['node']);
                }

                rewriteTableNameInSubqueries($node['sub_tree'], $targetName, $sourceName);

                if ($node['expr_type'] !== "aggregate_function") {
                    $node['base_expr'] = getBaseExpr($node);
                }
            }

            // a function/aggregate that is applied to a '*' "column" may only be added at
            // the outermost level (i.e. recLevel = 0) and not before
            $appliesToStar = false;
            foreach ($subColumns as $subColumn) {
                if ($subColumn['base_expr'] === "*" && $recLevel != 0) {
                    $appliesToStar = true;
                    break;
                }
            }

            if ($appliesToStar === false) {
                array_push($returnArray, $node);
            }
        }

        // the node depends on columns of other tables: add those of its columns that
        // belong to this table, unless they have been selected already
        if ($subCount !== false && $subCount > 0) {
            foreach ($subColumns as $subColumn) {
                $alreadySelected = false;

                foreach ($returnArray as $selectedColumn) {
                    if (columnIsEqual($selectedColumn, $subColumn)) {
                        $alreadySelected = true;
                        break;
                    }
                }

                if ($alreadySelected === false) {
                    $subColumn['alias'] = createAliasNode(array(implodeNoQuotes($subColumn['no_quotes'])));
                    array_push($returnArray, $subColumn);
                }
            }
        }
    }

    return $foreignCount;
}
/**
 * @brief Builds the select lists of the shard query and of the coordination query
 *
 * @param select        list of parsed select clause nodes; each node is read through the keys
 *                      'expr_type', 'base_expr', 'alias' => 'name' and 'sub_tree', plus the
 *                      marker keys 'group_clause', 'where_col' and 'order_clause'
 * @param recLevel      current recursion level; columns marked with 'where_col' are only
 *                      selected in the coordination query if this is >= 0
 * @param straight_join if true, STRAIGHT_JOIN is added to the shard query
 * @param whereSubquery if not false, columns marked with 'where_col' or 'order_clause' are
 *                      left out of the coordination query
 * @return array with the keys
 *         'error'         list of array('error_clause' => ..., 'error_reason' => ...)
 *         'shard_sql'     "SELECT ..." to be sent to each shard
 *         'coord_sql'     "SELECT ..." to be run on the coordination node
 *         'coord_odku'    list of "col=expr" assignments for an ON DUPLICATE KEY UPDATE clause
 *         'shard_group'   comma separated select positions (1-based) to group by on the shards
 *         'coord_group'   comma separated select positions (1-based) of the non-aggregate columns
 *         'group_aliases' comma separated aliases collected for the grouped columns
 *
 * Aggregates are split into a shard part and a coordinator part: MIN/MAX/SUM are re-applied
 * on the coordinator, COUNT is summed up, AVG is rebuilt from a per-shard SUM and COUNT and
 * the STDDEV/VARIANCE family from sum_of_squares()/partitAdd_sum_of_squares(). Aggregates
 * over DISTINCT ship the bare expression and aggregate on the coordinator only.
 *
 * The select lists are collected as parts and joined at the end, so a clause that is
 * skipped, rejected or left out of the coordination query never leaves a stray ','.
 */
function process_select($select, $recLevel, $straight_join = false, $whereSubquery = false)
{
    $error = array();
    $shard_parts = array();   #select list entries of the query sent to each shard
    $coord_parts = array();   #select list entries of the query run on the coordination node
    $group = array();         #list of positions which contain non-aggregate functions
    $push_group = array();    #this is necessary for non-distributable aggregate functions
    $group_aliases = array(); #list of group aliases which will be indexed on the aggregation temp table
    $coord_odku = array();
    $used_agg_func = 0;

    #aggregates that need a rewritten expression on the coordination side
    $special_aggregates = array('AVG', 'STDDEV', 'STD', 'STDDEV_POP', 'STDDEV_SAMP', 'VARIANCE', 'VAR_POP', 'VAR_SAMP', 'GROUP_CONCAT');
    #aggregates that can simply be merged on the coordination side
    $mergeable_aggregates = array('MIN', 'MAX', 'SUM', 'COUNT');

    //check for distinct keyword (should be the first in the list)
    $distinct = false;
    if (isset($select[0]) && isReserved($select[0]) && strtolower($select[0]['base_expr']) === "distinct") {
        $distinct = true;
        unset($select[0]);
        $select = array_values($select);
    }

    foreach ($select as $pos => $clause) {
        //nodes without an expression type contribute nothing to either query
        if (!isset($clause['expr_type'])) {
            continue;
        }

        $expr_type = $clause['expr_type'];
        $base_expr = $clause['base_expr'];
        $alias_name = isset($clause['alias']['name']) ? (string) $clause['alias']['name'] : '';

        if (!empty($alias_name)) {
            $alias = "`" . trim($alias_name, "`") . "`";
        } else {
            $alias = "`" . $base_expr . "`";
        }

        #the alias always starts with a backtick, so a '.' can never be found at offset 0
        if (strpos($alias, '.') !== false) {
            $tmp = explode('.', trim($alias, '`'));

            if (count($tmp) > 2) {
                #this is a more complicated expression, use the alias as it is (might be a formula or something);
                #without an alias name this yields "``", which is replaced for functions below
                $alias = "`" . substr(trim($alias_name, "`"), 0, 50) . "`";
            } else {
                $alias = "`" . $tmp[0] . "." . $tmp[1] . "`";
            }
        }

        //further escape any function name that might end up in the alias
        if ($expr_type === "aggregate_function" || $expr_type === "function" || $expr_type === "bracket_expression") {
            if (trim($alias, "`") === $base_expr || $alias === "``") {
                $alias = "`" . buildEscapedString(array($clause)) . "`";
            }
        }

        switch ($expr_type) {
            case 'aggregate_function':
                //if the base expression is only the function name, rebuild the arguments from the subtree
                if (strpos($base_expr, "(") === false) {
                    $base_expr = "";
                    if (!empty($clause['sub_tree'])) {
                        foreach ($clause['sub_tree'] as $node) {
                            $base_expr .= getBaseExpr($node);
                        }
                    }
                }

                #aggregates use the alias name cut to 50 characters; without an alias name
                #the escaped alias built above is used instead of an empty identifier
                $alias_source = ($alias_name !== '') ? $alias_name : $alias;
                $alias = "`" . substr(trim($alias_source, "`"), 0, 50) . "`";

                $function = strtoupper($clause['base_expr']);
                $is_special = in_array($function, $special_aggregates, true);

                if (!$is_special && !in_array($function, $mergeable_aggregates, true)) {
                    $error[] = array('error_clause' => $clause['base_expr'], 'error_reason' => 'Unsupported aggregate function');
                    break;
                }

                $used_agg_func = 1;
                $base_expr = trim($base_expr, ' ()');
                $base_expr = fix_trunc_parenth($base_expr);
                $expr_info = explode(" ", $base_expr);

                if (!empty($expr_info[0]) && strtolower($expr_info[0]) === 'distinct') {
                    #DISTINCT aggregates cannot be merged from partial results: ship the
                    #plain expression and aggregate on the coordination node only
                    if ($this->verbose) {
                        echo "Detected a {$function} [DISTINCT] expression!\n";
                    }

                    unset($expr_info[0]);
                    $new_expr = join(" ", $expr_info);

                    $shard_parts[] = "{$new_expr} AS {$alias}";
                    $coord_parts[] = "{$function}(distinct {$alias}) as {$alias}";
                    $push_group[] = $pos + 1;

                    if ($is_special) {
                        $group_aliases[] = $alias;
                    }
                    break;
                }

                switch ($function) {
                    #these are aggregates that dont need special treatment on the coordination side
                    case 'SUM':
                        $coord_odku[] = "{$alias}={$alias} + VALUES({$alias})";
                        $shard_parts[] = "{$function}({$base_expr}) AS {$alias}";
                        $coord_parts[] = "{$function}({$alias}) AS {$alias}";
                        break;

                    case 'MIN':
                        #keep the stored value if it is already the smaller one
                        $coord_odku[] = "{$alias}=IF({$alias} < VALUES({$alias}), {$alias}, VALUES({$alias}))";
                        $shard_parts[] = "{$function}({$base_expr}) AS {$alias}";
                        $coord_parts[] = "{$function}({$alias}) AS {$alias}";
                        break;

                    case 'MAX':
                        #keep the stored value if it is already the larger one
                        $coord_odku[] = "{$alias}=IF({$alias} > VALUES({$alias}), {$alias}, VALUES({$alias}))";
                        $shard_parts[] = "{$function}({$base_expr}) AS {$alias}";
                        $coord_parts[] = "{$function}({$alias}) AS {$alias}";
                        break;

                    case 'COUNT':
                        #partial counts are summed up on the coordination node
                        $shard_parts[] = "COUNT({$base_expr}) AS {$alias}";
                        $coord_parts[] = "SUM({$alias}) AS {$alias}";
                        $coord_odku[] = "{$alias}={$alias} + VALUES({$alias})";
                        break;

                    #special treatment needed
                    case 'AVG':
                        $alias = trim($alias, '`');
                        $shard_parts[] = " COUNT({$base_expr}) AS `cnt_{$alias}`, SUM({$base_expr}) AS `sum_{$alias}`";
                        $coord_parts[] = " (SUM(`sum_{$alias}`) / SUM(`cnt_{$alias}`)) AS `{$alias}`";
                        break;

                    case 'STDDEV':
                    case 'STD':
                    case 'STDDEV_POP':
                    case 'STDDEV_SAMP':
                    case 'VARIANCE':
                    case 'VAR_POP':
                    case 'VAR_SAMP':
                        $alias = trim($alias, '`');
                        $merged_ssqr = "partitAdd_sum_of_squares(`ssqr_{$alias}`, `avg_{$alias}`, `cnt_{$alias}`)";
                        $total_cnt = "SUM(`cnt_{$alias}`)";

                        $shard_parts[] = " sum_of_squares({$base_expr}) AS `ssqr_{$alias}`, AVG({$base_expr}) as `avg_{$alias}`, COUNT({$base_expr}) AS `cnt_{$alias}`";

                        #NOTE(review): the divisors are kept exactly as they were. MySQL defines
                        #the *_POP variants over N and the *_SAMP variants over N - 1, which looks
                        #swapped for STDDEV_POP/STDDEV_SAMP and off for VAR_POP here - confirm
                        #against what partitAdd_sum_of_squares() returns before changing it.
                        if ($function === 'STDDEV_SAMP') {
                            $coord_parts[] = " SQRT({$merged_ssqr} / {$total_cnt}) AS `{$alias}`";
                        } elseif ($function === 'VARIANCE' || $function === 'VAR_POP' || $function === 'VAR_SAMP') {
                            $coord_parts[] = " {$merged_ssqr} / ({$total_cnt} - 1) AS `{$alias}`";
                        } else {
                            $coord_parts[] = " SQRT({$merged_ssqr} / ({$total_cnt} - 1)) AS `{$alias}`";
                        }
                        break;

                    default:
                        #GROUP_CONCAT: re-applied as it is on the coordination node
                        $shard_parts[] = "{$function}({$base_expr}) AS {$alias}";
                        $coord_parts[] = "{$function}({$alias}) AS {$alias}";
                        break;
                }

                if ($is_special) {
                    #for AVG and the STDDEV/VARIANCE family the alias is stored without backticks here
                    $push_group[] = $pos + 1;
                    $group_aliases[] = $alias;
                }
                break;

            case 'expression':
            case 'operator':
            case 'const':
            case 'colref':
            case 'bracket_expression':
            case 'reserved':
            case 'function':
                $group[] = $pos + 1;
                $group_aliases[] = $alias;

                //if this is a function without arguments, add the parenthesis
                if ($expr_type === 'function' && strpos($base_expr, "(") === false) {
                    $base_expr = getBaseExpr($clause);
                }

                $shard_parts[] = $base_expr . ' AS ' . $alias;

                #the column is selected in the coordination query unless it is
                # - one of the excluded aggregation expressions or a temporary grouping column,
                # - a temporary column from WHERE at a negative recursion level, or
                # - a WHERE / ORDER BY helper column while a WHERE subquery is processed
                #TODO: CHECK IF THE GROUPING COLUMN EXCLUSION IS STILL NEEDED!
                $is_where_col = array_key_exists('where_col', $clause);

                $in_coord_query = strpos($alias, 'agr_stddev') === false
                    && !array_key_exists('group_clause', $clause)
                    && (!$is_where_col || $recLevel >= 0)
                    && ($whereSubquery === false || (!$is_where_col && !array_key_exists('order_clause', $clause)));

                if ($in_coord_query) {
                    $coord_parts[] = $alias;
                    $coord_odku[] = "{$alias}=VALUES({$alias})";
                }
                break;

            default:
                $error[] = array('error_clause' => $clause['base_expr'], 'error_reason' => 'Unsupported expression type (did you forget an alias on an aggregate expression?)');
                break;
        }
    }

    $select_prefix = $distinct ? "DISTINCT " : "";
    $shard_query = $select_prefix . join(',', $shard_parts); #Query to send to each shard
    $coord_query = $select_prefix . join(',', $coord_parts); #Query to send to the coordination node

    $sql = "SELECT ";
    if ($straight_join) {
        $sql .= "STRAIGHT_JOIN ";
    }
    $sql .= $shard_query;

    #merge pushed and provided group-by
    if ($used_agg_func) {
        #they shouldn't conflict, but array_unique ensures so
        $shard_group = array_unique(array_merge($group, $push_group));
    } else {
        #without aggregates nothing is grouped at all
        $group = array();
        $shard_group = array();
    }

    #we can't send pushed group by to the coord shard, so send the expression based
    return array(
        'error' => $error,
        'shard_sql' => $sql,
        'coord_odku' => $coord_odku,
        'coord_sql' => 'SELECT ' . $coord_query,
        'shard_group' => join(',', $shard_group),
        'coord_group' => join(',', $group),
        'group_aliases' => join(',', $group_aliases)
    );
}