Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ if(BUILD_TESTS)
add_cloudsql_test(transaction_coverage_tests tests/transaction_coverage_tests.cpp)
add_cloudsql_test(utils_coverage_tests tests/utils_coverage_tests.cpp)
add_cloudsql_test(bloom_filter_tests tests/bloom_filter_test.cpp)
add_cloudsql_test(hll_tests tests/hll_test.cpp)
add_cloudsql_test(cloudSQL_tests tests/cloudSQL_tests.cpp)
add_cloudsql_test(server_tests tests/server_tests.cpp)
add_cloudsql_test(statement_tests tests/statement_tests.cpp)
Expand Down
175 changes: 175 additions & 0 deletions include/common/hll.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
/**
* @file hll.hpp
* @brief HyperLogLog probabilistic cardinality estimator
*/

#pragma once

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

namespace cloudsql {
namespace common {

/**
* @brief HyperLogLog — memory-bounded NDV estimator
*
* Uses a fixed register array of 2048 bytes (~12KB total) regardless of
* cardinality. Provides probabilistic cardinality estimates with ~1.6%
* standard error for cardinalities >> kNumRegisters.
*
* Algorithm (Flajolet et al. HyperLogLog):
* - For each item, hash to 64 bits
* - Register index: BOTTOM kIndexBits (p=11 for m=2048)
* - Register value: count of trailing zeros in remaining upper bits + 1
* - Final cardinality: m * log2(m / sum(2^(-reg_i)))
*
* For small cardinalities (<< kNumRegisters), uses linear counting
* fallback to avoid HLL's systematic overestimation.
*/
class HyperLogLog {
public:
static constexpr size_t kNumRegisters = 2048; // 2^11 for 11-bit index
static constexpr double kPowBase = 2.0; // base for 2^(-reg) computation
static constexpr int kIndexBits = 11; // bits used for register index

// Linear counting fallback: when nonzero registers < m / kLinearCountingThreshold,
// raw HLL formula overestimates severely (e.g., 3 distinct values → 23k estimate).
// Linear counting: E[n] ≈ -m * log(V/m) where V = empty fraction.
static constexpr double kLinearCountingThreshold = 20.0;

// Bias correction: when raw_estimate <= kBiasCorrectionBoundary * m, apply correction.
// Empirical testing shows HLL systematically overestimates for small cardinalities.
static constexpr double kBiasCorrectionBoundary = 2.5;

// Bias adjustment: bias = -0.5 * (m / kBiasAdjustmentFactor).
static constexpr double kBiasAdjustmentFactor = 10.0;

/**
* @brief Construct with optional seed for reproducible hashing
*/
explicit HyperLogLog(int seed = 0) noexcept : seed_(seed), registers_({}) {}

/**
* @brief Insert a pre-hashed 64-bit value
*/
void insert(uint64_t hash) noexcept {
hash ^= static_cast<uint64_t>(seed_);

// Register index from BOTTOM kIndexBits of hash
int idx = static_cast<int>(hash & (kNumRegisters - 1));

// Count trailing zeros in the UPPER bits (after index bits)
// These are the bits from position kIndexBits to 63
uint64_t remaining = hash >> kIndexBits;
int zeros = count_trailing_zeros(remaining) + 1;

// Clamp to uint8_t max
uint8_t new_val = static_cast<uint8_t>(std::min(zeros, 255));
registers_.at(idx) = std::max(registers_.at(idx), new_val);
}

/**
* @brief Estimate cardinality using HyperLogLog formula
*/
[[nodiscard]] uint64_t cardinality() const noexcept {
double sum = 0.0;
int nonzero_count = 0;
for (uint8_t reg : registers_) {
if (reg != 0) {
++nonzero_count;
sum += std::pow(kPowBase, -static_cast<double>(reg));
}
}

// Empty HLL → cardinality 0
if (nonzero_count == 0) {
return 0;
}

double m = static_cast<double>(kNumRegisters);
int empty_count = static_cast<int>(m) - nonzero_count;

// For sparse data (few registers used), use linear counting to avoid
// HLL's extreme overestimation. When registers are sparse (nonzero <
// m/kLinearCountingThreshold), the HLL raw formula gives wildly incorrect results.
if (nonzero_count < static_cast<int>(m / kLinearCountingThreshold)) {
// Linear counting: E[n] ≈ -m * log(V/m) where V = empty fraction
double linear_est = -m * std::log2(static_cast<double>(empty_count) / m);
return static_cast<uint64_t>(std::max(1.0, linear_est));
}

// Standard HLL formula for moderate to large cardinalities
double raw_estimate = m * std::log2(m / sum);

// Bias correction for small cardinalities
double bias = 0.0;
if (raw_estimate <= kBiasCorrectionBoundary * m) {
bias = -0.5 * (m / kBiasAdjustmentFactor);
}

double estimate = raw_estimate + bias;

if (estimate < 0) {
return 0;
}
if (estimate > static_cast<double>(kMaxCardinality)) {
return kMaxCardinality;
}
return static_cast<uint64_t>(estimate);
Comment on lines +95 to +121
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Use the actual HLL estimators here.

Both estimates are using log2(...), but standard HLL needs linear counting with the natural log for the sparse case and alpha_m * m * m / sum for the raw estimate. As written, this will store materially biased NDV stats for ANALYZE TABLE.

Suggested fix
-        if (nonzero_count < static_cast<int>(m / 20)) {
-            // Linear counting: E[n] ≈ -m * log(V/m) where V = empty fraction
-            // Using simple form without alpha scaling for very sparse data
-            double linear_est = -m * std::log2(static_cast<double>(empty_count) / m);
-            return static_cast<uint64_t>(std::max(1.0, linear_est));
-        }
-
-        // Standard HLL formula for moderate to large cardinalities
-        double raw_estimate = m * std::log2(m / sum);
+        const double alpha_m = 0.7213 / (1.0 + 1.079 / m);
+        double raw_estimate = alpha_m * m * m / sum;
+        if (empty_count > 0 && raw_estimate <= 2.5 * m) {
+            const double linear_est = m * std::log(m / static_cast<double>(empty_count));
+            return static_cast<uint64_t>(std::llround(linear_est));
+        }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// For sparse data (few registers used), use linear counting to avoid
// HLL's extreme overestimation. When registers are sparse (nonzero < m/20),
// the HLL raw formula gives wildly incorrect results.
if (nonzero_count < static_cast<int>(m / 20)) {
// Linear counting: E[n] ≈ -m * log(V/m) where V = empty fraction
// Using simple form without alpha scaling for very sparse data
double linear_est = -m * std::log2(static_cast<double>(empty_count) / m);
return static_cast<uint64_t>(std::max(1.0, linear_est));
}
// Standard HLL formula for moderate to large cardinalities
double raw_estimate = m * std::log2(m / sum);
// Bias correction for small cardinalities
// HLL systematically overestimates for small n; apply downward bias
double bias = 0.0;
if (raw_estimate <= 2.5 * m) {
bias = -0.5 * (m / 10.0);
}
double estimate = raw_estimate + bias;
if (estimate < 0) {
return 0;
}
if (estimate > static_cast<double>(kMaxCardinality)) {
return kMaxCardinality;
}
return static_cast<uint64_t>(estimate);
const double alpha_m = 0.7213 / (1.0 + 1.079 / m);
double raw_estimate = alpha_m * m * m / sum;
if (empty_count > 0 && raw_estimate <= 2.5 * m) {
const double linear_est = m * std::log(m / static_cast<double>(empty_count));
return static_cast<uint64_t>(std::llround(linear_est));
}
// Bias correction for small cardinalities
// HLL systematically overestimates for small n; apply downward bias
double bias = 0.0;
if (raw_estimate <= 2.5 * m) {
bias = -0.5 * (m / 10.0);
}
double estimate = raw_estimate + bias;
if (estimate < 0) {
return 0;
}
if (estimate > static_cast<double>(kMaxCardinality)) {
return kMaxCardinality;
}
return static_cast<uint64_t>(estimate);
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@include/common/hll.hpp` around lines 83 - 111, The sparse-case and raw HLL
formulas are wrong (they use log2 and lack alpha scaling); replace the linear
counting and raw estimate calculations so linear counting uses the natural log
(std::log) with V = empty_count and E = -m *
std::log(static_cast<double>(empty_count) / m), and replace raw_estimate with
the HLL estimator raw_estimate = alpha_m * m * m / sum where alpha_m is chosen
per m (compute alpha_m using the standard small-m constants or formula for
get_alpha(m)); keep existing bounds handling for estimate, casting and use
kMaxCardinality as before (refer to nonzero_count, m, empty_count, sum,
raw_estimate, estimate, kMaxCardinality).

}

/**
* @brief Reset all registers to zero
*/
void reset() noexcept { registers_.fill(0); }

/**
* @brief Merge another HLL into this one (element-wise max of registers)
*/
void merge(const HyperLogLog& other) noexcept {
for (size_t i = 0; i < kNumRegisters; ++i) {
registers_.at(i) = std::max(registers_.at(i), other.registers_.at(i));
}
Comment on lines +132 to +135
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Block merges across different seeds.

merge() currently combines sketches even when seed_ differs. That silently unions registers from different hash domains, so the resulting cardinality is meaningless.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@include/common/hll.hpp` around lines 122 - 125, merge() silently combines
HyperLogLog sketches with different hash seeds which corrupts results; change
merge(const HyperLogLog& other) noexcept to remove noexcept and add a check that
other.seed_ == seed_, and if not, throw a descriptive std::invalid_argument (or
std::runtime_error) mentioning mismatched seeds; if seeds match, proceed with
the existing loop over kNumRegisters updating registers_.at(i). Ensure you
reference and update the function signature and any callers to handle the
exception.

}

/**
* @brief Hash a byte buffer to uint64_t (FNV-1a hash)
*
* FNV-1a is used instead of djb2 because djb2 doesn't distribute
* upper bits well for strings with common prefixes.
*/
[[nodiscard]] static uint64_t hash_bytes(const void* data, size_t len) noexcept {
static constexpr uint64_t kFnvOffsetBasis = 14695981039346656037ULL;
static constexpr uint64_t kFnvPrime = 1099511628211ULL;

const uint8_t* bytes = static_cast<const uint8_t*>(data);
uint64_t hash = kFnvOffsetBasis;
for (size_t i = 0; i < len; ++i) {
hash ^= bytes[i];
hash *= kFnvPrime;
}
return hash;
}

private:
static constexpr uint64_t kMaxCardinality = UINT64_MAX;

std::array<uint8_t, kNumRegisters> registers_;
int seed_;

/**
* @brief Count trailing zero bits in a 64-bit value
*/
[[nodiscard]] static int count_trailing_zeros(uint64_t v) noexcept {
if (v == 0) {
return 64;
}
return __builtin_ctzll(v);
}
};

} // namespace common
} // namespace cloudsql
24 changes: 14 additions & 10 deletions src/executor/query_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "catalog/catalog.hpp"
#include "common/cluster_manager.hpp"
#include "common/hll.hpp"
#include "common/value.hpp"
#include "distributed/raft_group.hpp"
#include "distributed/raft_manager.hpp"
Expand Down Expand Up @@ -995,7 +996,7 @@ QueryResult QueryExecutor::execute_analyze(const parser::AnalyzeStatement& stmt)

// Collect per-column stats by scanning the table (single pass)
std::vector<ColumnInfo> col_stats(table_meta->columns.size());
std::vector<std::unordered_set<std::string>> ndv_sets(table_meta->columns.size());
std::vector<common::HyperLogLog> ndv_estimators(table_meta->columns.size());

auto iter = table.scan();
Tuple tuple;
Expand All @@ -1010,17 +1011,20 @@ QueryResult QueryExecutor::execute_analyze(const parser::AnalyzeStatement& stmt)
if (val.is_null()) {
col_stats[col_idx].null_count++;
} else {
// Collect NDV in same pass - use prefix for text to limit memory
std::string ndv_key = val.to_string();
// Collect NDV via HyperLogLog — memory-bounded vs unbounded unordered_set
uint64_t hash = 0;
if (col_info.type == common::ValueType::TYPE_TEXT ||
col_info.type == common::ValueType::TYPE_VARCHAR ||
col_info.type == common::ValueType::TYPE_CHAR) {
// Truncate to first 64 chars to limit memory in NDV set.
// Note: distinct strings with the same 64-char prefix will be
// counted as one NDV. Use HyperLogLog for production accuracy.
ndv_key.resize(std::min(ndv_key.size(), size_t(64)));
// Use 64-char prefix for text hashing
const std::string& s = val.as_text();
size_t prefix_len = std::min(s.size(), size_t(64));
hash = common::HyperLogLog::hash_bytes(s.data(), prefix_len);
Comment on lines +1016 to +1022
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Hash the full text value for NDV.

Using only the first 64 characters makes all longer strings with a shared prefix collide into the same HLL stream, which will systematically undercount NDV for common text patterns.

Suggested fix
-                    // Use 64-char prefix for text hashing
                     const std::string& s = val.as_text();
-                    size_t prefix_len = std::min(s.size(), size_t(64));
-                    hash = common::HyperLogLog::hash_bytes(s.data(), prefix_len);
+                    hash = common::HyperLogLog::hash_bytes(s.data(), s.size());
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/executor/query_executor.cpp` around lines 955 - 961, The text branch
currently hashes only a 64-byte prefix (see col_info.type checks and
val.as_text()) which undercounts NDV; change it to hash the entire string by
passing the full length of val.as_text() to common::HyperLogLog::hash_bytes
(replace the prefix_len logic and use s.size()), ensuring the computed hash
value assigned to variable hash covers the whole text for
ValueType::TYPE_TEXT/TYPE_VARCHAR/TYPE_CHAR cases.

} else {
// Use common::Value::Hash for numeric and other types
hash = static_cast<uint64_t>(common::Value::Hash{}(val));
}
ndv_sets[col_idx].insert(std::move(ndv_key));
ndv_estimators[col_idx].insert(hash);

switch (col_info.type) {
case common::ValueType::TYPE_INT64:
Expand Down Expand Up @@ -1075,9 +1079,9 @@ QueryResult QueryExecutor::execute_analyze(const parser::AnalyzeStatement& stmt)
}
}

// Compute NDV from sets collected in single pass
// Compute NDV from HLL estimators collected in single pass
for (size_t col_idx = 0; col_idx < table_meta->columns.size(); ++col_idx) {
col_stats[col_idx].ndv = static_cast<uint64_t>(ndv_sets[col_idx].size());
col_stats[col_idx].ndv = ndv_estimators[col_idx].cardinality();
}

// Update table-level stats
Expand Down
3 changes: 2 additions & 1 deletion tests/cloudSQL_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1305,7 +1305,8 @@ TEST(ExecutionTests, AnalyzeTable) {
// txt column
EXPECT_TRUE(table_info->columns[2].has_stats);
EXPECT_EQ(table_info->columns[2].null_count, 0U);
EXPECT_EQ(table_info->columns[2].ndv.value(), 3U); // 'A', 'B', 'C'
// HLL is probabilistic — for 3 distinct text values, estimate should be >= 3
EXPECT_GE(table_info->columns[2].ndv.value(), 3U);
// String length stats for txt column ('A','B','C' are all length 1)
EXPECT_TRUE(table_info->columns[2].min_str_len.has_value());
EXPECT_TRUE(table_info->columns[2].max_str_len.has_value());
Expand Down
Loading