diff --git a/mysql-test/suite/percona/include/distance.inc b/mysql-test/suite/percona/include/distance.inc new file mode 100644 index 000000000000..91c193975a4a --- /dev/null +++ b/mysql-test/suite/percona/include/distance.inc @@ -0,0 +1,237 @@ +--echo # +--echo # Test coverage for vector DISTANCE() function. +--echo # + +--echo # +--echo # 0) Prepare playground. +--echo # +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), + (1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), + (2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), + (3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), + (4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), + (98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), + (99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); + +--echo # +--echo # 1) Test how different number and types of arguments are handled. +--echo # +--echo # 1.1) Arity. +--echo # +--error ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT +SELECT DISTANCE(); +--error ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT +SELECT DISTANCE(TO_VECTOR("[1]")); +--error ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +eval SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "$metric"); +--error ER_WRONG_PARAMCOUNT_TO_NATIVE_FCT +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); + +--echo # +--echo # 1.2) Argument types. +--echo # +--echo # Only vectors or binary strings are allowed for first the two arguments. 
+--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "$metric"); +eval SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "$metric"); +eval SELECT DISTANCE(v1, TO_VECTOR("[2]"), "$metric") FROM t1 WHERE id = 0; +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(id, TO_VECTOR("[2]"), "$metric") FROM t1 WHERE id = 0; +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0]"), v1, "$metric") FROM t1 WHERE id = 1; +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(TO_VECTOR("[0]"), id, "$metric") FROM t1 WHERE id = 1; + +--echo # The third argument must be a string literal with value from the +--echo # fixed list of metric names. +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +--echo # Metric strings with embedded NUL must be rejected regardless of prefix match. 
+--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); + +--echo # +--echo # 1.3) NULL arguments and nullability in metadata for result. +--echo # +eval SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "$metric"); +eval SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "$metric") FROM t1 WHERE id = 99; +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "$metric") FROM t1 WHERE id = 99; +--echo # The third argument doesn't allow NULL values in any form. +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +--error ER_WRONG_ARGUMENTS +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +--echo # The result metadata should indicate that it is nullable. +eval CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "$metric") AS d; +SHOW CREATE TABLE tt; +DROP TABLE tt; + +--echo # +--echo # 2) Test vector arguments length mismatch. +--echo # +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "$metric"); +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(v2, TO_VECTOR("[1]"), "$metric") FROM t1 WHERE id = 1; +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(v1, v2, "$metric") FROM t1 WHERE id = 1; +--echo # +--echo # Note that length check happens at runtime. 
This is well visible +--echo # when we have value stored in a vector field which is shorter than +--echo # maximum length specified at the field creation time. +eval SELECT DISTANCE(v1, v2, "$metric") FROM t1 WHERE id = 98; +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "$metric") FROM t1 WHERE id = 98; +--echo # Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +--echo # A BLOB column is used so the argument passes the resolve-time binary-charset +--echo # type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +--error ER_WRONG_ARGUMENTS +eval SELECT DISTANCE(v, v, "$metric") FROM t_oversized; +DROP TABLE t_oversized; + +--echo # +--echo # 3) Some basic tests for different (from syntax PoV) variants of +--echo # arguments. +--echo # +eval SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "$metric"); +eval SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "$metric"); +eval SELECT DISTANCE(X'0000000000000000', v2, "$metric") FROM t1 WHERE id = 4; +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "$metric") FROM t1 WHERE id = 1; +eval SELECT DISTANCE(a.v2, b.v2, "$metric") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +eval SELECT DISTANCE(v2, X'0000000000000040', "$metric") FROM t1 WHERE id = 0; +eval SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "$metric") FROM t1 WHERE id = 0; +--echo # Non-trivial (artificial) combinations +eval SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "$metric"); +--echo # The below case demonstrates that arguments to DISTANCE might not be +--echo # well-aligned in memory. 
+eval SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "$metric"); +--echo # 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +--echo # Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. +eval SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "$metric"); + +--echo # +--echo # 4) Basic test for different vector values. +--echo # +--echo # Identical / collinear vectors. +eval SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "$metric"); +--echo # Orthogonal vectors. +eval SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "$metric"); +--echo # Anti-parallel vectors. +eval SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "$metric"); +if ($metric != DOT) +{ + if ($metric != MANHATTAN) + { + if ($metric != COSINE) + { + --error ER_DATA_OUT_OF_RANGE + } + } +} +eval SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "$metric"); +--echo # Distance from origin. +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "$metric"); +--echo # Mixed-sign and larger vectors. 
+eval SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "$metric"); +--echo # Zero vector (behavior differs per metric). +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "$metric"); +eval SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "$metric"); +--echo # Large values near float32 max. +eval SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "$metric"); +--echo # Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +--echo # fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +--echo # float32 overflows to +Inf, but the isfinite check falls back to scalar). +eval SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), + TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), + "$metric"); +--echo # Symmetry: DISTANCE(a, b) = DISTANCE(b, a). +eval SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "$metric") = + DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "$metric"); +--echo # Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +--echo # NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). +--error ER_DATA_OUT_OF_RANGE +eval SELECT DISTANCE(X'0000C07F', X'00000000', "$metric"); +--error ER_DATA_OUT_OF_RANGE +eval SELECT DISTANCE(X'0000807F', X'00000000', "$metric"); +--error ER_DATA_OUT_OF_RANGE +eval SELECT DISTANCE(X'000080FF', X'00000000', "$metric"); +--echo # Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +--echo # Integer-valued diffs keep float32 partial sums exact, so results are +--echo # identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +--echo # 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. 
+eval SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), + TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), + "$metric"); +--echo # 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +--echo # AVX-512 1x16 + 4-elem scalar tail. +eval SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), + TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), + "$metric"); + +--disable_warnings +--echo # +--echo # 5) Distance in query contexts. +--echo # +--echo # ORDER BY distance: nearest-neighbour pattern. +eval SELECT id FROM t1 WHERE id IN (0,1,2,3,4) + ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "$metric"), id; +--echo # ORDER BY distance DESC: farthest-neighbour pattern. +eval SELECT id FROM t1 WHERE id IN (0,1,2,3,4) + ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "$metric") DESC, id; +--echo # WHERE: range query filtering by distance. +eval SELECT id FROM t1 + WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "$metric") < 1.5 + ORDER BY id; +--echo # Derived table with distance. +eval SELECT id FROM + (SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "$metric") AS d + FROM t1 WHERE id IN (0,1,2,3,4)) AS sq + WHERE d IS NOT NULL ORDER BY d, id; +--enable_warnings + +if ($metric == COSINE) +{ +--echo # Zero-vector cosine in DML: must insert NULL without aborting under strict sql_mode. +CREATE TABLE tt_cosine_dml (d DOUBLE); +INSERT INTO tt_cosine_dml SELECT DISTANCE(TO_VECTOR('[0]'), TO_VECTOR('[0]'), 'COSINE'); +SELECT d FROM tt_cosine_dml; +DROP TABLE tt_cosine_dml; +} + +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/r/distance_cosine.result b/mysql-test/suite/percona/r/distance_cosine.result new file mode 100644 index 000000000000..03ae8bc4491d --- /dev/null +++ b/mysql-test/suite/percona/r/distance_cosine.result @@ -0,0 +1,344 @@ +# +# Test coverage for vector DISTANCE() function. +# +# +# 0) Prepare playground. 
+# +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), +(1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), +(2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), +(3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), +(4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), +(98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), +(99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); +# +# 1) Test how different number and types of arguments are handled. +# +# 1.1) Arity. +# +SELECT DISTANCE(); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "COSINE"); +DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +# +# 1.2) Argument types. +# +# Only vectors or binary strings are allowed for first the two arguments. 
+SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "COSINE"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "COSINE"); +DISTANCE(X'0000803F', TO_VECTOR("[2]"), "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "COSINE"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "COSINE") +NULL +SELECT DISTANCE(v1, TO_VECTOR("[2]"), "COSINE") FROM t1 WHERE id = 0; +DISTANCE(v1, TO_VECTOR("[2]"), "COSINE") +NULL +SELECT DISTANCE(id, TO_VECTOR("[2]"), "COSINE") FROM t1 WHERE id = 0; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "COSINE"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "COSINE"); +DISTANCE(TO_VECTOR("[2]"), X'0000803F', "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[0]"), v1, "COSINE") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0]"), v1, "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0]"), id, "COSINE") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# The third argument must be a string literal with value from the +# fixed list of metric names. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN") +3 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E') +4 +# Metric strings with embedded NUL must be rejected regardless of prefix match. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +# +# 1.3) NULL arguments and nullability in metadata for result. +# +SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "COSINE"); +DISTANCE(NULL, TO_VECTOR("[1, 0]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), NULL, "COSINE") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "COSINE") FROM t1 WHERE id = 99; +DISTANCE(v2, TO_VECTOR("[1, 0]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "COSINE") FROM t1 WHERE id = 99; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "COSINE") +NULL +# The third argument doesn't allow NULL values in any form. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +ERROR HY000: Incorrect arguments to distance +# The result metadata should indicate that it is nullable. +CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "COSINE") AS d; +SHOW CREATE TABLE tt; +Table Create Table +tt CREATE TABLE `tt` ( + `d` double DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci +DROP TABLE tt; +# +# 2) Test vector arguments length mismatch. 
+# +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "COSINE"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v2, TO_VECTOR("[1]"), "COSINE") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v1, v2, "COSINE") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# +# Note that length check happens at runtime. This is well visible +# when we have value stored in a vector field which is shorter than +# maximum length specified at the field creation time. +SELECT DISTANCE(v1, v2, "COSINE") FROM t1 WHERE id = 98; +DISTANCE(v1, v2, "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "COSINE") FROM t1 WHERE id = 98; +ERROR HY000: Incorrect arguments to distance +# Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +# A BLOB column is used so the argument passes the resolve-time binary-charset +# type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +SELECT DISTANCE(v, v, "COSINE") FROM t_oversized; +ERROR HY000: Incorrect arguments to distance +DROP TABLE t_oversized; +# +# 3) Some basic tests for different (from syntax PoV) variants of +# arguments. 
+# +SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "COSINE"); +DISTANCE(X'0000000000000000', X'0000000000000040', "COSINE") +NULL +SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "COSINE"); +DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "COSINE") +NULL +SELECT DISTANCE(X'0000000000000000', v2, "COSINE") FROM t1 WHERE id = 4; +DISTANCE(X'0000000000000000', v2, "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "COSINE") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "COSINE") +NULL +SELECT DISTANCE(a.v2, b.v2, "COSINE") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +DISTANCE(a.v2, b.v2, "COSINE") +NULL +SELECT DISTANCE(v2, X'0000000000000040', "COSINE") FROM t1 WHERE id = 0; +DISTANCE(v2, X'0000000000000040', "COSINE") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "COSINE") FROM t1 WHERE id = 0; +DISTANCE(v2, TO_VECTOR("[0, 2]"), "COSINE") +NULL +# Non-trivial (artificial) combinations +SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "COSINE"); +DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "COSINE") +0 +# The below case demonstrates that arguments to DISTANCE might not be +# well-aligned in memory. +SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "COSINE"); +DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "COSINE") +0 +# 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +# Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. 
+SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "COSINE"); +DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "COSINE") +0 +# +# 4) Basic test for different vector values. +# +# Identical / collinear vectors. +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "COSINE") +0 +SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "COSINE") +0 +# Orthogonal vectors. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "COSINE") +1 +SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "COSINE") +1 +SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "COSINE") +1 +# Anti-parallel vectors. +SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "COSINE"); +DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "COSINE") +2 +SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "COSINE"); +DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "COSINE") +2 +# Distance from origin. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "COSINE") +NULL +# Mixed-sign and larger vectors. +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "COSINE") +0.025368153802923787 +SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "COSINE") +0.17366216073634244 +# Zero vector (behavior differs per metric). +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "COSINE") +NULL +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "COSINE"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "COSINE") +NULL +# Large values near float32 max. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "COSINE"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "COSINE") +NULL +# Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +# fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +# float32 overflows to +Inf, but the isfinite check falls back to scalar). 
+SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"COSINE"); +DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"COSINE") +NULL +# Symmetry: DISTANCE(a, b) = DISTANCE(b, a). +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "COSINE") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "COSINE"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "COSINE") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "COSINE") +1 +# Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +# NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). +SELECT DISTANCE(X'0000C07F', X'00000000', "COSINE"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000c07f,0x00000000,'COSINE')' +SELECT DISTANCE(X'0000807F', X'00000000', "COSINE"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000807f,0x00000000,'COSINE')' +SELECT DISTANCE(X'000080FF', X'00000000', "COSINE"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x000080ff,0x00000000,'COSINE')' +# Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +# Integer-valued diffs keep float32 partial sums exact, so results are +# identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +# 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"COSINE"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"COSINE") +1 +# 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +# AVX-512 1x16 + 4-elem scalar tail. 
+SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"COSINE"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"COSINE") +1 +# +# 5) Distance in query contexts. +# +# ORDER BY distance: nearest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "COSINE"), id; +id +0 +1 +4 +3 +2 +# ORDER BY distance DESC: farthest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "COSINE") DESC, id; +id +2 +3 +1 +4 +0 +# WHERE: range query filtering by distance. +SELECT id FROM t1 +WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "COSINE") < 1.5 +ORDER BY id; +id +1 +2 +3 +4 +# Derived table with distance. +SELECT id FROM +(SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "COSINE") AS d +FROM t1 WHERE id IN (0,1,2,3,4)) AS sq +WHERE d IS NOT NULL ORDER BY d, id; +id +1 +4 +3 +2 +# Zero-vector cosine in DML: must insert NULL without aborting under strict sql_mode. +CREATE TABLE tt_cosine_dml (d DOUBLE); +INSERT INTO tt_cosine_dml SELECT DISTANCE(TO_VECTOR('[0]'), TO_VECTOR('[0]'), 'COSINE'); +SELECT d FROM tt_cosine_dml; +d +NULL +DROP TABLE tt_cosine_dml; +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/r/distance_dot.result b/mysql-test/suite/percona/r/distance_dot.result new file mode 100644 index 000000000000..718d5c62c7f7 --- /dev/null +++ b/mysql-test/suite/percona/r/distance_dot.result @@ -0,0 +1,339 @@ +# +# Test coverage for vector DISTANCE() function. +# +# +# 0) Prepare playground. 
+# +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), +(1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), +(2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), +(3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), +(4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), +(98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), +(99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); +# +# 1) Test how different number and types of arguments are handled. +# +# 1.1) Arity. +# +SELECT DISTANCE(); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "DOT"); +DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "DOT") +-2 +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +# +# 1.2) Argument types. +# +# Only vectors or binary strings are allowed for first the two arguments. 
+SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "DOT"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "DOT"); +DISTANCE(X'0000803F', TO_VECTOR("[2]"), "DOT") +-2 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "DOT"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "DOT") +-0 +SELECT DISTANCE(v1, TO_VECTOR("[2]"), "DOT") FROM t1 WHERE id = 0; +DISTANCE(v1, TO_VECTOR("[2]"), "DOT") +-0 +SELECT DISTANCE(id, TO_VECTOR("[2]"), "DOT") FROM t1 WHERE id = 0; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "DOT"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "DOT"); +DISTANCE(TO_VECTOR("[2]"), X'0000803F', "DOT") +-2 +SELECT DISTANCE(TO_VECTOR("[0]"), v1, "DOT") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0]"), v1, "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0]"), id, "DOT") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# The third argument must be a string literal with value from the +# fixed list of metric names. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN") +3 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E') +4 +# Metric strings with embedded NUL must be rejected regardless of prefix match. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +# +# 1.3) NULL arguments and nullability in metadata for result. +# +SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "DOT"); +DISTANCE(NULL, TO_VECTOR("[1, 0]"), "DOT") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), NULL, "DOT") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "DOT") FROM t1 WHERE id = 99; +DISTANCE(v2, TO_VECTOR("[1, 0]"), "DOT") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "DOT") FROM t1 WHERE id = 99; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "DOT") +NULL +# The third argument doesn't allow NULL values in any form. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +ERROR HY000: Incorrect arguments to distance +# The result metadata should indicate that it is nullable. +CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "DOT") AS d; +SHOW CREATE TABLE tt; +Table Create Table +tt CREATE TABLE `tt` ( + `d` double DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci +DROP TABLE tt; +# +# 2) Test vector arguments length mismatch. 
+# +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "DOT"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v2, TO_VECTOR("[1]"), "DOT") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v1, v2, "DOT") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# +# Note that length check happens at runtime. This is well visible +# when we have value stored in a vector field which is shorter than +# maximum length specified at the field creation time. +SELECT DISTANCE(v1, v2, "DOT") FROM t1 WHERE id = 98; +DISTANCE(v1, v2, "DOT") +-2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "DOT") FROM t1 WHERE id = 98; +ERROR HY000: Incorrect arguments to distance +# Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +# A BLOB column is used so the argument passes the resolve-time binary-charset +# type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +SELECT DISTANCE(v, v, "DOT") FROM t_oversized; +ERROR HY000: Incorrect arguments to distance +DROP TABLE t_oversized; +# +# 3) Some basic tests for different (from syntax PoV) variants of +# arguments. 
+# +SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "DOT"); +DISTANCE(X'0000000000000000', X'0000000000000040', "DOT") +-0 +SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "DOT"); +DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "DOT") +-0 +SELECT DISTANCE(X'0000000000000000', v2, "DOT") FROM t1 WHERE id = 4; +DISTANCE(X'0000000000000000', v2, "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "DOT") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "DOT") +-0 +SELECT DISTANCE(a.v2, b.v2, "DOT") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +DISTANCE(a.v2, b.v2, "DOT") +-0 +SELECT DISTANCE(v2, X'0000000000000040', "DOT") FROM t1 WHERE id = 0; +DISTANCE(v2, X'0000000000000040', "DOT") +-0 +SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "DOT") FROM t1 WHERE id = 0; +DISTANCE(v2, TO_VECTOR("[0, 2]"), "DOT") +-0 +# Non-trivial (artificial) combinations +SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "DOT"); +DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "DOT") +-2 +# The below case demonstrates that arguments to DISTANCE might not be +# well-aligned in memory. +SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "DOT"); +DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "DOT") +-2 +# 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +# Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. 
+SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "DOT"); +DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "DOT") +-2 +# +# 4) Basic test for different vector values. +# +# Identical / collinear vectors. +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "DOT") +-2 +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "DOT") +-1 +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "DOT") +-5 +SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "DOT") +-55 +# Orthogonal vectors. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "DOT") +-0 +# Anti-parallel vectors. +SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "DOT"); +DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "DOT") +4 +SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "DOT"); +DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "DOT") +3.999999744228558e76 +# Distance from origin. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "DOT") +-0 +# Mixed-sign and larger vectors. +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "DOT") +-32 +SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "DOT") +-113 +# Zero vector (behavior differs per metric). +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "DOT") +-0 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "DOT"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "DOT") +-0 +# Large values near float32 max. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "DOT"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "DOT") +-0 +# Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +# fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +# float32 overflows to +Inf, but the isfinite check falls back to scalar). 
+SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"DOT"); +DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"DOT") +-0 +# Symmetry: DISTANCE(a, b) = DISTANCE(b, a). +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "DOT") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "DOT"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "DOT") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "DOT") +1 +# Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +# NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). +SELECT DISTANCE(X'0000C07F', X'00000000', "DOT"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000c07f,0x00000000,'DOT')' +SELECT DISTANCE(X'0000807F', X'00000000', "DOT"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000807f,0x00000000,'DOT')' +SELECT DISTANCE(X'000080FF', X'00000000', "DOT"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x000080ff,0x00000000,'DOT')' +# Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +# Integer-valued diffs keep float32 partial sums exact, so results are +# identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +# 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"DOT"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"DOT") +-0 +# 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +# AVX-512 1x16 + 4-elem scalar tail. 
+SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"DOT"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"DOT") +-0 +# +# 5) Distance in query contexts. +# +# ORDER BY distance: nearest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "DOT"), id; +id +4 +1 +3 +0 +2 +# ORDER BY distance DESC: farthest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "DOT") DESC, id; +id +0 +2 +1 +3 +4 +# WHERE: range query filtering by distance. +SELECT id FROM t1 +WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "DOT") < 1.5 +ORDER BY id; +id +0 +1 +2 +3 +4 +# Derived table with distance. +SELECT id FROM +(SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "DOT") AS d +FROM t1 WHERE id IN (0,1,2,3,4)) AS sq +WHERE d IS NOT NULL ORDER BY d, id; +id +4 +1 +3 +0 +2 +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/r/distance_euclidean.result b/mysql-test/suite/percona/r/distance_euclidean.result new file mode 100644 index 000000000000..1a483182af36 --- /dev/null +++ b/mysql-test/suite/percona/r/distance_euclidean.result @@ -0,0 +1,338 @@ +# +# Test coverage for vector DISTANCE() function. +# +# +# 0) Prepare playground. 
+# +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), +(1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), +(2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), +(3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), +(4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), +(98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), +(99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); +# +# 1) Test how different number and types of arguments are handled. +# +# 1.1) Arity. +# +SELECT DISTANCE(); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +# +# 1.2) Argument types. +# +# Only vectors or binary strings are allowed for first the two arguments. 
+SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "EUCLIDEAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "EUCLIDEAN"); +DISTANCE(X'0000803F', TO_VECTOR("[2]"), "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "EUCLIDEAN") +2 +SELECT DISTANCE(v1, TO_VECTOR("[2]"), "EUCLIDEAN") FROM t1 WHERE id = 0; +DISTANCE(v1, TO_VECTOR("[2]"), "EUCLIDEAN") +2 +SELECT DISTANCE(id, TO_VECTOR("[2]"), "EUCLIDEAN") FROM t1 WHERE id = 0; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "EUCLIDEAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[2]"), X'0000803F', "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), v1, "EUCLIDEAN") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0]"), v1, "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), id, "EUCLIDEAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# The third argument must be a string literal with value from the +# fixed list of metric names. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN") +3 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E') +4 +# Metric strings with embedded NUL must be rejected regardless of prefix match. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +# +# 1.3) NULL arguments and nullability in metadata for result. +# +SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "EUCLIDEAN"); +DISTANCE(NULL, TO_VECTOR("[1, 0]"), "EUCLIDEAN") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), NULL, "EUCLIDEAN") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "EUCLIDEAN") FROM t1 WHERE id = 99; +DISTANCE(v2, TO_VECTOR("[1, 0]"), "EUCLIDEAN") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN") FROM t1 WHERE id = 99; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN") +NULL +# The third argument doesn't allow NULL values in any form. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +ERROR HY000: Incorrect arguments to distance +# The result metadata should indicate that it is nullable. +CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN") AS d; +SHOW CREATE TABLE tt; +Table Create Table +tt CREATE TABLE `tt` ( + `d` double DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci +DROP TABLE tt; +# +# 2) Test vector arguments length mismatch. 
+# +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v2, TO_VECTOR("[1]"), "EUCLIDEAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v1, v2, "EUCLIDEAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# +# Note that length check happens at runtime. This is well visible +# when we have value stored in a vector field which is shorter than +# maximum length specified at the field creation time. +SELECT DISTANCE(v1, v2, "EUCLIDEAN") FROM t1 WHERE id = 98; +DISTANCE(v1, v2, "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN") FROM t1 WHERE id = 98; +ERROR HY000: Incorrect arguments to distance +# Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +# A BLOB column is used so the argument passes the resolve-time binary-charset +# type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +SELECT DISTANCE(v, v, "EUCLIDEAN") FROM t_oversized; +ERROR HY000: Incorrect arguments to distance +DROP TABLE t_oversized; +# +# 3) Some basic tests for different (from syntax PoV) variants of +# arguments. 
+# +SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "EUCLIDEAN"); +DISTANCE(X'0000000000000000', X'0000000000000040', "EUCLIDEAN") +2 +SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "EUCLIDEAN"); +DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "EUCLIDEAN") +2 +SELECT DISTANCE(X'0000000000000000', v2, "EUCLIDEAN") FROM t1 WHERE id = 4; +DISTANCE(X'0000000000000000', v2, "EUCLIDEAN") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN") +1 +SELECT DISTANCE(a.v2, b.v2, "EUCLIDEAN") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +DISTANCE(a.v2, b.v2, "EUCLIDEAN") +2 +SELECT DISTANCE(v2, X'0000000000000040', "EUCLIDEAN") FROM t1 WHERE id = 0; +DISTANCE(v2, X'0000000000000040', "EUCLIDEAN") +2 +SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "EUCLIDEAN") FROM t1 WHERE id = 0; +DISTANCE(v2, TO_VECTOR("[0, 2]"), "EUCLIDEAN") +2 +# Non-trivial (artificial) combinations +SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "EUCLIDEAN"); +DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "EUCLIDEAN") +1 +# The below case demonstrates that arguments to DISTANCE might not be +# well-aligned in memory. +SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "EUCLIDEAN"); +DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "EUCLIDEAN") +1 +# 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +# Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. 
+SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "EUCLIDEAN"); +DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "EUCLIDEAN") +1 +# +# 4) Basic test for different vector values. +# +# Identical / collinear vectors. +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "EUCLIDEAN") +0 +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN") +0 +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "EUCLIDEAN") +2.1213203435596424 +SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN") +0 +# Orthogonal vectors. +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN") +1.4142135623730951 +SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "EUCLIDEAN") +1.7320508075688772 +SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "EUCLIDEAN") +7.416198487095663 +# Anti-parallel vectors. +SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN") +4.242640687119285 +SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "EUCLIDEAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(to_vector('[-2e38, 1]'),to_vector('[2e38, -1]'),'EUCLIDEAN')' +# Distance from origin. 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "EUCLIDEAN") +5 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "EUCLIDEAN") +13 +SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "EUCLIDEAN") +2 +# Mixed-sign and larger vectors. +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN") +5.196152422706632 +SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN") +13 +# Zero vector (behavior differs per metric). +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN") +2.8284271247461903 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "EUCLIDEAN") +0 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "EUCLIDEAN") +0 +# Large values near float32 max. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "EUCLIDEAN") +1.9999999360571385e38 +# Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +# fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +# float32 overflows to +Inf, but the isfinite check falls back to scalar). 
+SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"EUCLIDEAN"); +DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"EUCLIDEAN") +1.9999999360571385e38 +# Symmetry: DISTANCE(a, b) = DISTANCE(b, a). +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "EUCLIDEAN") +1 +# Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +# NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). +SELECT DISTANCE(X'0000C07F', X'00000000', "EUCLIDEAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000c07f,0x00000000,'EUCLIDEAN')' +SELECT DISTANCE(X'0000807F', X'00000000', "EUCLIDEAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000807f,0x00000000,'EUCLIDEAN')' +SELECT DISTANCE(X'000080FF', X'00000000', "EUCLIDEAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x000080ff,0x00000000,'EUCLIDEAN')' +# Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +# Integer-valued diffs keep float32 partial sums exact, so results are +# identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +# 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN") +4 +# 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +# AVX-512 1x16 + 4-elem scalar tail. 
+SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN") +4.47213595499958 +# +# 5) Distance in query contexts. +# +# ORDER BY distance: nearest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN"), id; +id +1 +0 +3 +4 +2 +# ORDER BY distance DESC: farthest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN") DESC, id; +id +2 +0 +3 +4 +1 +# WHERE: range query filtering by distance. +SELECT id FROM t1 +WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN") < 1.5 +ORDER BY id; +id +0 +1 +2 +3 +4 +# Derived table with distance. +SELECT id FROM +(SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN") AS d +FROM t1 WHERE id IN (0,1,2,3,4)) AS sq +WHERE d IS NOT NULL ORDER BY d, id; +id +1 +0 +3 +4 +2 +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/r/distance_euclidean_squared.result b/mysql-test/suite/percona/r/distance_euclidean_squared.result new file mode 100644 index 000000000000..7824fd08435d --- /dev/null +++ b/mysql-test/suite/percona/r/distance_euclidean_squared.result @@ -0,0 +1,337 @@ +# +# Test coverage for vector DISTANCE() function. +# +# +# 0) Prepare playground. 
+# +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), +(1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), +(2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), +(3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), +(4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), +(98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), +(99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); +# +# 1) Test how different number and types of arguments are handled. +# +# 1.1) Arity. +# +SELECT DISTANCE(); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +# +# 1.2) Argument types. +# +# Only vectors or binary strings are allowed for first the two arguments. 
+SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED"); +DISTANCE(X'0000803F', TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(v1, TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 0; +DISTANCE(v1, TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(id, TO_VECTOR("[2]"), "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 0; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "EUCLIDEAN_SQUARED"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[2]"), X'0000803F', "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), v1, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0]"), v1, "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), id, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# The third argument must be a string literal with value from the +# fixed list of metric names. 
+SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN") +3 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E') +4 +# Metric strings with embedded NUL must be rejected regardless of prefix match. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +# +# 1.3) NULL arguments and nullability in metadata for result. 
+# +SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(NULL, TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), NULL, "EUCLIDEAN_SQUARED") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 99; +DISTANCE(v2, TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 99; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN_SQUARED") +NULL +# The third argument doesn't allow NULL values in any form. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +ERROR HY000: Incorrect arguments to distance +# The result metadata should indicate that it is nullable. +CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN_SQUARED") AS d; +SHOW CREATE TABLE tt; +Table Create Table +tt CREATE TABLE `tt` ( + `d` double DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci +DROP TABLE tt; +# +# 2) Test vector arguments length mismatch. +# +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v2, TO_VECTOR("[1]"), "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v1, v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# +# Note that length check happens at runtime. This is well visible +# when we have value stored in a vector field which is shorter than +# maximum length specified at the field creation time. 
+SELECT DISTANCE(v1, v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 98; +DISTANCE(v1, v2, "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 98; +ERROR HY000: Incorrect arguments to distance +# Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +# A BLOB column is used so the argument passes the resolve-time binary-charset +# type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +SELECT DISTANCE(v, v, "EUCLIDEAN_SQUARED") FROM t_oversized; +ERROR HY000: Incorrect arguments to distance +DROP TABLE t_oversized; +# +# 3) Some basic tests for different (from syntax PoV) variants of +# arguments. +# +SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "EUCLIDEAN_SQUARED"); +DISTANCE(X'0000000000000000', X'0000000000000040', "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(X'0000000000000000', v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 4; +DISTANCE(X'0000000000000000', v2, "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(a.v2, b.v2, "EUCLIDEAN_SQUARED") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +DISTANCE(a.v2, b.v2, "EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(v2, X'0000000000000040', "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 0; +DISTANCE(v2, X'0000000000000040', 
"EUCLIDEAN_SQUARED") +4 +SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "EUCLIDEAN_SQUARED") FROM t1 WHERE id = 0; +DISTANCE(v2, TO_VECTOR("[0, 2]"), "EUCLIDEAN_SQUARED") +4 +# Non-trivial (artificial) combinations +SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "EUCLIDEAN_SQUARED") +1 +# The below case demonstrates that arguments to DISTANCE might not be +# well-aligned in memory. +SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "EUCLIDEAN_SQUARED"); +DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "EUCLIDEAN_SQUARED") +1 +# 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +# Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. +SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "EUCLIDEAN_SQUARED"); +DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "EUCLIDEAN_SQUARED") +1 +# +# 4) Basic test for different vector values. +# +# Identical / collinear vectors. +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "EUCLIDEAN_SQUARED") +0 +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") +0 +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "EUCLIDEAN_SQUARED") +4.5 +SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN_SQUARED") +0 +# Orthogonal vectors. 
+SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "EUCLIDEAN_SQUARED") +2 +SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "EUCLIDEAN_SQUARED") +3 +SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "EUCLIDEAN_SQUARED") +55 +# Anti-parallel vectors. +SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN_SQUARED") +18 +SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "EUCLIDEAN_SQUARED"); +ERROR 22003: DOUBLE value is out of range in 'distance(to_vector('[-2e38, 1]'),to_vector('[2e38, -1]'),'EUCLIDEAN_SQUARED')' +# Distance from origin. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "EUCLIDEAN_SQUARED") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "EUCLIDEAN_SQUARED") +25 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "EUCLIDEAN_SQUARED") +169 +SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "EUCLIDEAN_SQUARED") +4 +# Mixed-sign and larger vectors. 
+SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN_SQUARED") +27 +SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "EUCLIDEAN_SQUARED") +169 +# Zero vector (behavior differs per metric). +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "EUCLIDEAN_SQUARED") +8 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "EUCLIDEAN_SQUARED") +0 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "EUCLIDEAN_SQUARED") +0 +# Large values near float32 max. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "EUCLIDEAN_SQUARED") +3.999999744228558e76 +# Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +# fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +# float32 overflows to +Inf, but the isfinite check falls back to scalar). +SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"EUCLIDEAN_SQUARED") +3.999999744228558e76 +# Symmetry: DISTANCE(a, b) = DISTANCE(b, a). 
+SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN_SQUARED") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "EUCLIDEAN_SQUARED") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "EUCLIDEAN_SQUARED") +1 +# Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +# NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). +SELECT DISTANCE(X'0000C07F', X'00000000', "EUCLIDEAN_SQUARED"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000c07f,0x00000000,'EUCLIDEAN_SQUARED')' +SELECT DISTANCE(X'0000807F', X'00000000', "EUCLIDEAN_SQUARED"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000807f,0x00000000,'EUCLIDEAN_SQUARED')' +SELECT DISTANCE(X'000080FF', X'00000000', "EUCLIDEAN_SQUARED"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x000080ff,0x00000000,'EUCLIDEAN_SQUARED')' +# Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +# Integer-valued diffs keep float32 partial sums exact, so results are +# identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +# 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN_SQUARED") +16 +# 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +# AVX-512 1x16 + 4-elem scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN_SQUARED"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"EUCLIDEAN_SQUARED") +20 +# +# 5) Distance in query contexts. 
+# +# ORDER BY distance: nearest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN_SQUARED"), id; +id +1 +0 +3 +4 +2 +# ORDER BY distance DESC: farthest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN_SQUARED") DESC, id; +id +2 +0 +3 +4 +1 +# WHERE: range query filtering by distance. +SELECT id FROM t1 +WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN_SQUARED") < 1.5 +ORDER BY id; +id +0 +1 +3 +4 +# Derived table with distance. +SELECT id FROM +(SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "EUCLIDEAN_SQUARED") AS d +FROM t1 WHERE id IN (0,1,2,3,4)) AS sq +WHERE d IS NOT NULL ORDER BY d, id; +id +1 +0 +3 +4 +2 +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/r/distance_manhattan.result b/mysql-test/suite/percona/r/distance_manhattan.result new file mode 100644 index 000000000000..4e102449b3dc --- /dev/null +++ b/mysql-test/suite/percona/r/distance_manhattan.result @@ -0,0 +1,338 @@ +# +# Test coverage for vector DISTANCE() function. +# +# +# 0) Prepare playground. +# +CREATE TABLE t1 (id INT PRIMARY KEY, v1 VECTOR(1), v2 VECTOR(2)); +INSERT INTO t1 VALUES (0, TO_VECTOR('[0]'), TO_VECTOR('[0, 0]')), +(1, TO_VECTOR('[1]'), TO_VECTOR('[1, 0]')), +(2, TO_VECTOR('[1]'), TO_VECTOR('[0, 1]')), +(3, TO_VECTOR('[2]'), TO_VECTOR('[1, 1]')), +(4, TO_VECTOR('[2]'), TO_VECTOR('[2, 0]')), +(98, TO_VECTOR('[1]'), TO_VECTOR('[2]')), +(99, NULL, NULL); +CREATE TABLE t_metric_name (id INT PRIMARY KEY, name VARCHAR(10)); +INSERT INTO t_metric_name VALUES (1, "EUCLIDEAN"), (99, NULL); +# +# 1) Test how different number and types of arguments are handled. +# +# 1.1) Arity. 
+# +SELECT DISTANCE(); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]")); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[2]"), TO_VECTOR("[3]"), "EUCLIDEAN"); +ERROR 42000: Incorrect parameter count in the call to native function 'DISTANCE' +# +# 1.2) Argument types. +# +# Only vectors or binary strings are allowed for first the two arguments. +SELECT DISTANCE("[1]", TO_VECTOR("[2]"), "MANHATTAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(X'0000803F', TO_VECTOR("[2]"), "MANHATTAN"); +DISTANCE(X'0000803F', TO_VECTOR("[2]"), "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[2]"), "MANHATTAN") +2 +SELECT DISTANCE(v1, TO_VECTOR("[2]"), "MANHATTAN") FROM t1 WHERE id = 0; +DISTANCE(v1, TO_VECTOR("[2]"), "MANHATTAN") +2 +SELECT DISTANCE(id, TO_VECTOR("[2]"), "MANHATTAN") FROM t1 WHERE id = 0; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[1]"), "[2]", "MANHATTAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[2]"), X'0000803F', "MANHATTAN"); +DISTANCE(TO_VECTOR("[2]"), X'0000803F', "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), v1, "MANHATTAN") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0]"), v1, "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0]"), id, "MANHATTAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# The third argument must be a string literal with value from the +# fixed list of metric names. 
+SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[-1, 0]"), 1); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), CONCAT("EUCLI","DEAN")); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 0]"), "euclidean") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 0]"), "EuClIdEaN") +3 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E'); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[4, 0]"), X'4555434C494445414E') +4 +# Metric strings with embedded NUL must be rejected regardless of prefix match. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E00'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), X'4555434C494445414E004A554E4B'); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 0]"), "NOSUCHMETRIC"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[6, 0]"), name) FROM t_metric_name WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[7, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +# +# 1.3) NULL arguments and nullability in metadata for result. 
+# +SELECT DISTANCE(NULL, TO_VECTOR("[1, 0]"), "MANHATTAN"); +DISTANCE(NULL, TO_VECTOR("[1, 0]"), "MANHATTAN") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), NULL, "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), NULL, "MANHATTAN") +NULL +SELECT DISTANCE(v2, TO_VECTOR("[1, 0]"), "MANHATTAN") FROM t1 WHERE id = 99; +DISTANCE(v2, TO_VECTOR("[1, 0]"), "MANHATTAN") +NULL +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "MANHATTAN") FROM t1 WHERE id = 99; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "MANHATTAN") +NULL +# The third argument doesn't allow NULL values in any form. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), NULL); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), name) FROM t_metric_name WHERE id = 99; +ERROR HY000: Incorrect arguments to distance +# The result metadata should indicate that it is nullable. +CREATE TABLE tt SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "MANHATTAN") AS d; +SHOW CREATE TABLE tt; +Table Create Table +tt CREATE TABLE `tt` ( + `d` double DEFAULT NULL +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci +DROP TABLE tt; +# +# 2) Test vector arguments length mismatch. +# +SELECT DISTANCE(TO_VECTOR("[1]"), TO_VECTOR("[1, 0]"), "MANHATTAN"); +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v2, TO_VECTOR("[1]"), "MANHATTAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +SELECT DISTANCE(v1, v2, "MANHATTAN") FROM t1 WHERE id = 1; +ERROR HY000: Incorrect arguments to distance +# +# Note that length check happens at runtime. This is well visible +# when we have value stored in a vector field which is shorter than +# maximum length specified at the field creation time. 
+SELECT DISTANCE(v1, v2, "MANHATTAN") FROM t1 WHERE id = 98; +DISTANCE(v1, v2, "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "MANHATTAN") FROM t1 WHERE id = 98; +ERROR HY000: Incorrect arguments to distance +# Binary-string BLOB arguments exceeding max_dimensions (16383) are rejected. +# A BLOB column is used so the argument passes the resolve-time binary-charset +# type check; the max_dimensions guard fires at runtime in val_real(). +CREATE TABLE t_oversized (v MEDIUMBLOB); +INSERT INTO t_oversized VALUES (REPEAT(X'00000000', 16384)); +SELECT DISTANCE(v, v, "MANHATTAN") FROM t_oversized; +ERROR HY000: Incorrect arguments to distance +DROP TABLE t_oversized; +# +# 3) Some basic tests for different (from syntax PoV) variants of +# arguments. +# +SELECT DISTANCE(X'0000000000000000', X'0000000000000040', "MANHATTAN"); +DISTANCE(X'0000000000000000', X'0000000000000040', "MANHATTAN") +2 +SELECT DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "MANHATTAN"); +DISTANCE(X'0000000000000000', TO_VECTOR("[2, 0]"), "MANHATTAN") +2 +SELECT DISTANCE(X'0000000000000000', v2, "MANHATTAN") FROM t1 WHERE id = 4; +DISTANCE(X'0000000000000000', v2, "MANHATTAN") +2 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), X'000000000000803F', "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), v2, "MANHATTAN") FROM t1 WHERE id = 1; +DISTANCE(TO_VECTOR("[0, 0]"), v2, "MANHATTAN") +1 +SELECT DISTANCE(a.v2, b.v2, "MANHATTAN") FROM t1 AS a, t1 AS b WHERE a.id = 0 AND b.id = 4; +DISTANCE(a.v2, b.v2, "MANHATTAN") +2 +SELECT DISTANCE(v2, X'0000000000000040', "MANHATTAN") FROM t1 WHERE id = 0; +DISTANCE(v2, X'0000000000000040', "MANHATTAN") +2 +SELECT DISTANCE(v2, TO_VECTOR("[0, 2]"), "MANHATTAN") FROM t1 WHERE id = 0; +DISTANCE(v2, TO_VECTOR("[0, 2]"), "MANHATTAN") +2 +# Non-trivial 
(artificial) combinations +SELECT DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "MANHATTAN"); +DISTANCE(TO_VECTOR(CONCAT("[0", ", ", "1]")), CONCAT(X'00000000', X'00000040'), "MANHATTAN") +1 +# The below case demonstrates that arguments to DISTANCE might not be +# well-aligned in memory. +SELECT DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "MANHATTAN"); +DISTANCE(SUBSTR(X'010000000000000040', 2), RIGHT(X'40000000000000803F', 8), "MANHATTAN") +1 +# 9-byte blobs; SUBSTR from pos 2 → 8 bytes at offset 1 (misaligned for float). +# Length must stay a multiple of 4; SUBSTR(..., 4) on 9 bytes yields 6 → ER_TO_VECTOR_CONVERSION. +SELECT DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "MANHATTAN"); +DISTANCE(SUBSTR(X'000100000000000040', 2), SUBSTR(X'00040000000000803F', 2), "MANHATTAN") +1 +# +# 4) Basic test for different vector values. +# +# Identical / collinear vectors. +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[1, 1]"), "MANHATTAN") +0 +SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN") +0 +SELECT DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 1]"), TO_VECTOR("[2.5, 2.5]"), "MANHATTAN") +3 +SELECT DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 2, 3, 4, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "MANHATTAN") +0 +# Orthogonal vectors. 
+SELECT DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 0]"), TO_VECTOR("[0, 1]"), "MANHATTAN") +2 +SELECT DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 1, 0]"), TO_VECTOR("[-1, 0, -1]"), "MANHATTAN") +3 +SELECT DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 0, 3, 0, 5]"), TO_VECTOR("[0, 2, 0, 4, 0]"), "MANHATTAN") +15 +# Anti-parallel vectors. +SELECT DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[-1, -1]"), TO_VECTOR("[2, 2]"), "MANHATTAN") +6 +SELECT DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[-2e38, 1]"), TO_VECTOR("[2e38, -1]"), "MANHATTAN") +3.999999872114277e38 +# Distance from origin. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[1, 0]"), "MANHATTAN") +1 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[3, 4]"), "MANHATTAN") +7 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[5, 12]"), "MANHATTAN") +17 +SELECT DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0, 0, 0]"), TO_VECTOR("[1, 1, 1, 1]"), "MANHATTAN") +4 +# Mixed-sign and larger vectors. +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "MANHATTAN") +9 +SELECT DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 7, 3, 16, 5]"), TO_VECTOR("[1, 2, 3, 4, 5]"), "MANHATTAN") +17 +# Zero vector (behavior differs per metric). 
+SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2, 2]"), "MANHATTAN") +4 +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[0, 0]"), "MANHATTAN") +0 +SELECT DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0]"), TO_VECTOR("[0]"), "MANHATTAN") +0 +# Large values near float32 max. +SELECT DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[0, 0]"), TO_VECTOR("[2e38, 0]"), "MANHATTAN") +1.9999999360571385e38 +# Same value in a 16-dim vector: exercises the wide-tier SIMD overflow +# fallback (dims >= 16 dispatches to the wide kernel; squaring 2e38 in +# float32 overflows to +Inf, but the isfinite check falls back to scalar). +SELECT DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"MANHATTAN"); +DISTANCE(TO_VECTOR("[2e38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +TO_VECTOR("[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"), +"MANHATTAN") +1.9999999360571385e38 +# Symmetry: DISTANCE(a, b) = DISTANCE(b, a). +SELECT DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "MANHATTAN") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "MANHATTAN"); +DISTANCE(TO_VECTOR("[1, 2, 3]"), TO_VECTOR("[4, 5, 6]"), "MANHATTAN") = +DISTANCE(TO_VECTOR("[4, 5, 6]"), TO_VECTOR("[1, 2, 3]"), "MANHATTAN") +1 +# Special IEEE 754 float32 values: NaN, +Infinity, -Infinity. +# NaN/Inf input elements raise ER_DATA_OUT_OF_RANGE for all metrics (POW/EXP convention). 
+SELECT DISTANCE(X'0000C07F', X'00000000', "MANHATTAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000c07f,0x00000000,'MANHATTAN')' +SELECT DISTANCE(X'0000807F', X'00000000', "MANHATTAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x0000807f,0x00000000,'MANHATTAN')' +SELECT DISTANCE(X'000080FF', X'00000000', "MANHATTAN"); +ERROR 22003: DOUBLE value is out of range in 'distance(0x000080ff,0x00000000,'MANHATTAN')' +# Wide-tier SIMD path coverage (dims >= 16 dispatches to the wide kernel). +# Integer-valued diffs keep float32 partial sums exact, so results are +# identical across Scalar / SSE4.2 / NEON / AVX2 / AVX-512 / SVE2. +# 16-dim: fills one AVX-512 register / two AVX2 / four SSE4.2 -- no scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"MANHATTAN"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"MANHATTAN") +16 +# 20-dim: SSE4.2 5x4 (no tail); AVX2 2x8 + 4-elem scalar tail; +# AVX-512 1x16 + 4-elem scalar tail. +SELECT DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"MANHATTAN"); +DISTANCE(TO_VECTOR("[1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]"), +TO_VECTOR("[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]"), +"MANHATTAN") +20 +# +# 5) Distance in query contexts. +# +# ORDER BY distance: nearest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "MANHATTAN"), id; +id +1 +0 +3 +4 +2 +# ORDER BY distance DESC: farthest-neighbour pattern. +SELECT id FROM t1 WHERE id IN (0,1,2,3,4) +ORDER BY DISTANCE(v2, TO_VECTOR('[1, 0]'), "MANHATTAN") DESC, id; +id +2 +0 +3 +4 +1 +# WHERE: range query filtering by distance. +SELECT id FROM t1 +WHERE id IN (0,1,2,3,4) AND DISTANCE(v2, TO_VECTOR('[1, 0]'), "MANHATTAN") < 1.5 +ORDER BY id; +id +0 +1 +3 +4 +# Derived table with distance. 
+SELECT id FROM +(SELECT id, DISTANCE(v2, TO_VECTOR('[1, 0]'), "MANHATTAN") AS d +FROM t1 WHERE id IN (0,1,2,3,4)) AS sq +WHERE d IS NOT NULL ORDER BY d, id; +id +1 +0 +3 +4 +2 +DROP TABLE t_metric_name; +DROP TABLE t1; diff --git a/mysql-test/suite/percona/t/distance_cosine.test b/mysql-test/suite/percona/t/distance_cosine.test new file mode 100644 index 000000000000..3875a94aacfe --- /dev/null +++ b/mysql-test/suite/percona/t/distance_cosine.test @@ -0,0 +1,2 @@ +let $metric = COSINE; +--source ../include/distance.inc diff --git a/mysql-test/suite/percona/t/distance_dot.test b/mysql-test/suite/percona/t/distance_dot.test new file mode 100644 index 000000000000..bf77176908f5 --- /dev/null +++ b/mysql-test/suite/percona/t/distance_dot.test @@ -0,0 +1,2 @@ +let $metric = DOT; +--source ../include/distance.inc diff --git a/mysql-test/suite/percona/t/distance_euclidean.test b/mysql-test/suite/percona/t/distance_euclidean.test new file mode 100644 index 000000000000..ecb97385dcb8 --- /dev/null +++ b/mysql-test/suite/percona/t/distance_euclidean.test @@ -0,0 +1,2 @@ +let $metric = EUCLIDEAN; +--source ../include/distance.inc diff --git a/mysql-test/suite/percona/t/distance_euclidean_squared.test b/mysql-test/suite/percona/t/distance_euclidean_squared.test new file mode 100644 index 000000000000..a5b0f6552c1e --- /dev/null +++ b/mysql-test/suite/percona/t/distance_euclidean_squared.test @@ -0,0 +1,2 @@ +let $metric = EUCLIDEAN_SQUARED; +--source ../include/distance.inc diff --git a/mysql-test/suite/percona/t/distance_manhattan.test b/mysql-test/suite/percona/t/distance_manhattan.test new file mode 100644 index 000000000000..76edbf2a6b43 --- /dev/null +++ b/mysql-test/suite/percona/t/distance_manhattan.test @@ -0,0 +1,2 @@ +let $metric = MANHATTAN; +--source ../include/distance.inc diff --git a/share/messages_to_error_log.txt b/share/messages_to_error_log.txt index 931b2d731ae3..c69f9dfbd916 100644 --- a/share/messages_to_error_log.txt +++ 
b/share/messages_to_error_log.txt @@ -14049,6 +14049,9 @@ start-error-number 48300 # start-error-number 48400 +ER_VECTOR_DISTANCE_SIMD_DISPATCH + eng "%s" + # # End of Percona Server 9.7 error log messages # diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index b1fe25b17847..5d741cda1d5b 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -368,6 +368,7 @@ SET(SQL_SHARED_SOURCES auth/sha2_password_common.cc auth/sha2_password.cc ../vector-common/vector_conversion.cc + ../vector-common/vector_distance.cc ssl_wrapper_service.cc bootstrap.cc check_stack.cc diff --git a/sql/item_create.cc b/sql/item_create.cc index 02a5cd6f6cb9..2c04b5ef2584 100644 --- a/sql/item_create.cc +++ b/sql/item_create.cc @@ -1400,6 +1400,7 @@ static const std::pair func_array[] = { {"DAYOFWEEK", SQL_FACTORY(Dayofweek_instantiator)}, {"DAYOFYEAR", SQL_FN(Item_func_dayofyear, 1)}, {"DEGREES", SQL_FN(Item_func_degrees, 1)}, + {"DISTANCE", SQL_FN(Item_func_vector_distance, 3)}, {"ELT", SQL_FN_V(Item_func_elt, 2, MAX_ARGLIST_SIZE)}, {"ETAG", SQL_FN_V(Item_func_etag, 1, MAX_ARGLIST_SIZE)}, {"EXP", SQL_FN(Item_func_exp, 1)}, @@ -1657,6 +1658,7 @@ static const std::pair func_array[] = { {"FROM_VECTOR", SQL_FN(Item_func_from_vector, 1)}, {"VECTOR_TO_STRING", SQL_FN(Item_func_from_vector, 1)}, {"VECTOR_DIM", SQL_FN(Item_func_vector_dim, 1)}, + {"VECTOR_DISTANCE", SQL_FN(Item_func_vector_distance, 3)}, {"UCASE", SQL_FN(Item_func_upper, 1)}, {"UNCOMPRESS", SQL_FN(Item_func_uncompress, 1)}, {"UNCOMPRESSED_LENGTH", SQL_FN(Item_func_uncompressed_length, 1)}, diff --git a/sql/item_func.h b/sql/item_func.h index e36f51d8ba8d..d0ef3ed1b547 100644 --- a/sql/item_func.h +++ b/sql/item_func.h @@ -364,6 +364,7 @@ class Item_func : public Item_result_field { ETAG_FUNC, CURRENT_USER_IN_FUNC, CURRENT_ROLE_IN_FUNC, + VECTOR_DISTANCE_FUNC }; enum optimize_type { OPTIMIZE_NONE, @@ -858,6 +859,11 @@ class Item_real_func : public Item_func { set_data_type_double(); } + Item_real_func(const POS &pos, Item *a, 
Item *b, Item *c) + : Item_func(pos, a, b, c) { + set_data_type_double(); + } + explicit Item_real_func(mem_root_deque<Item *> *list) : Item_func(list) { set_data_type_double(); } diff --git a/sql/item_strfunc.cc b/sql/item_strfunc.cc index f819473493bf..c30a03a2a1c6 100644 --- a/sql/item_strfunc.cc +++ b/sql/item_strfunc.cc @@ -37,7 +37,7 @@ #include #include #include -#include <cmath> // std::isfinite +#include <cmath> // std::isfinite, std::isinf, std::isnan, std::sqrt #include // size_t #include #include @@ -135,6 +135,7 @@ #include "typelib.h" #include "unhex.h" #include "vector-common/vector_conversion.h" // from_string_to_vector, from_vector_to_string +#include "vector-common/vector_distance.h" // vector_distance_euclidean_squared, vector_distance_cosine, vector_distance_dot, vector_distance_manhattan extern uint *my_aes_opmode_key_sizes; @@ -4270,6 +4271,158 @@ String *Item_func_from_vector::val_str_ascii(String *str) { return &buffer; } +static const char *kDistanceMetricNames[] = { + "euclidean", "euclidean_squared", "cosine", "dot", "manhattan", nullptr}; +static unsigned int kDistanceMetricLengths[] = { + std::string_view{"euclidean"}.size(), + std::string_view{"euclidean_squared"}.size(), + std::string_view{"cosine"}.size(), + std::string_view{"dot"}.size(), + std::string_view{"manhattan"}.size(), + 0}; +static TYPELIB kDistanceMetricTypelib = { + array_elements(kDistanceMetricNames) - 1, "", kDistanceMetricNames, + kDistanceMetricLengths}; + +bool Item_func_vector_distance::do_itemize(Parse_context *pc, Item **res) { + if (skip_itemize(res)) return false; + if (Item_real_func::do_itemize(pc, res)) return true; + pc->thd->lex->set_stmt_unsafe(LEX::BINLOG_STMT_UNSAFE_SYSTEM_FUNCTION); + pc->thd->lex->set_uncacheable(pc->select, UNCACHEABLE_RAND); + return false; +} + +bool Item_func_vector_distance::resolve_type(THD *thd) { + if (param_type_is_default(thd, 0, 2, MYSQL_TYPE_VECTOR)) { + return true; + } + + for (uint i = 0; i < 2; ++i) { + if (!(args[i]->data_type() == MYSQL_TYPE_VECTOR || + (args[i]->result_type() == 
STRING_RESULT && + args[i]->collation.collation == &my_charset_bin))) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return true; + } + } + + // Let us prohibit non-literal metric names right away, to make + // optimizer life easier. This is not something going to happen + // in practice anyway. + if (!args[2]->basic_const_item()) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return true; + } + + String tmp, *metric_n = args[2]->val_str_ascii(&tmp); + + if (metric_n == nullptr) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return true; + } + + // find_type2 is length-aware: embedded NUL + trailing bytes cannot match. + switch (find_type2(&kDistanceMetricTypelib, metric_n->ptr(), + metric_n->length(), &my_charset_latin1)) { + case 1: + m_metric = EUCLIDEAN; + break; + case 2: + m_metric = EUCLIDEAN_SQUARED; + break; + case 3: + m_metric = COSINE; + break; + case 4: + m_metric = DOT_PRODUCT; + break; + case 5: + m_metric = MANHATTAN; + break; + default: + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return true; + } + + // Cosine can return NULL for zero-length vectors at runtime. Mark nullable + // unconditionally so that derived columns and metadata (SHOW CREATE TABLE) + // reflect the true nullability of the function regardless of whether the + // input arguments are themselves nullable. 
+ set_nullable(true); + + return false; +} + +double Item_func_vector_distance::val_real() { + assert(fixed); + null_value = false; + + String buff_a, buff_b; + String *a = args[0]->val_str(&buff_a); + if (a == nullptr || a->ptr() == nullptr) { + return error_real(); + } + + uint32 a_dims = get_dimensions(a->length(), Field_vector::precision); + if (a_dims == UINT32_MAX) { + my_error(ER_TO_VECTOR_CONVERSION, MYF(0), a->length(), a->ptr()); + return error_real(); + } + if (a_dims > Field_vector::max_dimensions) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return error_real(); + } + + String *b = args[1]->val_str(&buff_b); + if (b == nullptr || b->ptr() == nullptr) { + return error_real(); + } + + uint32 b_dims = get_dimensions(b->length(), Field_vector::precision); + if (b_dims == UINT32_MAX) { + my_error(ER_TO_VECTOR_CONVERSION, MYF(0), b->length(), b->ptr()); + return error_real(); + } + if (b_dims > Field_vector::max_dimensions) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return error_real(); + } + + if (a_dims != b_dims) { + my_error(ER_WRONG_ARGUMENTS, MYF(0), func_name()); + return error_real(); + } + + switch (m_metric) { + case EUCLIDEAN: + return check_float_overflow(std::sqrt( + vector_distance_euclidean_squared(a->ptr(), b->ptr(), a_dims))); + case EUCLIDEAN_SQUARED: + return check_float_overflow( + vector_distance_euclidean_squared(a->ptr(), b->ptr(), a_dims)); + case COSINE: { + const double dist = vector_distance_cosine(a->ptr(), b->ptr(), a_dims); + if (std::isinf(dist)) { + // +Inf sentinel from vector_distance_cosine: zero-vector(s) → undefined + // cosine → NULL + null_value = true; + return 0.0; + } + // NaN/Inf input elements propagated through → ER_DATA_OUT_OF_RANGE + return check_float_overflow(dist); + } + case DOT_PRODUCT: + return check_float_overflow( + -vector_distance_dot(a->ptr(), b->ptr(), a_dims)); + case MANHATTAN: + return check_float_overflow( + vector_distance_manhattan(a->ptr(), b->ptr(), a_dims)); + 
default: + assert(false); + return 0.0; + } +} + String *Item_func_uncompress::val_str(String *str) { assert(fixed); String *res = args[0]->val_str(str); diff --git a/sql/item_strfunc.h b/sql/item_strfunc.h index 14e6334d4554..941e3ce6512f 100644 --- a/sql/item_strfunc.h +++ b/sql/item_strfunc.h @@ -1310,6 +1310,45 @@ class Item_func_from_vector final : public Item_str_ascii_func { String *val_str_ascii(String *str) override; }; +class Item_func_vector_distance final : public Item_real_func { + enum metric_type { + EUCLIDEAN, + EUCLIDEAN_SQUARED, + COSINE, + DOT_PRODUCT, + MANHATTAN + }; + metric_type m_metric{EUCLIDEAN}; + + public: + Item_func_vector_distance(const POS &pos, Item *a, Item *b, Item *c) + : Item_real_func(pos, a, b, c) {} + bool do_itemize(Parse_context *pc, Item **res) override; + bool resolve_type(THD *thd) override; + /** + This function is non-deterministic: ANN vector index searches are + approximate (results may vary between executions), and SIMD/FMA + floating-point operations may produce slightly different values + depending on hardware and compiler optimization. 
+ + @returns RAND_TABLE_BIT + */ + table_map get_initial_pseudo_tables() const override { + return RAND_TABLE_BIT; + } + const char *func_name() const override { return "distance"; } + enum Functype functype() const override { return VECTOR_DISTANCE_FUNC; } + double val_real() override; + bool check_function_as_value_generator(uchar *checker_args) override { + Check_function_as_value_generator_parameters *func_arg = + pointer_cast( + checker_args); + func_arg->banned_function_name = func_name(); + return ((func_arg->source == VGS_GENERATED_COLUMN) || + (func_arg->source == VGS_CHECK_CONSTRAINT)); + } +}; + class Item_func_uncompress final : public Item_str_func { String buffer; diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 5ca85725e42d..2fdc058d846b 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -927,6 +927,7 @@ MySQL clients support the protocol: #include "thr_lock.h" #include "thr_mutex.h" #include "typelib.h" +#include "vector-common/vector_distance.h" // init_vector_distance_functions #include "violite.h" #ifdef WITH_PERFSCHEMA_STORAGE_ENGINE @@ -8427,6 +8428,7 @@ static int init_server_components() { We need to call each of these following functions to ensure that all things are initialized so that unireg_abort() doesn't fail */ + init_vector_distance_functions(); mdl_init(); partitioning_init(); if (table_def_init() || hostname_cache_init(host_cache_size)) @@ -8505,6 +8507,14 @@ static int init_server_components() { */ if (setup_error_log_components()) unireg_abort(MYSQLD_ABORT_EXIT); + if (!is_help_or_validate_option()) { + char vector_distance_msg[256]; + vector_distance_dispatch_description(vector_distance_msg, + sizeof(vector_distance_msg)); + LogErr(INFORMATION_LEVEL, ER_VECTOR_DISTANCE_SIMD_DISPATCH, + vector_distance_msg); + } + if (MDL_context_backup_manager::init()) { LogErr(ERROR_LEVEL, ER_OOM); unireg_abort(MYSQLD_ABORT_EXIT); diff --git a/unittest/gunit/CMakeLists.txt b/unittest/gunit/CMakeLists.txt index d86102bc19d4..426fbbbbbd3d 100644 
--- a/unittest/gunit/CMakeLists.txt +++ b/unittest/gunit/CMakeLists.txt @@ -182,6 +182,7 @@ SET(TESTS unhex utf8alias val_int_compare + vector_distance ) LIST(TRANSFORM TESTS APPEND "-t.cc" OUTPUT_VARIABLE ALL_SMALL_TESTS) @@ -365,6 +366,7 @@ DISABLE_MISSING_PROFILE_WARNING() LIST(TRANSFORM SERVER_TESTS APPEND "-t.cc" OUTPUT_VARIABLE ALL_LARGE_TESTS) SET(SQL_GUNIT_LIB_SOURCE + ${CMAKE_SOURCE_DIR}/vector-common/vector_distance.cc ${CMAKE_SOURCE_DIR}/sql/filesort_utils.cc ${CMAKE_SOURCE_DIR}/sql/mdl.cc ${CMAKE_SOURCE_DIR}/sql/sql_list.cc @@ -422,6 +424,16 @@ FOREACH(test ${TESTS}) ) ENDFOREACH() +# vector_distance per-tier benchmark — all SIMD tiers × 6 sizes × euclidean/cosine/dot_product. +# Unavailable tiers (wrong ISA or CPU) are skipped at runtime via GTEST_SKIP(). +MYSQL_ADD_EXECUTABLE(vector_distance_benchmark-t vector_distance_benchmark-t.cc + COMPILE_DEFINITIONS ${DISABLE_PSI_DEFINITIONS} + ENABLE_EXPORTS + EXCLUDE_FROM_ALL + LINK_LIBRARIES sqlgunitlib gunit_small extra::boost + SKIP_INSTALL +) + # Disable by default, since it dumps a stack trace. # We don't want ppl to think there was a segfault or something. # See also the mtr test main.print_stacktrace diff --git a/unittest/gunit/vector_distance-t.cc b/unittest/gunit/vector_distance-t.cc new file mode 100644 index 000000000000..0c81be1502ff --- /dev/null +++ b/unittest/gunit/vector_distance-t.cc @@ -0,0 +1,616 @@ +/* Copyright (c) 2025, Percona and/or its affiliates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "my_pointer_arithmetic.h" +#include "vector-common/vector_distance.h" + +namespace vector_distance_unittest { + +// --------------------------------------------------------------------------- +// Reference implementations — always scalar, double precision. +// Used to verify SIMD paths; independent of vector_distance.cc internals. +// --------------------------------------------------------------------------- + +static double ref_euclidean_squared(const float *a, const float *b, + uint32_t n) { + double sum = 0.0; + for (uint32_t i = 0; i < n; i++) { + const double d = a[i] - b[i]; + sum += d * d; + } + return sum; +} + +static double ref_euclidean(const float *a, const float *b, uint32_t n) { + return std::sqrt(ref_euclidean_squared(a, b, n)); +} + +// Smallest positive offset from base that is not alignof(float)-aligned. +// A fixed +1 byte offset is not reliable: when the buffer sits at address +// % alignof(float) == alignof(float) - 1 (seen on Apple Silicon stacks), +// base+1 is still float-aligned. +static size_t misaligned_float_offset(uintptr_t base) { + for (size_t offset = 1; offset < alignof(float); ++offset) { + if ((base + offset) % alignof(float) != 0) return offset; + } + assert(false); + return 1; +} + +// Mirrors Item_func_vector_distance EUCLIDEAN branch. 
+static double euclidean_l2(const char *a, const char *b, uint32_t dims) { + return std::sqrt(vector_distance_euclidean_squared(a, b, dims)); +} + +static double ref_dot_product(const float *a, const float *b, uint32_t n) { + double ab = 0.0; + for (uint32_t i = 0; i < n; i++) ab += (double)a[i] * b[i]; + return ab; +} + +static double ref_cosine(const float *a, const float *b, uint32_t n) { + double ab = 0.0, na = 0.0, nb = 0.0; + for (uint32_t i = 0; i < n; i++) { + ab += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + const double denom = std::sqrt(na * nb); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +static double ref_manhattan(const float *a, const float *b, uint32_t n) { + double result = 0.0; + for (uint32_t i = 0; i < n; i++) result += std::fabs((double)a[i] - b[i]); + return result; +} + +// --------------------------------------------------------------------------- +// Fixture — initialises dispatch pointers once per test suite +// --------------------------------------------------------------------------- + +class VectorDistanceTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { init_vector_distance_functions(); } +}; + +// --------------------------------------------------------------------------- +// Known-value correctness +// --------------------------------------------------------------------------- + +TEST_F(VectorDistanceTest, EuclideanSquaredKnownValues) { + // [0,0] → [3,4] = 25.0 (squared 3-4-5 right triangle) + alignas(32) float a[] = {0.0f, 0.0f}; + alignas(32) float b[] = {3.0f, 4.0f}; + EXPECT_NEAR( + vector_distance_euclidean_squared((const char *)a, (const char *)b, 2), + 25.0, 1e-6); + + // Identical vectors → distance 0 + alignas(32) float c[] = {1.0f, 2.0f, 3.0f}; + EXPECT_NEAR( + vector_distance_euclidean_squared((const char *)c, (const char *)c, 3), + 0.0, 1e-9); +} + +TEST_F(VectorDistanceTest, EuclideanKnownValues) { + // [0,0] → [3,4] = 5.0 
(3-4-5 right triangle) + alignas(32) float a[] = {0.0f, 0.0f}; + alignas(32) float b[] = {3.0f, 4.0f}; + EXPECT_NEAR(euclidean_l2((const char *)a, (const char *)b, 2), 5.0, 1e-6); + + // Identical vectors → distance 0 + alignas(32) float c[] = {1.0f, 2.0f, 3.0f}; + EXPECT_NEAR(euclidean_l2((const char *)c, (const char *)c, 3), 0.0, 1e-9); +} + +TEST_F(VectorDistanceTest, CosineKnownValues) { + // Identical unit vectors → distance 0 + alignas(32) float same[] = {1.0f, 0.0f, 0.0f}; + EXPECT_NEAR(vector_distance_cosine((const char *)same, (const char *)same, 3), + 0.0, 1e-6); + + // Orthogonal vectors → distance 1 + alignas(32) float x[] = {1.0f, 0.0f}; + alignas(32) float y[] = {0.0f, 1.0f}; + EXPECT_NEAR(vector_distance_cosine((const char *)x, (const char *)y, 2), 1.0, + 1e-6); + + // Anti-parallel → distance 2 + alignas(32) float pos[] = {1.0f, 1.0f}; + alignas(32) float neg[] = {-1.0f, -1.0f}; + EXPECT_NEAR(vector_distance_cosine((const char *)pos, (const char *)neg, 2), + 2.0, 1e-6); +} + +TEST_F(VectorDistanceTest, DotProductKnownValues) { + // Orthogonal vectors → dot product 0 + alignas(32) float x[] = {1.0f, 0.0f}; + alignas(32) float y[] = {0.0f, 1.0f}; + EXPECT_NEAR(vector_distance_dot((const char *)x, (const char *)y, 2), 0.0, + 1e-9); + + // Identical unit vector → dot product 1 + alignas(32) float u[] = {1.0f, 0.0f, 0.0f}; + EXPECT_NEAR(vector_distance_dot((const char *)u, (const char *)u, 3), 1.0, + 1e-9); + + // Known values: [1,2,3]·[4,5,6] = 4+10+18 = 32 + alignas(32) float a[] = {1.0f, 2.0f, 3.0f}; + alignas(32) float b[] = {4.0f, 5.0f, 6.0f}; + EXPECT_NEAR(vector_distance_dot((const char *)a, (const char *)b, 3), 32.0, + 1e-6); +} + +TEST_F(VectorDistanceTest, ManhattanKnownValues) { + // Identical vectors → distance 0 + alignas(32) float same[] = {1.0f, 2.0f, 3.0f}; + EXPECT_NEAR( + vector_distance_manhattan((const char *)same, (const char *)same, 3), 0.0, + 1e-9); + + // [0,0] → [3,4] = |3| + |4| = 7 (compare: Euclidean gives 5) + alignas(32) 
float a[] = {0.0f, 0.0f}; + alignas(32) float b[] = {3.0f, 4.0f}; + EXPECT_NEAR(vector_distance_manhattan((const char *)a, (const char *)b, 2), + 7.0, 1e-6); + + // [1,7,3,16,5] → [1,2,3,4,5] = 0+5+0+12+0 = 17 + alignas(32) float c[] = {1.0f, 7.0f, 3.0f, 16.0f, 5.0f}; + alignas(32) float d[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + EXPECT_NEAR(vector_distance_manhattan((const char *)c, (const char *)d, 5), + 17.0, 1e-6); +} + +// --------------------------------------------------------------------------- +// Zero-vector guard: cosine must return +Inf sentinel (val_real maps to NULL) +// --------------------------------------------------------------------------- + +TEST_F(VectorDistanceTest, CosineZeroVectorReturnsInf) { + alignas(32) float z[] = {0.0f, 0.0f}; + alignas(32) float a[] = {1.0f, 2.0f}; + EXPECT_TRUE( + std::isinf(vector_distance_cosine((const char *)z, (const char *)a, 2))); + EXPECT_TRUE( + std::isinf(vector_distance_cosine((const char *)a, (const char *)z, 2))); + EXPECT_TRUE( + std::isinf(vector_distance_cosine((const char *)z, (const char *)z, 2))); +} + +// --------------------------------------------------------------------------- +// Unaligned path: misaligned buffer must give the same result as aligned +// --------------------------------------------------------------------------- + +TEST_F(VectorDistanceTest, UnalignedMatchesAligned) { + constexpr uint32_t dims = 8; + alignas(32) float fa[dims] = {1, 2, 3, 4, 5, 6, 7, 8}; + alignas(32) float fb[dims] = {8, 7, 6, 5, 4, 3, 2, 1}; + + char buf_a[dims * sizeof(float) + alignof(float)]; + char buf_b[dims * sizeof(float) + alignof(float)]; + const size_t off_a = + misaligned_float_offset(reinterpret_cast(buf_a)); + const size_t off_b = + misaligned_float_offset(reinterpret_cast(buf_b)); + std::memcpy(buf_a + off_a, fa, dims * sizeof(float)); + std::memcpy(buf_b + off_b, fb, dims * sizeof(float)); + + const char *ma = buf_a + off_a; + const char *mb = buf_b + off_b; + ASSERT_FALSE(is_aligned_to(ma, 
alignof(float))); + ASSERT_FALSE(is_aligned_to(mb, alignof(float))); + + EXPECT_DOUBLE_EQ(vector_distance_euclidean_squared((const char *)fa, + (const char *)fb, dims), + vector_distance_euclidean_squared(ma, mb, dims)); + EXPECT_DOUBLE_EQ(euclidean_l2((const char *)fa, (const char *)fb, dims), + euclidean_l2(ma, mb, dims)); + EXPECT_DOUBLE_EQ( + vector_distance_cosine((const char *)fa, (const char *)fb, dims), + vector_distance_cosine(ma, mb, dims)); + EXPECT_DOUBLE_EQ( + vector_distance_dot((const char *)fa, (const char *)fb, dims), + vector_distance_dot(ma, mb, dims)); + EXPECT_DOUBLE_EQ( + vector_distance_manhattan((const char *)fa, (const char *)fb, dims), + vector_distance_manhattan(ma, mb, dims)); +} + +// --------------------------------------------------------------------------- +// SIMD parity: aligned result matches double-precision scalar reference. +// On CPUs without SIMD both sides run the same scalar code, so the test +// degenerates into an identity check — still a useful correctness signal. +// --------------------------------------------------------------------------- + +TEST_F(VectorDistanceTest, EuclideanSquaredParityWithReference) { + std::mt19937 rng(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 16u, 32u, 128u, 512u}) { + // Use heap vectors; malloc guarantees at least 16-byte alignment, + // which satisfies our alignof(float)=4 dispatch gate. + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_euclidean_squared( + (const char *)a.data(), (const char *)b.data(), dims); + const double ref = ref_euclidean_squared(a.data(), b.data(), dims); + // Allow 0.01% relative tolerance for float-precision SIMD accumulation. 
+ EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) << "dims=" << dims; + } +} + +TEST_F(VectorDistanceTest, EuclideanParityWithReference) { + std::mt19937 rng(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 16u, 32u, 128u, 512u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = + euclidean_l2((const char *)a.data(), (const char *)b.data(), dims); + const double ref = ref_euclidean(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) << "dims=" << dims; + } +} + +TEST_F(VectorDistanceTest, CosineParityWithReference) { + std::mt19937 rng(123); + std::uniform_real_distribution dist(-5.0f, 5.0f); + + for (uint32_t dims : {4u, 8u, 16u, 32u, 128u, 512u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_cosine((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_cosine(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, 1e-4) << "dims=" << dims; + } +} + +TEST_F(VectorDistanceTest, DotProductParityWithReference) { + std::mt19937 rng(77); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 16u, 32u, 128u, 512u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_dot((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_dot_product(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, std::abs(ref) * 1e-4 + 1e-9) << "dims=" << dims; + } +} + +TEST_F(VectorDistanceTest, ManhattanParityWithReference) { + std::mt19937 rng(55); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 16u, 32u, 128u, 512u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = 
vector_distance_manhattan((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_manhattan(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) << "dims=" << dims; + } +} + +// --------------------------------------------------------------------------- +// Per-tier parity tests +// +// A separate parameterized fixture calls init_vector_distance_functions_tier() +// for each registered tier, skipping tiers that are unavailable on this CPU or +// build. This ensures every SIMD kernel is tested for correctness independently +// — including inferior tiers on CPUs that support a higher one. +// +// The existing VectorDistanceTest suite is untouched; it still exercises the +// production path via init_vector_distance_functions() (highest tier on this +// CPU). +// --------------------------------------------------------------------------- + +static const char *tier_name(VectorDistanceTier tier) { + switch (tier) { + case VectorDistanceTier::Scalar: + return "Scalar"; + case VectorDistanceTier::Sse42: + return "Sse42"; + case VectorDistanceTier::Avx2: + return "Avx2"; + case VectorDistanceTier::Avx512f: + return "Avx512f"; + case VectorDistanceTier::Neon: + return "Neon"; + case VectorDistanceTier::Sve2: + return "Sve2"; + } + return "Unknown"; +} + +class VectorDistanceTierParityTest + : public ::testing::TestWithParam { + protected: + void SetUp() override { + const VectorDistanceTier t = GetParam(); + if (!vector_distance_tier_available(t)) + GTEST_SKIP() << tier_name(t) << " not available on this CPU/build"; + init_vector_distance_functions_tier(t); +#if defined(__x86_64__) || defined(_M_X64) + if (t == VectorDistanceTier::Avx2 || t == VectorDistanceTier::Avx512f) { + EXPECT_EQ(vector_distance_wide_tier(), t); + EXPECT_EQ(vector_distance_narrow_tier(), + vector_distance_tier_available(VectorDistanceTier::Sse42) + ? 
VectorDistanceTier::Sse42 + : VectorDistanceTier::Scalar); + } +#endif + } + void TearDown() override { init_vector_distance_functions(); } +}; + +TEST_P(VectorDistanceTierParityTest, EuclideanSquaredParityPerTier) { + std::mt19937 rng(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 32u, 128u, 1024u, 16383u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_euclidean_squared( + (const char *)a.data(), (const char *)b.data(), dims); + const double ref = ref_euclidean_squared(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) + << "tier=" << tier_name(GetParam()) << " dims=" << dims; + } +} + +TEST_P(VectorDistanceTierParityTest, EuclideanParityPerTier) { + std::mt19937 rng(42); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 32u, 128u, 1024u, 16383u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = + euclidean_l2((const char *)a.data(), (const char *)b.data(), dims); + const double ref = ref_euclidean(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) + << "tier=" << tier_name(GetParam()) << " dims=" << dims; + } +} + +TEST_P(VectorDistanceTierParityTest, CosineParityPerTier) { + std::mt19937 rng(123); + std::uniform_real_distribution dist(-5.0f, 5.0f); + + for (uint32_t dims : {4u, 8u, 32u, 128u, 1024u, 16383u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_cosine((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_cosine(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, 1e-4) + << "tier=" << tier_name(GetParam()) << " dims=" << dims; + } +} + +TEST_P(VectorDistanceTierParityTest, DotProductParityPerTier) { + std::mt19937 rng(77); + 
std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 32u, 128u, 1024u, 16383u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_dot((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_dot_product(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, std::abs(ref) * 1e-4 + 1e-9) + << "tier=" << tier_name(GetParam()) << " dims=" << dims; + } +} + +TEST_P(VectorDistanceTierParityTest, ManhattanParityPerTier) { + std::mt19937 rng(55); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (uint32_t dims : {4u, 8u, 32u, 128u, 1024u, 16383u}) { + std::vector a(dims), b(dims); + for (auto &x : a) x = dist(rng); + for (auto &x : b) x = dist(rng); + + const double got = vector_distance_manhattan((const char *)a.data(), + (const char *)b.data(), dims); + const double ref = ref_manhattan(a.data(), b.data(), dims); + EXPECT_NEAR(got, ref, ref * 1e-4 + 1e-9) + << "tier=" << tier_name(GetParam()) << " dims=" << dims; + } +} + +TEST_P(VectorDistanceTierParityTest, OverflowFallbackPerTier) { + // A 16-dim vector (>= 16 triggers the wide kernel) with one element = 2e38. + // (2e38)^2 ~ 4e76 overflows float32 (FLT_MAX ~ 3.4e38); the SIMD accumulator + // becomes +Inf without the fallback. The scalar path uses double throughout + // and returns a finite result. Verify the fix: result must be finite and + // equal to the scalar reference. + constexpr uint32_t dims = 16; + std::vector a(dims, 0.0f), b(dims, 0.0f); + a[0] = 2e38f; + + // Euclidean squared: scalar = (2e38)^2; broken SIMD would give +Inf (-> SQL + // NULL). 
+ const double got_e = vector_distance_euclidean_squared( + (const char *)a.data(), (const char *)b.data(), dims); + EXPECT_TRUE(std::isfinite(got_e)) << "tier=" << tier_name(GetParam()); + EXPECT_EQ(got_e, ref_euclidean_squared(a.data(), b.data(), dims)) + << "tier=" << tier_name(GetParam()); + + // Euclidean L2: scalar = 2e38; mirrors SQL EUCLIDEAN (sqrt of squared). + const double got_l2 = + euclidean_l2((const char *)a.data(), (const char *)b.data(), dims); + EXPECT_TRUE(std::isfinite(got_l2)) << "tier=" << tier_name(GetParam()); + EXPECT_EQ(got_l2, ref_euclidean(a.data(), b.data(), dims)) + << "tier=" << tier_name(GetParam()); + + // Manhattan: scalar = 2e38. + const double got_m = vector_distance_manhattan((const char *)a.data(), + (const char *)b.data(), dims); + EXPECT_TRUE(std::isfinite(got_m)) << "tier=" << tier_name(GetParam()); + EXPECT_EQ(got_m, ref_manhattan(a.data(), b.data(), dims)) + << "tier=" << tier_name(GetParam()); + + // Dot: a[0]*b2[0] = 2e38*2e38 overflows float32; scalar = 4e76 (finite in + // double). + std::vector b2(dims, 0.0f); + b2[0] = 2e38f; + const double got_d = vector_distance_dot((const char *)a.data(), + (const char *)b2.data(), dims); + EXPECT_TRUE(std::isfinite(got_d)) << "tier=" << tier_name(GetParam()); + EXPECT_EQ(got_d, ref_dot_product(a.data(), b2.data(), dims)) + << "tier=" << tier_name(GetParam()); + + // Cosine: a == a (same pointer) => cosine distance = 0. + // Float32 norm overflow to Inf causes 1 - Inf/Inf = NaN without the fix. 
+ const double got_c = vector_distance_cosine((const char *)a.data(), + (const char *)a.data(), dims); + EXPECT_NEAR(got_c, 0.0, 1e-9) << "tier=" << tier_name(GetParam()); +} + +static std::string tier_param_name( + const ::testing::TestParamInfo &info) { + return tier_name(info.param); +} + +#if defined(__x86_64__) || defined(_M_X64) +INSTANTIATE_TEST_SUITE_P(AllTiers, VectorDistanceTierParityTest, + ::testing::Values(VectorDistanceTier::Scalar, + VectorDistanceTier::Sse42, + VectorDistanceTier::Avx2, + VectorDistanceTier::Avx512f), + tier_param_name); +#elif defined(__aarch64__) || defined(_M_ARM64) +INSTANTIATE_TEST_SUITE_P(AllTiers, VectorDistanceTierParityTest, + ::testing::Values(VectorDistanceTier::Scalar, + VectorDistanceTier::Neon, + VectorDistanceTier::Sve2), + tier_param_name); +#else +INSTANTIATE_TEST_SUITE_P(AllTiers, VectorDistanceTierParityTest, + ::testing::Values(VectorDistanceTier::Scalar), + tier_param_name); +#endif + +// --------------------------------------------------------------------------- +// Dispatch description / tier reporting +// --------------------------------------------------------------------------- + +class VectorDistanceDispatchTest : public ::testing::Test { + protected: + void TearDown() override { init_vector_distance_functions(); } +}; + +TEST_F(VectorDistanceDispatchTest, ProductionInitIsIdempotent) { + init_vector_distance_functions(); + const auto wide = vector_distance_wide_tier(); + const auto narrow = vector_distance_narrow_tier(); + init_vector_distance_functions(); + EXPECT_EQ(vector_distance_wide_tier(), wide); + EXPECT_EQ(vector_distance_narrow_tier(), narrow); +} + +TEST_F(VectorDistanceDispatchTest, ProductionInitRestoredAfterScalarOverride) { + init_vector_distance_functions(); + const VectorDistanceTier prod_wide = vector_distance_wide_tier(); + const VectorDistanceTier prod_narrow = vector_distance_narrow_tier(); + + init_vector_distance_functions_tier(VectorDistanceTier::Scalar); + 
EXPECT_EQ(vector_distance_wide_tier(), VectorDistanceTier::Scalar); + EXPECT_EQ(vector_distance_narrow_tier(), VectorDistanceTier::Scalar); + + init_vector_distance_functions(); + EXPECT_EQ(vector_distance_wide_tier(), prod_wide); + EXPECT_EQ(vector_distance_narrow_tier(), prod_narrow); +} + +TEST_F(VectorDistanceDispatchTest, DescriptionContainsUsingPrefix) { + init_vector_distance_functions(); + char msg[256]; + const size_t len = vector_distance_dispatch_description(msg, sizeof(msg)); + EXPECT_GT(len, 0u); + EXPECT_EQ(strncmp(msg, "Using", 5), 0); + EXPECT_NE(std::string(msg).find("DISTANCE()"), std::string::npos); +} + +TEST_F(VectorDistanceDispatchTest, ForcedScalarTierUpdatesDescription) { + init_vector_distance_functions_tier(VectorDistanceTier::Scalar); + EXPECT_EQ(vector_distance_wide_tier(), VectorDistanceTier::Scalar); + EXPECT_EQ(vector_distance_narrow_tier(), VectorDistanceTier::Scalar); + + char msg[256]; + vector_distance_dispatch_description(msg, sizeof(msg)); + EXPECT_NE(std::string(msg).find("software scalar"), std::string::npos); +} + +#if defined(__x86_64__) || defined(_M_X64) +TEST_F(VectorDistanceDispatchTest, + SplitDispatchMentionsDimensionsWhenWideDiffers) { + if (!vector_distance_tier_available(VectorDistanceTier::Avx2) || + !vector_distance_tier_available(VectorDistanceTier::Sse42)) { + GTEST_SKIP() << "requires AVX2 wide path and SSE4.2 narrow path"; + } + + init_vector_distance_functions(); + ASSERT_NE(vector_distance_wide_tier(), vector_distance_narrow_tier()); + + char msg[256]; + vector_distance_dispatch_description(msg, sizeof(msg)); + const std::string ge = + "dimensions >= " + std::to_string(VECTOR_DISTANCE_WIDE_MIN_DIMS); + const std::string lt = + "dimensions < " + std::to_string(VECTOR_DISTANCE_WIDE_MIN_DIMS); + EXPECT_NE(std::string(msg).find(ge), std::string::npos); + EXPECT_NE(std::string(msg).find(lt), std::string::npos); + EXPECT_NE(std::string(msg).find("SSE4.2"), std::string::npos); +} +#endif + +} // namespace 
vector_distance_unittest diff --git a/unittest/gunit/vector_distance_benchmark-t.cc b/unittest/gunit/vector_distance_benchmark-t.cc new file mode 100644 index 000000000000..c6c4c4cc3111 --- /dev/null +++ b/unittest/gunit/vector_distance_benchmark-t.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2025, Percona and/or its affiliates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/** + @file vector_distance_benchmark-t.cc + + Per-tier microbenchmarks for vector_distance_euclidean_squared(), + vector_distance_cosine(), vector_distance_dot(), and + vector_distance_manhattan(). + + Euclidean (L2) benchmarks apply std::sqrt() on top of the squared kernel, + mirroring the SQL EUCLIDEAN path in Item_func_vector_distance::val_real. + EuclideanSquared benchmarks measure the kernel alone (SQL EUCLIDEAN_SQUARED). + + Every combination of (metric × tier × size) is registered as a separate + Google Test case via the BENCHMARK() macro. At the start of each case, + vector_distance_tier_available() is checked and GTEST_SKIP() is called when + the tier is not supported by the current CPU or build. This means: + + - On a Scalar-only x86 host: only Scalar tests execute; Sse42/Avx2/Avx512f + are skipped. + - On an AVX-512 capable host: all four x86 tiers run, so inferior-tier + throughput is also measured. + - On aarch64 without a SVE2 build: Neon runs, Sve2 is skipped. 
+ + Sizes benchmarked: 4, 8, 32, 128, 1024, 16383 (float32 elements). + Metrics: Euclidean, EuclideanSquared, Cosine, DotProduct, Manhattan. + + Tiers registered per platform: + x86_64 — Scalar, Sse42, Avx2, Avx512f + aarch64 — Scalar, Neon, Sve2 + other — Scalar only +*/ + +#include + +#include +#include +#include +#include + +#include "unittest/gunit/benchmark.h" +#include "vector-common/vector_distance.h" + +namespace vector_distance_tier_bench { + +// Volatile sink prevents the optimizer from discarding computed distances. +static volatile double bench_sink; + +enum class Metric { + Euclidean, + EuclideanSquared, + Cosine, + DotProduct, + Manhattan +}; + +static void fill_random(float *data, uint32_t n, uint32_t seed) { + std::mt19937 rng(seed); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (uint32_t i = 0; i < n; i++) data[i] = dist(rng); +} + +// --------------------------------------------------------------------------- +// Generic benchmark body +// --------------------------------------------------------------------------- + +template +static void bench_impl(size_t num_iterations) { +#ifndef NDEBUG + // benchmark.cc calls StartBenchmarkTiming() before invoking func(), which + // conflicts with our inner StartBenchmarkTiming() and triggers the + // assert(!timer_running) in debug builds. Timings are meaningless in debug + // mode regardless; skip cleanly instead. 
+ GTEST_SKIP() << "Benchmarks skipped in debug builds " + "(build with -DWITH_DEBUG=OFF for meaningful results)"; +#endif + if (!vector_distance_tier_available(kTier)) { + const char *names[] = {"Scalar", "Sse42", "Avx2", + "Avx512f", "Neon", "Sve2"}; + const int idx = static_cast(kTier); + GTEST_SKIP() << names[idx] << " not available on this CPU/build"; + } + + init_vector_distance_functions_tier(kTier); + + std::vector a(kDims), b(kDims); + fill_random(a.data(), kDims, 1); + fill_random(b.data(), kDims, 2); + + StartBenchmarkTiming(); + for (size_t i = 0; i < num_iterations; i++) { + if constexpr (kMetric == Metric::Cosine) + bench_sink = vector_distance_cosine((const char *)a.data(), + (const char *)b.data(), kDims); + else if constexpr (kMetric == Metric::DotProduct) + bench_sink = vector_distance_dot((const char *)a.data(), + (const char *)b.data(), kDims); + else if constexpr (kMetric == Metric::Manhattan) + bench_sink = vector_distance_manhattan((const char *)a.data(), + (const char *)b.data(), kDims); + else if constexpr (kMetric == Metric::Euclidean) + bench_sink = std::sqrt(vector_distance_euclidean_squared( + (const char *)a.data(), (const char *)b.data(), kDims)); + else + bench_sink = vector_distance_euclidean_squared( + (const char *)a.data(), (const char *)b.data(), kDims); + } + StopBenchmarkTiming(); + SetBytesProcessed(num_iterations * kDims * sizeof(float) * 2); +} + +// --------------------------------------------------------------------------- +// Macro machinery +// --------------------------------------------------------------------------- + +// Expands M(size) for each of the six benchmark sizes. +#define FOR_EACH_SIZE(M) M(4) M(8) M(32) M(128) M(1024) M(16383) + +// Registers one Euclidean (L2 with sqrt) benchmark for a given tier + size. 
+#define BENCH_EUCLIDEAN_ONE(tier_enum, tier_label, dims) \ + static void BenchEuclidean_##tier_label##_##dims(size_t n) { \ + bench_impl(n); \ + } \ + BENCHMARK(BenchEuclidean_##tier_label##_##dims) + +// Registers one EuclideanSquared benchmark for a given tier + size. +#define BENCH_EUCLIDEAN_SQUARED_ONE(tier_enum, tier_label, dims) \ + static void BenchEuclideanSquared_##tier_label##_##dims(size_t n) { \ + bench_impl( \ + n); \ + } \ + BENCHMARK(BenchEuclideanSquared_##tier_label##_##dims) + +// Registers one Cosine benchmark function for a given tier + size. +#define BENCH_COSINE_ONE(tier_enum, tier_label, dims) \ + static void BenchCosine_##tier_label##_##dims(size_t n) { \ + bench_impl(n); \ + } \ + BENCHMARK(BenchCosine_##tier_label##_##dims) + +// Per-size expanders (one per size, named so the tier macro can paste them). +#define BENCH_EUCLIDEAN_Scalar(dims) BENCH_EUCLIDEAN_ONE(Scalar, Scalar, dims) +#define BENCH_EUCLIDEAN_Sse42(dims) BENCH_EUCLIDEAN_ONE(Sse42, Sse42, dims) +#define BENCH_EUCLIDEAN_Avx2(dims) BENCH_EUCLIDEAN_ONE(Avx2, Avx2, dims) +#define BENCH_EUCLIDEAN_Avx512f(dims) \ + BENCH_EUCLIDEAN_ONE(Avx512f, Avx512f, dims) +#define BENCH_EUCLIDEAN_Neon(dims) BENCH_EUCLIDEAN_ONE(Neon, Neon, dims) +#define BENCH_EUCLIDEAN_Sve2(dims) BENCH_EUCLIDEAN_ONE(Sve2, Sve2, dims) + +#define BENCH_EUCLIDEAN_SQUARED_Scalar(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Scalar, Scalar, dims) +#define BENCH_EUCLIDEAN_SQUARED_Sse42(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Sse42, Sse42, dims) +#define BENCH_EUCLIDEAN_SQUARED_Avx2(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Avx2, Avx2, dims) +#define BENCH_EUCLIDEAN_SQUARED_Avx512f(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Avx512f, Avx512f, dims) +#define BENCH_EUCLIDEAN_SQUARED_Neon(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Neon, Neon, dims) +#define BENCH_EUCLIDEAN_SQUARED_Sve2(dims) \ + BENCH_EUCLIDEAN_SQUARED_ONE(Sve2, Sve2, dims) + +#define BENCH_COSINE_Scalar(dims) BENCH_COSINE_ONE(Scalar, Scalar, dims) +#define 
BENCH_COSINE_Sse42(dims) BENCH_COSINE_ONE(Sse42, Sse42, dims) +#define BENCH_COSINE_Avx2(dims) BENCH_COSINE_ONE(Avx2, Avx2, dims) +#define BENCH_COSINE_Avx512f(dims) BENCH_COSINE_ONE(Avx512f, Avx512f, dims) +#define BENCH_COSINE_Neon(dims) BENCH_COSINE_ONE(Neon, Neon, dims) +#define BENCH_COSINE_Sve2(dims) BENCH_COSINE_ONE(Sve2, Sve2, dims) + +// Registers one DotProduct benchmark function for a given tier + size. +#define BENCH_DOT_PRODUCT_ONE(tier_enum, tier_label, dims) \ + static void BenchDotProduct_##tier_label##_##dims(size_t n) { \ + bench_impl(n); \ + } \ + BENCHMARK(BenchDotProduct_##tier_label##_##dims) + +#define BENCH_DOT_PRODUCT_Scalar(dims) \ + BENCH_DOT_PRODUCT_ONE(Scalar, Scalar, dims) +#define BENCH_DOT_PRODUCT_Sse42(dims) BENCH_DOT_PRODUCT_ONE(Sse42, Sse42, dims) +#define BENCH_DOT_PRODUCT_Avx2(dims) BENCH_DOT_PRODUCT_ONE(Avx2, Avx2, dims) +#define BENCH_DOT_PRODUCT_Avx512f(dims) \ + BENCH_DOT_PRODUCT_ONE(Avx512f, Avx512f, dims) +#define BENCH_DOT_PRODUCT_Neon(dims) BENCH_DOT_PRODUCT_ONE(Neon, Neon, dims) +#define BENCH_DOT_PRODUCT_Sve2(dims) BENCH_DOT_PRODUCT_ONE(Sve2, Sve2, dims) + +// Registers one Manhattan benchmark function for a given tier + size. 
+#define BENCH_MANHATTAN_ONE(tier_enum, tier_label, dims) \ + static void BenchManhattan_##tier_label##_##dims(size_t n) { \ + bench_impl(n); \ + } \ + BENCHMARK(BenchManhattan_##tier_label##_##dims) + +#define BENCH_MANHATTAN_Scalar(dims) BENCH_MANHATTAN_ONE(Scalar, Scalar, dims) +#define BENCH_MANHATTAN_Sse42(dims) BENCH_MANHATTAN_ONE(Sse42, Sse42, dims) +#define BENCH_MANHATTAN_Avx2(dims) BENCH_MANHATTAN_ONE(Avx2, Avx2, dims) +#define BENCH_MANHATTAN_Avx512f(dims) \ + BENCH_MANHATTAN_ONE(Avx512f, Avx512f, dims) +#define BENCH_MANHATTAN_Neon(dims) BENCH_MANHATTAN_ONE(Neon, Neon, dims) +#define BENCH_MANHATTAN_Sve2(dims) BENCH_MANHATTAN_ONE(Sve2, Sve2, dims) + +// --------------------------------------------------------------------------- +// Tier registrations — all tiers are always declared; unavailable ones skip. +// --------------------------------------------------------------------------- + +// Tier 0 — Scalar (all platforms) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Scalar) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Scalar) +FOR_EACH_SIZE(BENCH_COSINE_Scalar) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Scalar) +FOR_EACH_SIZE(BENCH_MANHATTAN_Scalar) + +// x86_64 tiers +#if defined(__x86_64__) || defined(_M_X64) + +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Sse42) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Sse42) +FOR_EACH_SIZE(BENCH_COSINE_Sse42) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Sse42) +FOR_EACH_SIZE(BENCH_MANHATTAN_Sse42) + +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Avx2) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Avx2) +FOR_EACH_SIZE(BENCH_COSINE_Avx2) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Avx2) +FOR_EACH_SIZE(BENCH_MANHATTAN_Avx2) + +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Avx512f) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Avx512f) +FOR_EACH_SIZE(BENCH_COSINE_Avx512f) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Avx512f) +FOR_EACH_SIZE(BENCH_MANHATTAN_Avx512f) + +#endif // x86_64 + +// aarch64 tiers +#if defined(__aarch64__) || defined(_M_ARM64) + +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Neon) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Neon) 
+FOR_EACH_SIZE(BENCH_COSINE_Neon) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Neon) +FOR_EACH_SIZE(BENCH_MANHATTAN_Neon) + +FOR_EACH_SIZE(BENCH_EUCLIDEAN_Sve2) +FOR_EACH_SIZE(BENCH_EUCLIDEAN_SQUARED_Sve2) +FOR_EACH_SIZE(BENCH_COSINE_Sve2) +FOR_EACH_SIZE(BENCH_DOT_PRODUCT_Sve2) +FOR_EACH_SIZE(BENCH_MANHATTAN_Sve2) + +#endif // aarch64 + +// --------------------------------------------------------------------------- +// Cleanup macros +// --------------------------------------------------------------------------- + +#undef BENCH_MANHATTAN_Sve2 +#undef BENCH_MANHATTAN_Neon +#undef BENCH_MANHATTAN_Avx512f +#undef BENCH_MANHATTAN_Avx2 +#undef BENCH_MANHATTAN_Sse42 +#undef BENCH_MANHATTAN_Scalar +#undef BENCH_MANHATTAN_ONE +#undef BENCH_DOT_PRODUCT_Sve2 +#undef BENCH_DOT_PRODUCT_Neon +#undef BENCH_DOT_PRODUCT_Avx512f +#undef BENCH_DOT_PRODUCT_Avx2 +#undef BENCH_DOT_PRODUCT_Sse42 +#undef BENCH_DOT_PRODUCT_Scalar +#undef BENCH_DOT_PRODUCT_ONE +#undef BENCH_COSINE_Sve2 +#undef BENCH_COSINE_Neon +#undef BENCH_COSINE_Avx512f +#undef BENCH_COSINE_Avx2 +#undef BENCH_COSINE_Sse42 +#undef BENCH_COSINE_Scalar +#undef BENCH_EUCLIDEAN_SQUARED_Sve2 +#undef BENCH_EUCLIDEAN_SQUARED_Neon +#undef BENCH_EUCLIDEAN_SQUARED_Avx512f +#undef BENCH_EUCLIDEAN_SQUARED_Avx2 +#undef BENCH_EUCLIDEAN_SQUARED_Sse42 +#undef BENCH_EUCLIDEAN_SQUARED_Scalar +#undef BENCH_EUCLIDEAN_SQUARED_ONE +#undef BENCH_EUCLIDEAN_Sve2 +#undef BENCH_EUCLIDEAN_Neon +#undef BENCH_EUCLIDEAN_Avx512f +#undef BENCH_EUCLIDEAN_Avx2 +#undef BENCH_EUCLIDEAN_Sse42 +#undef BENCH_EUCLIDEAN_Scalar +#undef BENCH_COSINE_ONE +#undef BENCH_EUCLIDEAN_ONE +#undef FOR_EACH_SIZE + +} // namespace vector_distance_tier_bench diff --git a/vector-common/vector_distance.cc b/vector-common/vector_distance.cc new file mode 100644 index 000000000000..f5f9c78f8bad --- /dev/null +++ b/vector-common/vector_distance.cc @@ -0,0 +1,1120 @@ +/* Copyright (c) 2025, Percona and/or its affiliates. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/** + @file vector-common/vector_distance.cc + + Euclidean, cosine and dot-product distance for VECTOR columns (float32). + + Public entry points — vector_distance_euclidean_squared(), + vector_distance_cosine(), vector_distance_dot(), vector_distance_manhattan() + — are declared in + vector-common/vector_distance.h. Call init_vector_distance_functions() once + before first use so the dispatch pointers are set to the best kernel for the + host CPU. + + All kernels accept const char * and any byte alignment. SIMD paths use + unaligned load intrinsics (_mm_loadu_ps, _mm256_loadu_ps, _mm512_loadu_ps, + vld1q_f32, svld1_f32) rather than aligned variants: VECTOR payloads may + come from columns or misaligned SUBSTR blobs, and on modern x86/ARM CPUs + unaligned loads have the same throughput as aligned loads when the data + happens to be aligned. Aligned intrinsics would fault on misaligned + addresses without improving performance. The one remaining slowdown is + cache-line crossing: if a load spans a 64-byte cache-line boundary the + CPU must fetch from two lines, which costs extra regardless of whether + the instruction is MOVUPS or MOVAPS — that penalty depends on runtime + address, not on choosing loadu vs load. Scalar tails use memcpy to + avoid UB on misaligned float *. 
+ + Dim-aware dispatch (Opt 2) + -------------------------- + Wide-tier kernels (AVX2, AVX-512, SVE2) have a minimum useful dimension: the + SIMD body only fires when dims ≥ register_width (8 for AVX2, 16 for AVX-512). + Below that threshold, calling a wide-tier kernel pays register-init overhead + with no SIMD benefit. + + To avoid this regression, the public wrappers use two function pointer sets: + g_* — the widest available tier; used for dims ≥ + VECTOR_DISTANCE_WIDE_MIN_DIMS g_*_narrow — SSE4.2 / NEON (fills at dim ≥ 4); + used for dims < VECTOR_DISTANCE_WIDE_MIN_DIMS + + SIMD tier model + --------------- + Kernels are grouped into four tiers. init_vector_distance_functions() selects + the highest tier the CPU and OS support at runtime. Each SIMD function is + compiled with its own GCC/Clang target attribute so this translation unit + stays at the baseline ISA; no global -mavx2 / -march=native is required. + + +------+----------+--------------------------------+-------------------+ + | Tier | Name | Optimization target | Register width | + +------+----------+--------------------------------+-------------------+ + | 0 | Scalar | Any x86_64 / ARM64 (fallback) | 32/64-bit | + | 1 | Legacy | SSE4.2 (Intel/AMD) / NEON (ARM)| 128-bit | + | 2 | Standard | AVX2 + FMA (Intel/AMD) | 256-bit | + | 3 | Ultra | AVX-512 (Intel/AMD) / SVE2(ARM)| 512-bit+ (VLA) | + +------+----------+--------------------------------+-------------------+ + + Tier 0 — Scalar (all architectures) + euclidean_scalar, cosine_scalar, dot_product_scalar, manhattan_scalar + Default function pointers; also forced by + init_vector_distance_functions_tier(VectorDistanceTier::Scalar). + + Tier 1 — Legacy + x86_64: euclidean_sse, cosine_sse, dot_product_sse, manhattan_sse + (target("sse4.2"), 4 floats/iter) cpu_has_sse42() + aarch64: euclidean_neon, cosine_neon, dot_product_neon, manhattan_neon + (target("+simd"), 4 floats/iter) Always enabled on ARMv8. 
+ + Tier 2 — Standard (x86_64 only) + euclidean_avx2, cosine_avx2, dot_product_avx2, manhattan_avx2 + (target("avx2"), 8 floats/iter) cpu_has_avx2_fma() + + Tier 3 — Ultra + x86_64: euclidean_avx512, cosine_avx512, dot_product_avx512, + manhattan_avx512 (target("avx512f"), 16 floats/iter) cpu_has_avx512f() + aarch64: euclidean_sve2, cosine_sve2, dot_product_sve2, manhattan_sve2 + (target("+sve2"), scalable VLA) + + Runtime dispatch (init_vector_distance_functions) + ------------------------------------ + x86_64: AVX-512 -> AVX2+FMA -> SSE4.2 -> scalar (wide) + SSE4.2 -> scalar (narrow, dims < VECTOR_DISTANCE_WIDE_MIN_DIMS) + aarch64: NEON or SVE2 (wide); NEON (narrow) + other: scalar only (no-op init) + _WIN32 (x64 and ARM64): scalar only — SIMD tiers compiled out at build time. + + Euclidean kernels (euclidean_*) return sum((a[i]-b[i])²) without sqrt. + SQL applies std::sqrt for the EUCLIDEAN metric; EUCLIDEAN_SQUARED uses the + kernel result directly. + + The cosine kernels (cosine_*) return +Inf as a sentinel when either vector + has zero norm (zero denominator); the SQL layer + (Item_func_vector_distance::val_real) is expected to map that sentinel to NULL. +*/ + +#include "vector-common/vector_distance.h" + +#include +#include +#include +#include +#include + +#include "mysql/attribute.h" // MY_ATTRIBUTE + +// Platform guards — mirror ut0crc32.h:53-69 +// +// On _WIN32 (x64 and ARM64) SIMD tiers are not wired up; scalar-only, +// same pragmatic approach as CRC32_DEFAULT in ut0crc32.h. +#if !defined(_WIN32) +#if defined(__x86_64__) || defined(_M_X64) +#define VECTOR_DISTANCE_x86_64 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define VECTOR_DISTANCE_AARCH64 +#endif +#endif + +#if !defined(VECTOR_DISTANCE_x86_64) && !defined(VECTOR_DISTANCE_AARCH64) +#define VECTOR_DISTANCE_DEFAULT +#endif + +// SVE2 is opt-in: it requires the toolchain to compile the unit with +// __ARM_FEATURE_SVE2 (e.g. -march=armv8-a+sve2 or armv9-a). Without that we +// keep NEON-only behaviour and avoid pulling in <arm_sve.h>.
#if defined(VECTOR_DISTANCE_AARCH64) && defined(__ARM_FEATURE_SVE2)
#define VECTOR_DISTANCE_HAS_SVE2
#endif

// ---------------------------------------------------------------------------
// Scalar kernels — always compiled, safe on every architecture.
// Take const char * so they can handle any byte alignment via memcpy.
// Every element is widened to double *before* any arithmetic. Inputs are
// float32, and a float32 difference or product can overflow where the double
// one does not (e.g. 3e38f - (-3e38f) is +Inf in float, finite in double).
// ---------------------------------------------------------------------------

/**
  Squared Euclidean (L2) distance: sum((a[i] - b[i])^2), without sqrt.

  @param a_raw  first vector: dims packed float32 values, any byte alignment
  @param b_raw  second vector, same layout
  @param dims   number of float32 elements in each vector
  @return the sum accumulated in double; 0.0 when dims is 0
*/
static double euclidean_scalar(const char *a_raw, const char *b_raw,
                               uint32_t dims) {
  double result = 0.0;
  for (uint32_t i = 0; i < dims; i++) {
    float av, bv;
    memcpy(&av, a_raw + i * sizeof(float), sizeof(float));
    memcpy(&bv, b_raw + i * sizeof(float), sizeof(float));
    // Widen before subtracting: `av - bv` evaluated in float32 overflows to
    // +/-Inf for large operands of opposite sign, poisoning the whole sum.
    const double d = static_cast<double>(av) - bv;
    result += d * d;
  }
  return result;
}

/**
  Cosine distance: 1 - (a . b) / sqrt(|a|^2 * |b|^2).

  @param a_raw  first vector: dims packed float32 values, any byte alignment
  @param b_raw  second vector, same layout
  @param dims   number of float32 elements in each vector
  @return the cosine distance, or +Inf when the denominator is zero (at least
          one zero-norm vector, or dims == 0)
*/
static double cosine_scalar(const char *a_raw, const char *b_raw,
                            uint32_t dims) {
  double ab = 0.0, norm_a = 0.0, norm_b = 0.0;
  for (uint32_t i = 0; i < dims; i++) {
    float av, bv;
    memcpy(&av, a_raw + i * sizeof(float), sizeof(float));
    memcpy(&bv, b_raw + i * sizeof(float), sizeof(float));
    ab += static_cast<double>(av) * bv;
    norm_a += static_cast<double>(av) * av;
    norm_b += static_cast<double>(bv) * bv;
  }
  const double denom = sqrt(norm_a * norm_b);
  // +Inf sentinel: zero-denom means undefined cosine (zero-vector input).
  if (denom == 0.0) return std::numeric_limits<double>::infinity();
  return 1.0 - ab / denom;
}

/**
  Dot product: sum(a[i] * b[i]).

  @param a_raw  first vector: dims packed float32 values, any byte alignment
  @param b_raw  second vector, same layout
  @param dims   number of float32 elements in each vector
  @return the sum accumulated in double; 0.0 when dims is 0
*/
static double dot_product_scalar(const char *a_raw, const char *b_raw,
                                 uint32_t dims) {
  double ab = 0.0;
  for (uint32_t i = 0; i < dims; i++) {
    float av, bv;
    memcpy(&av, a_raw + i * sizeof(float), sizeof(float));
    memcpy(&bv, b_raw + i * sizeof(float), sizeof(float));
    ab += static_cast<double>(av) * bv;
  }
  return ab;
}

/**
  Manhattan (L1) distance: sum(|a[i] - b[i]|).

  @param a_raw  first vector: dims packed float32 values, any byte alignment
  @param b_raw  second vector, same layout
  @param dims   number of float32 elements in each vector
  @return the sum accumulated in double; 0.0 when dims is 0
*/
static double manhattan_scalar(const char *a_raw, const char *b_raw,
                               uint32_t dims) {
  double result = 0.0;
  for (uint32_t i = 0; i < dims; i++) {
    float av, bv;
    memcpy(&av, a_raw + i * sizeof(float), sizeof(float));
    memcpy(&bv, b_raw + i * sizeof(float), sizeof(float));
    const double d = static_cast<double>(av) - bv;
    result += std::fabs(d);
  }
  return result;
}

// ---------------------------------------------------------------------------
// Function pointers — initialized to scalar; init_vector_distance_functions()
// may promote. Wide pointers (g_*) are used for dims ≥
// VECTOR_DISTANCE_WIDE_MIN_DIMS. Narrow pointers (g_*_narrow) are used for dims
// < VECTOR_DISTANCE_WIDE_MIN_DIMS to avoid dispatching to AVX-512/AVX2 when the
// SIMD loop cannot fire (needs ≥ 16/8 elements).
+// --------------------------------------------------------------------------- + +using vector_distance_fn_t = double (*)(const char *, const char *, uint32_t); + +static vector_distance_fn_t g_euclidean = euclidean_scalar; +static vector_distance_fn_t g_cosine = cosine_scalar; +static vector_distance_fn_t g_dot_product = dot_product_scalar; +static vector_distance_fn_t g_manhattan = manhattan_scalar; + +static vector_distance_fn_t g_euclidean_narrow = euclidean_scalar; +static vector_distance_fn_t g_cosine_narrow = cosine_scalar; +static vector_distance_fn_t g_dot_product_narrow = dot_product_scalar; +static vector_distance_fn_t g_manhattan_narrow = manhattan_scalar; + +static VectorDistanceTier g_wide_tier = VectorDistanceTier::Scalar; +static VectorDistanceTier g_narrow_tier = VectorDistanceTier::Scalar; + +// --------------------------------------------------------------------------- +// SIMD kernels — see file header for the full tier table and dispatch order. +// --------------------------------------------------------------------------- + +#ifdef VECTOR_DISTANCE_x86_64 + +#include + +#if defined(__GNUC__) || defined(__clang__) +#include +#endif + +// CPU feature detection ----------------------------------------------------- + +static bool cpu_has_sse42() { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_cpu_supports("sse4.2"); +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 1); + return (info[2] & (1 << 20)) != 0; +#else + return false; +#endif +} + +static bool cpu_has_avx2_fma() { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"); +#elif defined(_MSC_VER) + int info[4]; + __cpuidex(info, 7, 0); + const bool has_avx2 = (info[1] & (1 << 5)) != 0; + __cpuid(info, 1); + const bool has_fma = (info[2] & (1 << 12)) != 0; + return has_avx2 && has_fma; +#else + return false; +#endif +} + +// Mirrors hnswlib AVX512Capable(): combines the AVX-512F CPUID bit with an +// 
OSXSAVE + XCR0 check so we never dispatch to AVX-512 on an OS that does +// not context-switch the upper ZMM/opmask state. +static bool cpu_has_avx512f() { +#if defined(__GNUC__) || defined(__clang__) + if (!__builtin_cpu_supports("avx512f")) return false; + unsigned int eax, ebx, ecx, edx; + __cpuid_count(1, 0, eax, ebx, ecx, edx); + if ((ecx & (1U << 27)) == 0) return false; // OSXSAVE + unsigned int xcr_lo, xcr_hi; + __asm__ volatile("xgetbv" : "=a"(xcr_lo), "=d"(xcr_hi) : "c"(0)); + const uint64_t xcr0 = (static_cast(xcr_hi) << 32) | xcr_lo; + // SSE(1) | YMM(2) | opmask(5) | ZMM_HI256(6) | HI16_ZMM(7) = 0xe6. + return (xcr0 & 0xe6U) == 0xe6U; +#elif defined(_MSC_VER) + int info[4]; + __cpuid(info, 1); + if ((info[2] & (1 << 27)) == 0) return false; + __cpuidex(info, 7, 0); + if ((info[1] & (1 << 16)) == 0) return false; + const unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0xe6) == 0xe6; +#else + return false; +#endif +} + +// SIMD loops accumulate in float32 registers for full vector width. +// Each kernel promotes to double once (horizontal sum + scalar tail); +// float32-only reduction would not be faster and loses precision. +// However, individual float32 products (d*d for Euclidean, a[i]*b[i] for +// cosine/dot) can overflow to +Inf for extreme float32 inputs (e.g. element +// difference near FLT_MAX). Each kernel checks for a non-finite horizontal +// sum and falls back to the scalar path, which uses double throughout. 
+ +// Tier 1 — SSE4.2, 4 floats per iteration ----------------------------------- + +MY_ATTRIBUTE((target("sse4.2"))) +static float hsum128_sse(__m128 v) { + __m128 s = _mm_hadd_ps(v, v); + s = _mm_hadd_ps(s, s); + return _mm_cvtss_f32(s); +} + +MY_ATTRIBUTE((target("sse4.2"))) +static double euclidean_sse(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m128 sum = _mm_setzero_ps(); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + __m128 d = _mm_sub_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i)); + sum = _mm_add_ps(sum, _mm_mul_ps(d, d)); + } + double result = hsum128_sse(sum); + if (!std::isfinite(result)) return euclidean_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + const double d = av - bv; + result += d * d; + } + return result; +} + +MY_ATTRIBUTE((target("sse4.2"))) +static double cosine_sse(const char *a_raw, const char *b_raw, uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m128 vab = _mm_setzero_ps(); + __m128 vna = _mm_setzero_ps(); + __m128 vnb = _mm_setzero_ps(); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + __m128 va = _mm_loadu_ps(a + i); + __m128 vb_v = _mm_loadu_ps(b + i); + vab = _mm_add_ps(vab, _mm_mul_ps(va, vb_v)); + vna = _mm_add_ps(vna, _mm_mul_ps(va, va)); + vnb = _mm_add_ps(vnb, _mm_mul_ps(vb_v, vb_v)); + } + double ab = hsum128_sse(vab); + double norm_a = hsum128_sse(vna); + double norm_b = hsum128_sse(vnb); + if (!std::isfinite(ab) || !std::isfinite(norm_a) || !std::isfinite(norm_b)) + return cosine_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + norm_a += (double)av * av; 
+ norm_b += (double)bv * bv; + } + const double denom = sqrt(norm_a * norm_b); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +MY_ATTRIBUTE((target("sse4.2"))) +static double dot_product_sse(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m128 vab = _mm_setzero_ps(); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) + vab = _mm_add_ps(vab, _mm_mul_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i))); + double ab = hsum128_sse(vab); + if (!std::isfinite(ab)) return dot_product_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + } + return ab; +} + +MY_ATTRIBUTE((target("sse4.2"))) +static double manhattan_sse(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + const __m128 sign_mask = _mm_set1_ps(-0.0f); + __m128 sum = _mm_setzero_ps(); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + __m128 d = _mm_sub_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i)); + sum = _mm_add_ps(sum, _mm_andnot_ps(sign_mask, d)); + } + double result = hsum128_sse(sum); + if (!std::isfinite(result)) return manhattan_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + result += std::fabs((double)av - bv); + } + return result; +} + +// Tier 2 — AVX2 + FMA, 8 floats per iteration ------------------------------- + +MY_ATTRIBUTE((target("avx2,fma"))) +static double euclidean_avx2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m256 sum = _mm256_setzero_ps(); + uint32_t i = 0; + 
for (; i + 8 <= dims; i += 8) { + __m256 d = _mm256_sub_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i)); + sum = _mm256_fmadd_ps(d, d, sum); + } + __m128 lo = _mm256_castps256_ps128(sum); + __m128 hi = _mm256_extractf128_ps(sum, 1); + __m128 s = _mm_add_ps(lo, hi); + s = _mm_hadd_ps(s, s); + s = _mm_hadd_ps(s, s); + double result = _mm_cvtss_f32(s); + if (!std::isfinite(result)) return euclidean_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + const double d = av - bv; + result += d * d; + } + return result; +} + +// Lambdas don't inherit a function's target attribute in GCC; use a static +// helper so _mm_hadd_ps and friends are compiled with the AVX2 ISA. +MY_ATTRIBUTE((target("avx2"))) +static float hsum256(__m256 v) { + __m128 lo = _mm256_castps256_ps128(v); + __m128 hi = _mm256_extractf128_ps(v, 1); + __m128 s = _mm_add_ps(lo, hi); + s = _mm_hadd_ps(s, s); + s = _mm_hadd_ps(s, s); + return _mm_cvtss_f32(s); +} + +MY_ATTRIBUTE((target("avx2,fma"))) +static double cosine_avx2(const char *a_raw, const char *b_raw, uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m256 vab = _mm256_setzero_ps(); + __m256 vna = _mm256_setzero_ps(); + __m256 vnb = _mm256_setzero_ps(); + uint32_t i = 0; + for (; i + 8 <= dims; i += 8) { + __m256 va = _mm256_loadu_ps(a + i); + __m256 vb_v = _mm256_loadu_ps(b + i); + vab = _mm256_fmadd_ps(va, vb_v, vab); + vna = _mm256_fmadd_ps(va, va, vna); + vnb = _mm256_fmadd_ps(vb_v, vb_v, vnb); + } + double ab = hsum256(vab); + double norm_a = hsum256(vna); + double norm_b = hsum256(vnb); + if (!std::isfinite(ab) || !std::isfinite(norm_a) || !std::isfinite(norm_b)) + return cosine_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), 
sizeof(float)); + ab += (double)av * bv; + norm_a += (double)av * av; + norm_b += (double)bv * bv; + } + const double denom = sqrt(norm_a * norm_b); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +MY_ATTRIBUTE((target("avx2,fma"))) +static double dot_product_avx2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m256 vab = _mm256_setzero_ps(); + uint32_t i = 0; + for (; i + 8 <= dims; i += 8) + vab = _mm256_fmadd_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i), vab); + double ab = hsum256(vab); + if (!std::isfinite(ab)) return dot_product_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + } + return ab; +} + +MY_ATTRIBUTE((target("avx2"))) +static double manhattan_avx2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + const __m256 sign_mask = _mm256_set1_ps(-0.0f); + __m256 sum = _mm256_setzero_ps(); + uint32_t i = 0; + for (; i + 8 <= dims; i += 8) { + __m256 d = _mm256_sub_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i)); + sum = _mm256_add_ps(sum, _mm256_andnot_ps(sign_mask, d)); + } + double result = hsum256(sum); + if (!std::isfinite(result)) return manhattan_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + result += std::fabs((double)av - bv); + } + return result; +} + +// Tier 3 — AVX-512F, 16 floats per iteration -------------------------------- + +MY_ATTRIBUTE((target("avx512f"))) +static double euclidean_avx512(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float 
*b = reinterpret_cast(b_raw); + __m512 sum = _mm512_setzero_ps(); + uint32_t i = 0; + for (; i + 16 <= dims; i += 16) { + __m512 d = _mm512_sub_ps(_mm512_loadu_ps(a + i), _mm512_loadu_ps(b + i)); + sum = _mm512_fmadd_ps(d, d, sum); + } + double result = _mm512_reduce_add_ps(sum); + if (!std::isfinite(result)) return euclidean_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + const double d = av - bv; + result += d * d; + } + return result; +} + +MY_ATTRIBUTE((target("avx512f"))) +static double cosine_avx512(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m512 vab = _mm512_setzero_ps(); + __m512 vna = _mm512_setzero_ps(); + __m512 vnb = _mm512_setzero_ps(); + uint32_t i = 0; + for (; i + 16 <= dims; i += 16) { + __m512 va = _mm512_loadu_ps(a + i); + __m512 vb_v = _mm512_loadu_ps(b + i); + vab = _mm512_fmadd_ps(va, vb_v, vab); + vna = _mm512_fmadd_ps(va, va, vna); + vnb = _mm512_fmadd_ps(vb_v, vb_v, vnb); + } + double ab = _mm512_reduce_add_ps(vab); + double norm_a = _mm512_reduce_add_ps(vna); + double norm_b = _mm512_reduce_add_ps(vnb); + if (!std::isfinite(ab) || !std::isfinite(norm_a) || !std::isfinite(norm_b)) + return cosine_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + norm_a += (double)av * av; + norm_b += (double)bv * bv; + } + const double denom = sqrt(norm_a * norm_b); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +MY_ATTRIBUTE((target("avx512f"))) +static double dot_product_avx512(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = 
reinterpret_cast(b_raw); + __m512 vab = _mm512_setzero_ps(); + uint32_t i = 0; + for (; i + 16 <= dims; i += 16) + vab = _mm512_fmadd_ps(_mm512_loadu_ps(a + i), _mm512_loadu_ps(b + i), vab); + double ab = _mm512_reduce_add_ps(vab); + if (!std::isfinite(ab)) return dot_product_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + } + return ab; +} + +MY_ATTRIBUTE((target("avx512f"))) +static double manhattan_avx512(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + __m512 sum = _mm512_setzero_ps(); + uint32_t i = 0; + for (; i + 16 <= dims; i += 16) { + __m512 d = _mm512_sub_ps(_mm512_loadu_ps(a + i), _mm512_loadu_ps(b + i)); + sum = _mm512_add_ps(sum, _mm512_abs_ps(d)); + } + double result = _mm512_reduce_add_ps(sum); + if (!std::isfinite(result)) return manhattan_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + result += std::fabs((double)av - bv); + } + return result; +} + +#endif // VECTOR_DISTANCE_x86_64 + +#ifdef VECTOR_DISTANCE_AARCH64 + +#include + +// Tier 1 — NEON (Advanced SIMD), 4 floats per iteration --------------------- + +MY_ATTRIBUTE((target("+simd"))) +static double euclidean_neon(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + float32x4_t sum = vdupq_n_f32(0.0f); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + float32x4_t d = vsubq_f32(vld1q_f32(a + i), vld1q_f32(b + i)); + sum = vmlaq_f32(sum, d, d); + } + float32x2_t s = vadd_f32(vget_low_f32(sum), vget_high_f32(sum)); + s = vpadd_f32(s, s); + double result = vget_lane_f32(s, 0); + if (!std::isfinite(result)) 
return euclidean_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + const double d = av - bv; + result += d * d; + } + return result; +} + +// Same lambda issue applies on NEON; extract as a static attributed helper. +MY_ATTRIBUTE((target("+simd"))) +static float hsum4(float32x4_t v) { + float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v)); + s = vpadd_f32(s, s); + return vget_lane_f32(s, 0); +} + +MY_ATTRIBUTE((target("+simd"))) +static double cosine_neon(const char *a_raw, const char *b_raw, uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + float32x4_t vab = vdupq_n_f32(0.0f); + float32x4_t vna = vdupq_n_f32(0.0f); + float32x4_t vnb = vdupq_n_f32(0.0f); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + float32x4_t va = vld1q_f32(a + i); + float32x4_t vb_v = vld1q_f32(b + i); + vab = vmlaq_f32(vab, va, vb_v); + vna = vmlaq_f32(vna, va, va); + vnb = vmlaq_f32(vnb, vb_v, vb_v); + } + double ab = hsum4(vab); + double norm_a = hsum4(vna); + double norm_b = hsum4(vnb); + if (!std::isfinite(ab) || !std::isfinite(norm_a) || !std::isfinite(norm_b)) + return cosine_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + norm_a += (double)av * av; + norm_b += (double)bv * bv; + } + const double denom = sqrt(norm_a * norm_b); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +MY_ATTRIBUTE((target("+simd"))) +static double dot_product_neon(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + float32x4_t vab = vdupq_n_f32(0.0f); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) + vab = 
vmlaq_f32(vab, vld1q_f32(a + i), vld1q_f32(b + i)); + double ab = hsum4(vab); + if (!std::isfinite(ab)) return dot_product_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + ab += (double)av * bv; + } + return ab; +} + +MY_ATTRIBUTE((target("+simd"))) +static double manhattan_neon(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + float32x4_t sum = vdupq_n_f32(0.0f); + uint32_t i = 0; + for (; i + 4 <= dims; i += 4) { + float32x4_t d = vsubq_f32(vld1q_f32(a + i), vld1q_f32(b + i)); + sum = vaddq_f32(sum, vabsq_f32(d)); + } + double result = hsum4(sum); + if (!std::isfinite(result)) return manhattan_scalar(a_raw, b_raw, dims); + for (; i < dims; i++) { + float av, bv; + memcpy(&av, a_raw + i * sizeof(float), sizeof(float)); + memcpy(&bv, b_raw + i * sizeof(float), sizeof(float)); + result += std::fabs((double)av - bv); + } + return result; +} + +#ifdef VECTOR_DISTANCE_HAS_SVE2 + +#include +#include + +// HWCAP2_SVE2 may not be exposed by older libc headers; the bit is stable +// in the Linux kernel UAPI (linux/include/uapi/asm-generic/hwcap.h). 
+#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1UL << 1) +#endif + +// Tier 3 — SVE2, scalable (VLA) — predicated loads handle any alignment ----- + +static bool cpu_has_sve2() { +#if defined(__linux__) + return (getauxval(AT_HWCAP2) & HWCAP2_SVE2) != 0; +#else + return false; +#endif +} + +MY_ATTRIBUTE((target("+sve2"))) +static double euclidean_sve2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + svfloat32_t sum = svdup_n_f32(0.0f); + uint32_t i = 0; + svbool_t pg = svwhilelt_b32_u32(i, dims); + while (svptest_first(svptrue_b32(), pg)) { + svfloat32_t va = svld1_f32(pg, a + i); + svfloat32_t vb_v = svld1_f32(pg, b + i); + svfloat32_t d = svsub_f32_x(pg, va, vb_v); + // Merging form keeps inactive lanes of sum unchanged on the tail. + sum = svmla_f32_m(pg, sum, d, d); + i += svcntw(); + pg = svwhilelt_b32_u32(i, dims); + } + const double result = svaddv_f32(svptrue_b32(), sum); + if (!std::isfinite(result)) return euclidean_scalar(a_raw, b_raw, dims); + return result; +} + +MY_ATTRIBUTE((target("+sve2"))) +static double cosine_sve2(const char *a_raw, const char *b_raw, uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + svfloat32_t vab = svdup_n_f32(0.0f); + svfloat32_t vna = svdup_n_f32(0.0f); + svfloat32_t vnb = svdup_n_f32(0.0f); + uint32_t i = 0; + svbool_t pg = svwhilelt_b32_u32(i, dims); + while (svptest_first(svptrue_b32(), pg)) { + svfloat32_t va = svld1_f32(pg, a + i); + svfloat32_t vb_v = svld1_f32(pg, b + i); + vab = svmla_f32_m(pg, vab, va, vb_v); + vna = svmla_f32_m(pg, vna, va, va); + vnb = svmla_f32_m(pg, vnb, vb_v, vb_v); + i += svcntw(); + pg = svwhilelt_b32_u32(i, dims); + } + const double ab = svaddv_f32(svptrue_b32(), vab); + const double norm_a = svaddv_f32(svptrue_b32(), vna); + const double norm_b = svaddv_f32(svptrue_b32(), vnb); + if (!std::isfinite(ab) || !std::isfinite(norm_a) || 
!std::isfinite(norm_b)) + return cosine_scalar(a_raw, b_raw, dims); + const double denom = sqrt(norm_a * norm_b); + if (denom == 0.0) return std::numeric_limits::infinity(); + return 1.0 - ab / denom; +} + +MY_ATTRIBUTE((target("+sve2"))) +static double dot_product_sve2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + svfloat32_t vab = svdup_n_f32(0.0f); + uint32_t i = 0; + svbool_t pg = svwhilelt_b32_u32(i, dims); + while (svptest_first(svptrue_b32(), pg)) { + svfloat32_t va = svld1_f32(pg, a + i); + svfloat32_t vb_v = svld1_f32(pg, b + i); + vab = svmla_f32_m(pg, vab, va, vb_v); + i += svcntw(); + pg = svwhilelt_b32_u32(i, dims); + } + const double ab = svaddv_f32(svptrue_b32(), vab); + if (!std::isfinite(ab)) return dot_product_scalar(a_raw, b_raw, dims); + return ab; +} + +MY_ATTRIBUTE((target("+sve2"))) +static double manhattan_sve2(const char *a_raw, const char *b_raw, + uint32_t dims) { + const float *a = reinterpret_cast(a_raw); + const float *b = reinterpret_cast(b_raw); + svfloat32_t sum = svdup_n_f32(0.0f); + uint32_t i = 0; + svbool_t pg = svwhilelt_b32_u32(i, dims); + while (svptest_first(svptrue_b32(), pg)) { + svfloat32_t va = svld1_f32(pg, a + i); + svfloat32_t vb_v = svld1_f32(pg, b + i); + svfloat32_t d = svsub_f32_x(pg, va, vb_v); + sum = svadd_f32_m(pg, sum, svabs_f32_x(pg, d)); + i += svcntw(); + pg = svwhilelt_b32_u32(i, dims); + } + const double result = svaddv_f32(svptrue_b32(), sum); + if (!std::isfinite(result)) return manhattan_scalar(a_raw, b_raw, dims); + return result; +} + +#endif // VECTOR_DISTANCE_HAS_SVE2 + +#endif // VECTOR_DISTANCE_AARCH64 + +// Tier dispatch helpers — must follow all kernel definitions above. 
+ +static void apply_wide_tier(VectorDistanceTier tier) { + g_wide_tier = tier; + switch (tier) { + case VectorDistanceTier::Scalar: + g_euclidean = euclidean_scalar; + g_cosine = cosine_scalar; + g_dot_product = dot_product_scalar; + g_manhattan = manhattan_scalar; + break; +#ifdef VECTOR_DISTANCE_x86_64 + case VectorDistanceTier::Sse42: + g_euclidean = euclidean_sse; + g_cosine = cosine_sse; + g_dot_product = dot_product_sse; + g_manhattan = manhattan_sse; + break; + case VectorDistanceTier::Avx2: + g_euclidean = euclidean_avx2; + g_cosine = cosine_avx2; + g_dot_product = dot_product_avx2; + g_manhattan = manhattan_avx2; + break; + case VectorDistanceTier::Avx512f: + g_euclidean = euclidean_avx512; + g_cosine = cosine_avx512; + g_dot_product = dot_product_avx512; + g_manhattan = manhattan_avx512; + break; +#endif +#ifdef VECTOR_DISTANCE_AARCH64 + case VectorDistanceTier::Neon: + g_euclidean = euclidean_neon; + g_cosine = cosine_neon; + g_dot_product = dot_product_neon; + g_manhattan = manhattan_neon; + break; +#ifdef VECTOR_DISTANCE_HAS_SVE2 + case VectorDistanceTier::Sve2: + g_euclidean = euclidean_sve2; + g_cosine = cosine_sve2; + g_dot_product = dot_product_sve2; + g_manhattan = manhattan_sve2; + break; +#endif +#endif + default: + break; + } +} + +static void apply_narrow_tier(VectorDistanceTier tier) { + g_narrow_tier = tier; + switch (tier) { + case VectorDistanceTier::Scalar: + g_euclidean_narrow = euclidean_scalar; + g_cosine_narrow = cosine_scalar; + g_dot_product_narrow = dot_product_scalar; + g_manhattan_narrow = manhattan_scalar; + break; +#ifdef VECTOR_DISTANCE_x86_64 + case VectorDistanceTier::Sse42: + g_euclidean_narrow = euclidean_sse; + g_cosine_narrow = cosine_sse; + g_dot_product_narrow = dot_product_sse; + g_manhattan_narrow = manhattan_sse; + break; +#endif +#ifdef VECTOR_DISTANCE_AARCH64 + case VectorDistanceTier::Neon: + g_euclidean_narrow = euclidean_neon; + g_cosine_narrow = cosine_neon; + g_dot_product_narrow = dot_product_neon; + 
g_manhattan_narrow = manhattan_neon; + break; +#ifdef VECTOR_DISTANCE_HAS_SVE2 + case VectorDistanceTier::Sve2: + g_euclidean_narrow = euclidean_sve2; + g_cosine_narrow = cosine_sve2; + g_dot_product_narrow = dot_product_sve2; + g_manhattan_narrow = manhattan_sve2; + break; +#endif +#endif + default: + break; + } +} + +// init_vector_distance_functions — promote g_* and g_*_narrow (see file header) + +void init_vector_distance_functions() { + apply_wide_tier(VectorDistanceTier::Scalar); + apply_narrow_tier(VectorDistanceTier::Scalar); +#ifdef VECTOR_DISTANCE_x86_64 + // x86_64: Tier 3 -> Tier 2 -> Tier 1 -> Tier 0 (wide) + if (cpu_has_avx512f()) { + apply_wide_tier(VectorDistanceTier::Avx512f); + } else if (cpu_has_avx2_fma()) { + apply_wide_tier(VectorDistanceTier::Avx2); + } else if (cpu_has_sse42()) { + apply_wide_tier(VectorDistanceTier::Sse42); + } + // Narrow path: SSE4.2 fills at dim ≥ 4; use it when available. + if (cpu_has_sse42()) { + apply_narrow_tier(VectorDistanceTier::Sse42); + } +#endif +#ifdef VECTOR_DISTANCE_AARCH64 + // Tier 1 (NEON), optionally Tier 3 (SVE2) + apply_wide_tier(VectorDistanceTier::Neon); + apply_narrow_tier(VectorDistanceTier::Neon); +#ifdef VECTOR_DISTANCE_HAS_SVE2 + if (cpu_has_sve2()) { // Tier 3 over Tier 1 (wide only) + apply_wide_tier(VectorDistanceTier::Sve2); + } +#endif +#endif +} + +bool vector_distance_tier_available(VectorDistanceTier tier) { + switch (tier) { + case VectorDistanceTier::Scalar: + return true; +#ifdef VECTOR_DISTANCE_x86_64 + case VectorDistanceTier::Sse42: + return cpu_has_sse42(); + case VectorDistanceTier::Avx2: + return cpu_has_avx2_fma(); + case VectorDistanceTier::Avx512f: + return cpu_has_avx512f(); +#endif +#ifdef VECTOR_DISTANCE_AARCH64 + case VectorDistanceTier::Neon: + return true; // mandatory on ARMv8 + case VectorDistanceTier::Sve2: +#ifdef VECTOR_DISTANCE_HAS_SVE2 + return cpu_has_sve2(); +#else + return false; +#endif +#endif + default: + return false; + } +} + +void 
init_vector_distance_functions_tier(VectorDistanceTier tier) { + switch (tier) { + case VectorDistanceTier::Scalar: + apply_wide_tier(VectorDistanceTier::Scalar); + apply_narrow_tier(VectorDistanceTier::Scalar); + break; +#ifdef VECTOR_DISTANCE_x86_64 + case VectorDistanceTier::Sse42: + apply_wide_tier(VectorDistanceTier::Sse42); + apply_narrow_tier(VectorDistanceTier::Sse42); + break; + case VectorDistanceTier::Avx2: + apply_wide_tier(VectorDistanceTier::Avx2); + if (cpu_has_sse42()) apply_narrow_tier(VectorDistanceTier::Sse42); + break; + case VectorDistanceTier::Avx512f: + apply_wide_tier(VectorDistanceTier::Avx512f); + if (cpu_has_sse42()) apply_narrow_tier(VectorDistanceTier::Sse42); + break; +#endif +#ifdef VECTOR_DISTANCE_AARCH64 + case VectorDistanceTier::Neon: + apply_wide_tier(VectorDistanceTier::Neon); + apply_narrow_tier(VectorDistanceTier::Neon); + break; +#ifdef VECTOR_DISTANCE_HAS_SVE2 + case VectorDistanceTier::Sve2: + apply_wide_tier(VectorDistanceTier::Sve2); + apply_narrow_tier(VectorDistanceTier::Neon); + break; +#endif +#endif + default: + break; // unsupported tier on this build; callers must check first + } +} + +VectorDistanceTier vector_distance_wide_tier() { return g_wide_tier; } + +VectorDistanceTier vector_distance_narrow_tier() { return g_narrow_tier; } + +const char *vector_distance_tier_label(VectorDistanceTier tier) { + switch (tier) { + case VectorDistanceTier::Scalar: + return "software scalar"; + case VectorDistanceTier::Sse42: + return "SSE4.2"; + case VectorDistanceTier::Avx2: + return "AVX2 and FMA"; + case VectorDistanceTier::Avx512f: + return "AVX-512F"; + case VectorDistanceTier::Neon: + return "NEON"; + case VectorDistanceTier::Sve2: + return "SVE2"; + } + return "unknown"; +} + +size_t vector_distance_dispatch_description(char *buf, size_t buf_len) { + if (buf == nullptr || buf_len == 0) return 0; + + const VectorDistanceTier wide = g_wide_tier; + const VectorDistanceTier narrow = g_narrow_tier; + int n; + if (wide == 
narrow) { + if (wide == VectorDistanceTier::Scalar) { + n = snprintf(buf, buf_len, "Using software scalar for DISTANCE()."); + } else { + n = snprintf(buf, buf_len, + "Using hardware accelerated %s for DISTANCE().", + vector_distance_tier_label(wide)); + } + } else { + n = snprintf( + buf, buf_len, + "Using hardware accelerated %s for DISTANCE() " + "(dimensions >= %u) and %s (dimensions < %u).", + vector_distance_tier_label(wide), VECTOR_DISTANCE_WIDE_MIN_DIMS, + vector_distance_tier_label(narrow), VECTOR_DISTANCE_WIDE_MIN_DIMS); + } + if (n < 0) { + buf[0] = '\0'; + return 0; + } + if (static_cast(n) >= buf_len) return buf_len - 1; + return static_cast(n); +} + +// Public wrappers — dim-aware dispatch: narrow path for dims < +// VECTOR_DISTANCE_WIDE_MIN_DIMS avoids sending small inputs to AVX-512/AVX2 +// where the SIMD loop cannot fire. + +double vector_distance_euclidean_squared(const char *a, const char *b, + uint32_t dims) { + return (dims < VECTOR_DISTANCE_WIDE_MIN_DIMS ? g_euclidean_narrow + : g_euclidean)(a, b, dims); +} + +double vector_distance_cosine(const char *a, const char *b, uint32_t dims) { + return (dims < VECTOR_DISTANCE_WIDE_MIN_DIMS ? g_cosine_narrow : g_cosine)( + a, b, dims); +} + +double vector_distance_dot(const char *a, const char *b, uint32_t dims) { + return (dims < VECTOR_DISTANCE_WIDE_MIN_DIMS ? g_dot_product_narrow + : g_dot_product)(a, b, dims); +} + +double vector_distance_manhattan(const char *a, const char *b, uint32_t dims) { + return (dims < VECTOR_DISTANCE_WIDE_MIN_DIMS ? g_manhattan_narrow + : g_manhattan)(a, b, dims); +} diff --git a/vector-common/vector_distance.h b/vector-common/vector_distance.h new file mode 100644 index 000000000000..1f575eadc634 --- /dev/null +++ b/vector-common/vector_distance.h @@ -0,0 +1,117 @@ +#ifndef VECTOR_DISTANCE +#define VECTOR_DISTANCE +/* Copyright (c) 2025, Percona and/or its affiliates. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include +#include + +/** Set SIMD function pointers before first use; safe to call repeatedly. */ +void init_vector_distance_functions(); + +/** + SIMD tier identifiers. Values match the tier table in vector_distance.cc. + Tiers that do not apply to the host platform (e.g. SSE42 on aarch64) are + always reported as unavailable by vector_distance_tier_available(). +*/ +enum class VectorDistanceTier { + Scalar = 0, ///< Tier 0 — plain C++ scalar + Sse42, ///< Tier 1 x86_64 — SSE4.2, 128-bit + Avx2, ///< Tier 2 x86_64 — AVX2 + FMA, 256-bit + Avx512f, ///< Tier 3 x86_64 — AVX-512F, 512-bit + Neon, ///< Tier 1 aarch64 — NEON, 128-bit + Sve2, ///< Tier 3 aarch64 — SVE2, scalable (VLA) +}; + +/** + Return true when @p tier can be used on the current CPU and build. + Scalar always returns true. Architecture-specific tiers return false on the + wrong platform. SVE2 additionally requires the binary to have been compiled + with __ARM_FEATURE_SVE2 and the OS kernel to advertise HWCAP2_SVE2. +*/ +bool vector_distance_tier_available(VectorDistanceTier tier); + +/** + Set all dispatch pointers (wide and narrow) directly to @p tier without + call_once protection. Intended for benchmarks that want to force a specific + tier on a host that may support a higher one. Use + VectorDistanceTier::Scalar to force scalar kernels. 
Callers must verify + vector_distance_tier_available() first. +*/ +void init_vector_distance_functions_tier(VectorDistanceTier tier); + +/** Minimum dims for the wide SIMD kernel; below this, g_*_narrow is used. */ +static constexpr uint32_t VECTOR_DISTANCE_WIDE_MIN_DIMS = 16; + +/** Wide-path tier selected by init_vector_distance_functions() (dims >= + * VECTOR_DISTANCE_WIDE_MIN_DIMS). */ +VectorDistanceTier vector_distance_wide_tier(); + +/** Narrow-path tier selected by init_vector_distance_functions() (dims < + * VECTOR_DISTANCE_WIDE_MIN_DIMS). */ +VectorDistanceTier vector_distance_narrow_tier(); + +/** + Stable English label for logs/tests, e.g. "AVX-512F", "AVX2 and FMA", + "software scalar". +*/ +const char *vector_distance_tier_label(VectorDistanceTier tier); + +/** + Write a human-readable dispatch summary into @p buf (NUL-terminated). + Returns the number of bytes written, excluding the terminating NUL. +*/ +size_t vector_distance_dispatch_description(char *buf, size_t buf_len); + +/** + Compute squared Euclidean distance: sum((a[i]-b[i])²). + No sqrt — intended for ranking and as the shared kernel for SQL EUCLIDEAN + (where sqrt is applied in Item_func_vector_distance::val_real). + Accepts any byte alignment; SIMD kernels use unaligned loads internally. + Returns double; SIMD accumulates in float32, reduction uses double for + precision on large dims and extreme float32 values. +*/ +double vector_distance_euclidean_squared(const char *a, const char *b, + uint32_t dims); + +/** + Compute cosine distance between two float vectors encoded as raw bytes. + Returns +Inf when either vector is all-zeros (undefined cosine — true + zero-denominator); returns NaN when input elements are NaN/Inf (bad-data + propagation). The caller must distinguish these two cases. + Returns double; SIMD accumulates in float32, reduction uses double for + precision on large dims and extreme float32 values. 
+*/ +double vector_distance_cosine(const char *a, const char *b, uint32_t dims); + +/** + Compute dot product (inner product) between two float vectors encoded as raw + bytes. Returns sum(a[i]*b[i]). Higher values indicate greater similarity. + Always finite for finite inputs — no NaN edge cases. + Returns double; SIMD accumulates in float32, reduction uses double for + precision on large dims and extreme float32 values. +*/ +double vector_distance_dot(const char *a, const char *b, uint32_t dims); + +/** + Compute Manhattan (L1) distance between two float vectors encoded as raw + bytes. Returns sum(|a[i] - b[i]|). Always >= 0 for finite inputs. No NaN + edge cases. + Returns double; SIMD accumulates in float32, reduction uses double for + precision on large dims and extreme float32 values. +*/ +double vector_distance_manhattan(const char *a, const char *b, uint32_t dims); + +#endif // VECTOR_DISTANCE