[
https://issues.apache.org/jira/browse/IMPALA-14566?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18041150#comment-18041150
]
Raghav Jindal edited comment on IMPALA-14566 at 11/27/25 6:44 PM:
------------------------------------------------------------------
I tried to create a private branch and push my code to it, but I do not have
the required permissions:
{code:java}
git push origin semanticsearchtest
remote: Permission to apache/impala.git denied to rajindal8.
fatal: unable to access 'https://github.com/apache/impala.git/': The requested
URL returned error: 403 {code}
Initial Code for vector-functions.h
{code:java}
#ifndef IMPALA_EXPRS_VECTOR_FUNCTIONS_H
#define IMPALA_EXPRS_VECTOR_FUNCTIONS_H

#include <cstdint>

#include "udf/udf.h"

namespace impala {

using impala_udf::FunctionContext;
using impala_udf::DoubleVal;
using impala_udf::CollectionVal;

/// Distance and similarity functions over vectors represented as ARRAY<FLOAT>.
///
/// The functions return DOUBLE rather than FLOAT for better precision: distance
/// calculations usually involve square roots, which benefit from the 15-17 significant
/// digits of a DOUBLE versus the 7 digits of a FLOAT.
class VectorFunctions {
 public:
  /// Returns the Euclidean distance between 'vec1' and 'vec2' as a DOUBLE, or NULL if
  /// the inputs are invalid.
  /// ctx: function context for memory allocation and error reporting
  /// vec1: the first vector as ARRAY<FLOAT>
  /// vec2: the second vector as ARRAY<FLOAT>
  static DoubleVal EuclideanDistance(FunctionContext* ctx, const CollectionVal& vec1,
      const CollectionVal& vec2);

  /// Returns the cosine similarity of 'vec1' and 'vec2' as a DOUBLE in [-1, 1].
  /// Returns NULL if either input is NULL or either vector has zero magnitude. Sets an
  /// error on 'ctx' if a vector is empty or the vectors differ in length.
  /// ctx: function context for memory allocation and error reporting
  /// vec1: the first vector as ARRAY<FLOAT>
  /// vec2: the second vector as ARRAY<FLOAT>
  static DoubleVal CosineSimilarity(FunctionContext* ctx, const CollectionVal& vec1,
      const CollectionVal& vec2);

  /// Prepare function to initialize the function state.
  static void VectorDistancePrepare(FunctionContext* ctx,
      FunctionContext::FunctionStateScope scope);

  /// Close function to clean up the function state.
  static void VectorDistanceClose(FunctionContext* ctx,
      FunctionContext::FunctionStateScope scope);

 private:
  /// Helper to get a float value from an array element. For ARRAY<FLOAT>, elements are
  /// stored as tuples and this function extracts the float value from the tuple at the
  /// given index.
  /// array_ptr: pointer to the start of the array tuple data
  /// index: index of the element to retrieve
  /// tuple_size: size of each tuple in bytes
  /// slot_offset: offset of the float slot within the tuple
  /// Returns the float value, or 0.0 if the element is NULL.
  static float GetFloatFromArray(const uint8_t* array_ptr, int index, int tuple_size,
      int slot_offset);

  /// Helper function to check if an array element is NULL.
  /// array_ptr: pointer to the start of the array tuple data
  /// index: index of the element to check
  /// tuple_size: size of each tuple in bytes
  /// null_indicator_offset: offset of the null indicator byte within the tuple
  static bool IsArrayElementNull(const uint8_t* array_ptr, int index, int tuple_size,
      int null_indicator_offset);
};

} // namespace impala

#endif // IMPALA_EXPRS_VECTOR_FUNCTIONS_H {code}
Initial Code for vector-functions-ir.cc
{code:java}
#include "exprs/vector-functions.h"

#include <cmath>
#include <cstdint>
#include <cstring>

#include "exprs/scalar-expr.h"
#include "runtime/descriptors.h"
#include "runtime/tuple.h"
#include "udf/udf-internal.h"
#include "util/ubsan.h"

#include "common/names.h"

using namespace impala_udf;
using namespace impala;
// Assumed layout of one ARRAY<FLOAT> item tuple: a 1-byte null indicator followed by
// the 4-byte float value.
// NOTE(review): this layout is hard-coded here rather than read from the item tuple
// descriptor -- confirm that it matches the layout the planner actually produces
// (in particular whether the null indicator precedes or follows the float slot).
constexpr int FLOAT_SIZE = sizeof(float);
constexpr int NULL_INDICATOR_SIZE = 1; // One byte for the null indicator.
constexpr int TUPLE_SIZE_FOR_FLOAT_ARRAY = NULL_INDICATOR_SIZE + FLOAT_SIZE;
// The float comes after the null indicator.
constexpr int SLOT_OFFSET = NULL_INDICATOR_SIZE;
// The null indicator is at the start of the tuple.
constexpr int NULL_INDICATOR_OFFSET = 0;
/// Returns the float stored in element 'index' of the array whose tuple data starts at
/// 'array_ptr', or 0.0f if that element is NULL.
/// 'tuple_size' is the size of each element tuple in bytes and 'slot_offset' is the
/// offset of the float slot within a tuple. The null check always reads the indicator
/// at NULL_INDICATOR_OFFSET.
float VectorFunctions::GetFloatFromArray(const uint8_t* array_ptr, int index,
    int tuple_size, int slot_offset) {
  // NULL elements are reported as 0.0f; callers that must distinguish NULL from a real
  // zero have to call IsArrayElementNull() themselves.
  if (IsArrayElementNull(array_ptr, index, tuple_size, NULL_INDICATOR_OFFSET)) {
    return 0.0f;
  }

  // Example: for index 1 and 5-byte tuples the element starts at array_ptr + 5.
  // The product is computed in 64 bits so that it cannot overflow 'int'.
  const int64_t tuple_offset = static_cast<int64_t>(index) * tuple_size;
  const uint8_t* tuple_ptr = array_ptr + tuple_offset;

  // The float slot is not guaranteed to be 4-byte aligned inside the tuple, so copy the
  // bytes out with memcpy instead of dereferencing a float pointer.
  float value;
  memcpy(&value, tuple_ptr + slot_offset, sizeof(float));
  return value;
}
/// Returns true if element 'index' of the array whose tuple data starts at 'array_ptr'
/// is NULL. 'tuple_size' is the size of each element tuple in bytes and
/// 'null_indicator_offset' is the offset of the null indicator byte within a tuple.
bool VectorFunctions::IsArrayElementNull(const uint8_t* array_ptr, int index,
    int tuple_size, int null_indicator_offset) {
  // The product is computed in 64 bits so that it cannot overflow 'int'.
  const int64_t tuple_offset = static_cast<int64_t>(index) * tuple_size;
  const uint8_t* null_byte = array_ptr + tuple_offset + null_indicator_offset;
  // Bit 0 set means the element is NULL; bit 0 clear means it is NOT NULL.
  return (*null_byte & 0x01) != 0;
}
/// Returns the Euclidean distance between 'vec1' and 'vec2', i.e. the square root of
/// the sum of the squared element-wise differences.
/// Returns NULL if either input is NULL or if any element of either vector is NULL.
/// Sets an error on 'ctx' and returns NULL if a vector is empty or the vectors differ
/// in length.
DoubleVal VectorFunctions::EuclideanDistance(FunctionContext* ctx,
    const CollectionVal& vec1, const CollectionVal& vec2) {
  // Handle NULL inputs.
  if (vec1.is_null || vec2.is_null) return DoubleVal::null();

  // Handle empty arrays.
  if (vec1.num_tuples == 0 || vec2.num_tuples == 0) {
    ctx->SetError("Euclidean distance requires non-empty vectors");
    return DoubleVal::null();
  }

  // Validate that the vectors have the same length.
  if (vec1.num_tuples != vec2.num_tuples) {
    ctx->SetError("Vectors must have the same length for distance calculation");
    return DoubleVal::null();
  }

  const int tuple_size = TUPLE_SIZE_FOR_FLOAT_ARRAY;
  double sum_squared_diff = 0.0;
  for (int i = 0; i < vec1.num_tuples; ++i) {
    // A NULL component has no value, so the distance is unknown. Propagate NULL rather
    // than silently treating the component as 0.0, which would yield a wrong distance.
    if (IsArrayElementNull(vec1.ptr, i, tuple_size, NULL_INDICATOR_OFFSET)
        || IsArrayElementNull(vec2.ptr, i, tuple_size, NULL_INDICATOR_OFFSET)) {
      return DoubleVal::null();
    }
    // Widen to double before subtracting and squaring: squaring in float loses
    // precision and overflows to infinity for large differences.
    const double val1 = GetFloatFromArray(vec1.ptr, i, tuple_size, SLOT_OFFSET);
    const double val2 = GetFloatFromArray(vec2.ptr, i, tuple_size, SLOT_OFFSET);
    const double diff = val1 - val2;
    sum_squared_diff += diff * diff;
  }

  // The distance is the square root of the sum of squared differences.
  return DoubleVal(std::sqrt(sum_squared_diff));
}
/// Returns the cosine similarity of 'vec1' and 'vec2': their dot product divided by the
/// product of their magnitudes, clamped to [-1, 1].
/// Returns NULL if either input is NULL, if any element of either vector is NULL, or if
/// either vector has zero magnitude. Sets an error on 'ctx' and returns NULL if a vector
/// is empty or the vectors differ in length.
DoubleVal VectorFunctions::CosineSimilarity(FunctionContext* ctx,
    const CollectionVal& vec1, const CollectionVal& vec2) {
  // Handle NULL inputs.
  if (vec1.is_null || vec2.is_null) return DoubleVal::null();

  // Handle empty arrays.
  if (vec1.num_tuples == 0 || vec2.num_tuples == 0) {
    ctx->SetError("Cosine similarity requires non-empty vectors");
    return DoubleVal::null();
  }

  // Validate that the vectors have the same length.
  if (vec1.num_tuples != vec2.num_tuples) {
    ctx->SetError("Vectors must have the same length for similarity calculation");
    return DoubleVal::null();
  }

  const int tuple_size = TUPLE_SIZE_FOR_FLOAT_ARRAY;

  // Accumulate the dot product and both squared magnitudes in a single pass.
  double dot_product = 0.0;
  double mag1_squared = 0.0;
  double mag2_squared = 0.0;
  for (int i = 0; i < vec1.num_tuples; ++i) {
    // A NULL component has no value, so the similarity is unknown. Propagate NULL
    // rather than silently treating the component as 0.0.
    if (IsArrayElementNull(vec1.ptr, i, tuple_size, NULL_INDICATOR_OFFSET)
        || IsArrayElementNull(vec2.ptr, i, tuple_size, NULL_INDICATOR_OFFSET)) {
      return DoubleVal::null();
    }
    // Widen to double before multiplying: float products lose precision and can
    // overflow to infinity for large components.
    const double val1 = GetFloatFromArray(vec1.ptr, i, tuple_size, SLOT_OFFSET);
    const double val2 = GetFloatFromArray(vec2.ptr, i, tuple_size, SLOT_OFFSET);
    dot_product += val1 * val2;
    mag1_squared += val1 * val1;
    mag2_squared += val2 * val2;
  }

  const double mag1 = std::sqrt(mag1_squared);
  const double mag2 = std::sqrt(mag2_squared);

  // The similarity of a zero vector is undefined (division by zero); return NULL.
  if (mag1 == 0.0 || mag2 == 0.0) return DoubleVal::null();

  // The cosine of any angle is in [-1, 1]; clamp away floating-point rounding error.
  double similarity = dot_product / (mag1 * mag2);
  if (similarity > 1.0) similarity = 1.0;
  if (similarity < -1.0) similarity = -1.0;
  return DoubleVal(similarity);
}
/// Prepare hook for the vector distance functions. Intentionally a no-op: this
/// implementation sets up no function state.
void VectorFunctions::VectorDistancePrepare(
    FunctionContext* ctx, FunctionContext::FunctionStateScope scope) {
}
/// Close hook for the vector distance functions. Intentionally a no-op: this
/// implementation has no function state to release.
void VectorFunctions::VectorDistanceClose(
    FunctionContext* ctx, FunctionContext::FunctionStateScope scope) {
}
{code}
was (Author: JIRAUSER299115):
I tried to create a private branch and push my code to it, but I do not have
the required permissions:
{code:java}
git push origin semanticsearchtest
remote: Permission to apache/impala.git denied to rajindal8.
fatal: unable to access 'https://github.com/apache/impala.git/': The requested
URL returned error: 403 {code}
Initial Code for vector-functions.h
{code:java}
#ifndef IMPALA_EXPRS_VECTOR_FUNCTIONS_H
#define IMPALA_EXPRS_VECTOR_FUNCTIONS_H

#include <cstdint>

#include "udf/udf.h"

namespace impala {

using impala_udf::FunctionContext;
using impala_udf::DoubleVal;
using impala_udf::CollectionVal;

/// Distance and similarity functions over vectors represented as ARRAY<FLOAT>.
///
/// The functions return DOUBLE rather than FLOAT for better precision: distance
/// calculations usually involve square roots, which benefit from the 15-17 significant
/// digits of a DOUBLE versus the 7 digits of a FLOAT.
class VectorFunctions {
 public:
  /// Returns the Euclidean distance between 'vec1' and 'vec2' as a DOUBLE, or NULL if
  /// the inputs are invalid.
  /// ctx: function context for memory allocation and error reporting
  /// vec1: the first vector as ARRAY<FLOAT>
  /// vec2: the second vector as ARRAY<FLOAT>
  static DoubleVal EuclideanDistance(FunctionContext* ctx, const CollectionVal& vec1,
      const CollectionVal& vec2);

  /// Cosine similarity of 'vec1' and 'vec2'; takes the same arguments as
  /// EuclideanDistance() and also returns a DOUBLE.
  static DoubleVal CosineSimilarity(FunctionContext* ctx, const CollectionVal& vec1,
      const CollectionVal& vec2);

  /// Prepare function to initialize the function state.
  static void VectorDistancePrepare(FunctionContext* ctx,
      FunctionContext::FunctionStateScope scope);

  /// Close function to clean up the function state.
  static void VectorDistanceClose(FunctionContext* ctx,
      FunctionContext::FunctionStateScope scope);

 private:
  /// Helper to get a float value from an array element. For ARRAY<FLOAT>, elements are
  /// stored as tuples and this function extracts the float value from the tuple at the
  /// given index.
  /// array_ptr: pointer to the start of the array tuple data
  /// index: index of the element to retrieve
  /// tuple_size: size of each tuple in bytes
  /// slot_offset: offset of the float slot within the tuple
  /// Returns the float value, or 0.0 if the element is NULL.
  static float GetFloatFromArray(const uint8_t* array_ptr, int index, int tuple_size,
      int slot_offset);

  /// Helper function to check if an array element is NULL.
  static bool IsArrayElementNull(const uint8_t* array_ptr, int index, int tuple_size,
      int null_indicator_offset);
};

} // namespace impala

#endif // IMPALA_EXPRS_VECTOR_FUNCTIONS_H {code}
> Add support for cosine similarity function
> ------------------------------------------
>
> Key: IMPALA-14566
> URL: https://issues.apache.org/jira/browse/IMPALA-14566
> Project: IMPALA
> Issue Type: Task
> Reporter: Abhishek Rawat
> Assignee: Raghav Jindal
> Priority: Major
>
> The cosine similarity function measures the angle between two vectors,
> regardless of their length (magnitude). The use cases include measuring text
> similarity and is ideal when the direction (semantic meaning/concept) is more
> important than the magnitude.
> Impala doesn't support a native vector data type yet, so we could possibly
> use an ARRAY<FLOAT> data type for representing vectors.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]