This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch tuple_no_base64 in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit d21837b38a49c97aa08a1e13af93387d4768f96f Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Sep 24 22:37:09 2024 -0700 removed base64 encoding-decoding --- tuple/sqlx/tuple_sketch_int64_a_not_b.sqlx | 2 +- tuple/sqlx/tuple_sketch_int64_a_not_b_seed.sqlx | 11 +- ...qlx => tuple_sketch_int64_filter_low_high.sqlx} | 18 +-- .../tuple_sketch_int64_filter_low_high_seed.sqlx | 9 +- ...x => tuple_sketch_int64_from_theta_sketch.sqlx} | 18 +-- .../tuple_sketch_int64_from_theta_sketch_seed.sqlx | 10 +- .../sqlx/tuple_sketch_int64_get_estimate_seed.sqlx | 10 +- .../tuple_sketch_int64_intersection_seed_mode.sqlx | 12 +- ...tuple_sketch_int64_jaccard_similarity_seed.sqlx | 2 + tuple/sqlx/tuple_sketch_int64_to_string_seed.sqlx | 10 +- .../tuple_sketch_int64_union_lgk_seed_mode.sqlx | 12 +- tuple/test/tuple_sketch_int_test.sql | 72 ++++++++++-- tuple/tuple_sketch_int64.cpp | 123 +++++++-------------- 13 files changed, 143 insertions(+), 166 deletions(-) diff --git a/tuple/sqlx/tuple_sketch_int64_a_not_b.sqlx b/tuple/sqlx/tuple_sketch_int64_a_not_b.sqlx index e8afd6d..e964604 100644 --- a/tuple/sqlx/tuple_sketch_int64_a_not_b.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_a_not_b.sqlx @@ -34,6 +34,6 @@ Returns: a Compact Tuple Sketch as BYTES. For more information: - https://datasketches.apache.org/docs/Tuple/TupleSketches.html ''' -AS ( +) AS ( $BQ_DATASET.tuple_sketch_int64_a_not_b_seed(sketchA, sketchB, NULL) ); diff --git a/tuple/sqlx/tuple_sketch_int64_a_not_b_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_a_not_b_seed.sqlx index e1efede..b069b20 100644 --- a/tuple/sqlx/tuple_sketch_int64_a_not_b_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_a_not_b_seed.sqlx @@ -24,6 +24,7 @@ RETURNS BYTES LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Computes a sketch that represents the scalar set difference of sketchA and not sketchB. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. This function only applies to Tuple Sketches with an INT64 summary column. @@ -37,16 +38,10 @@ For more information: - https://datasketches.apache.org/docs/Tuple/TupleSketches.html ''' ) AS R""" -const default_seed = BigInt(Module.DEFAULT_SEED); try { - var a_not_b = null; - try { - a_not_b = new Module.tuple_a_not_b_int64(seed ? BigInt(seed) : default_seed); - return a_not_b.computeWithB64ReturnB64(sketchA, sketchB, seed ? BigInt(seed) : default_seed); - } finally { - if (a_not_b != null) a_not_b.delete(); - } + return Module.tupleAnotBInt64(sketchA, sketchB, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_filter_low_high.sqlx similarity index 76% copy from tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx copy to tuple/sqlx/tuple_sketch_int64_filter_low_high.sqlx index 3edf784..7f660fb 100644 --- a/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_filter_low_high.sqlx @@ -19,11 +19,9 @@ config { hasOutput: true } -CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, low INT64, high INT64, seed INT64) +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, low INT64, high INT64) RETURNS BYTES -LANGUAGE js OPTIONS ( - library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], description = '''Returns a Tuple Sketch computed from the given sketch filtered by the given low and high values. This returns a compact tuple sketch that contains the subset of rows of the give sketch where the summary column is greater-than or equal to the given low and less-than or equal to the given high. @@ -33,18 +31,12 @@ This function only applies to Tuple Sketches with an INT64 summary column. Param sketch: the given Tuple Sketch. This may not be NULL. Param low: the given low INT64. This may not be NULL. Param high: the given high INT64. This may not be NULL. -Param seed: This is used to confirm that the given sketches were configured with the correct seed. A NULL specifies the default seed = 9001. +Assumed Default Param seed: 9001. Returns: a Compact Tuple Sketch as BYTES. For more information: - https://datasketches.apache.org/docs/Tuple/TupleSketches.html ''' -) AS R""" -var sketchObject = null; -try { - sketchObject = Module.compact_tuple_sketch_int64.deserializeFromB64(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.filterB64(low, high); -} finally { - if (sketchObject != null) sketchObject.delete(); -} -"""; +) AS ( + $BQ_DATASET.tuple_sketch_int64_filter_low_high_seed(sketch, low, high, NULL) +); diff --git a/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx index 3edf784..f0f6edd 100644 --- a/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_filter_low_high_seed.sqlx @@ -24,6 +24,7 @@ RETURNS BYTES LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Returns a Tuple Sketch computed from the given sketch filtered by the given low and high values. This returns a compact tuple sketch that contains the subset of rows of the give sketch where the summary column is greater-than or equal to the given low and less-than or equal to the given high. @@ -42,9 +43,9 @@ For more information: ) AS R""" var sketchObject = null; try { - sketchObject = Module.compact_tuple_sketch_int64.deserializeFromB64(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.filterB64(low, high); -} finally { - if (sketchObject != null) sketchObject.delete(); + return Module.compact_tuple_sketch_int64.filterLowHigh(sketch, Number(low), Number(high), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); +} catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_from_theta_sketch.sqlx similarity index 70% copy from tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx copy to tuple/sqlx/tuple_sketch_int64_from_theta_sketch.sqlx index 8a99ec3..11cc7e8 100644 --- a/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_from_theta_sketch.sqlx @@ -19,28 +19,20 @@ config { hasOutput: true } -CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, value INT64, seed INT64) +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, value INT64) RETURNS BYTES -LANGUAGE js OPTIONS ( - library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], description = '''Converts the given Theta Sketch into a Tuple Sketch with a INT64 summary column set to the given INT64 value. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. Param sketch: the given Theta Sketch. This may not be NULL. Param value: the given INT64 value. This may not be NULL. -Param seed: This is used to confirm that the given sketches were configured with the correct seed. A NULL specifies the default seed = 9001. +Assumed Default Param seed: 9001. Returns: a Tuple Sketch with an INT64 summary column as BYTES. For more information: - https://datasketches.apache.org/docs/Tuple/TupleSketches.html ''' -) AS R""" -var sketchObject = null; -try { - sketchObject = Module.compact_tuple_sketch_int64.convertThetaFromB64(sketch, BigInt(value), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.serializeAsB64(); -} finally { - if (sketchObject != null) sketchObject.delete(); -} -"""; +) AS ( + $BQ_DATASET.tuple_sketch_int64_from_theta_sketch_seed(sketch, value, NULL) +); diff --git a/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx index 8a99ec3..6878e6f 100644 --- a/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_from_theta_sketch_seed.sqlx @@ -24,6 +24,7 @@ RETURNS BYTES LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Converts the given Theta Sketch into a Tuple Sketch with a INT64 summary column set to the given INT64 value. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. @@ -36,11 +37,10 @@ For more information: - https://datasketches.apache.org/docs/Tuple/TupleSketches.html ''' ) AS R""" -var sketchObject = null; try { - sketchObject = Module.compact_tuple_sketch_int64.convertThetaFromB64(sketch, BigInt(value), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.serializeAsB64(); -} finally { - if (sketchObject != null) sketchObject.delete(); + return Module.compact_tuple_sketch_int64.convertTheta(sketch, BigInt(value), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); +} catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_get_estimate_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_get_estimate_seed.sqlx index a04dd9f..2228e37 100644 --- a/tuple/sqlx/tuple_sketch_int64_get_estimate_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_get_estimate_seed.sqlx @@ -24,6 +24,7 @@ RETURNS FLOAT64 LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Returns the cardinality estimate of the given Tuple Sketch. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. This function only applies to Tuple Sketches with an INT64 summary column. @@ -37,14 +38,9 @@ For more information: ''' ) AS R""" try { - var sketchObject = null; - try { - sketchObject = Module.compact_tuple_sketch_int64.deserializeFromB64(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.getEstimate(); - } finally { - if (sketchObject != null) sketchObject.delete(); - } + return Module.compact_tuple_sketch_int64.getEstimate(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_intersection_seed_mode.sqlx b/tuple/sqlx/tuple_sketch_int64_intersection_seed_mode.sqlx index dc9c2be..3e1c42f 100644 --- a/tuple/sqlx/tuple_sketch_int64_intersection_seed_mode.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_intersection_seed_mode.sqlx @@ -24,6 +24,7 @@ RETURNS BYTES LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Computes a sketch that represents the scalar intersection of sketchA and sketchB. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. This function only applies to Tuple Sketches with an INT64 summary column. @@ -39,16 +40,9 @@ For more information: ) AS R""" const default_seed = BigInt(Module.DEFAULT_SEED); try { - var intersection = null; - try { - intersection = new Module.tuple_intersection_int64(seed ? BigInt(seed) : default_seed, mode ? mode : ""); - intersection.updateWithB64(sketchA, seed ? BigInt(seed) : default_seed); - intersection.updateWithB64(sketchB, seed ? BigInt(seed) : default_seed); - return intersection.getResultB64(); - } finally { - if (intersection != null) intersection.delete(); - } + return Module.tupleIntersectionInt64(sketchA, sketchB, seed ? BigInt(seed) : default_seed, mode ? mode : ""); } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_jaccard_similarity_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_jaccard_similarity_seed.sqlx index ef74c43..06412a2 100644 --- a/tuple/sqlx/tuple_sketch_int64_jaccard_similarity_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_jaccard_similarity_seed.sqlx @@ -24,6 +24,7 @@ RETURNS STRUCT<lower_bound FLOAT64, estimate FLOAT64, upper_bound FLOAT64> LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are to each other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are disjoint. @@ -48,6 +49,7 @@ try { upper_bound: jaccard.get(2) }; } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_to_string_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_to_string_seed.sqlx index 7432242..9ee21bc 100644 --- a/tuple/sqlx/tuple_sketch_int64_to_string_seed.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_to_string_seed.sqlx @@ -24,6 +24,7 @@ RETURNS STRING LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Returns a human readable STRING that is a short summary of the state of this sketch. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. This function only applies to Tuple Sketches with an INT64 summary column. @@ -37,14 +38,9 @@ For more information: ''' ) AS R""" try { - var sketchObject = null; - try { - sketchObject = Module.compact_tuple_sketch_int64.deserializeFromB64(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); - return sketchObject.toString(); - } finally { - if (sketchObject != null) sketchObject.delete(); - } + return Module.compact_tuple_sketch_int64.toString(sketch, seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/sqlx/tuple_sketch_int64_union_lgk_seed_mode.sqlx b/tuple/sqlx/tuple_sketch_int64_union_lgk_seed_mode.sqlx index 87156e0..52c3944 100644 --- a/tuple/sqlx/tuple_sketch_int64_union_lgk_seed_mode.sqlx +++ b/tuple/sqlx/tuple_sketch_int64_union_lgk_seed_mode.sqlx @@ -24,6 +24,7 @@ RETURNS BYTES LANGUAGE js OPTIONS ( library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', description = '''Computes a Tuple Sketch that represents the UNION of sketchA and sketchB. Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. This function only applies to Tuple Sketches with an INT64 summary column. @@ -40,16 +41,9 @@ For more information: const default_lg_k = Number(Module.DEFAULT_LG_K); const default_seed = BigInt(Module.DEFAULT_SEED); try { - var union = null; - try { - union = new Module.tuple_union_int64(lg_k ? lg_k : default_lg_k, seed ? BigInt(seed) : default_seed, mode ? mode : ""); - union.updateWithB64(sketchA, seed ? BigInt(seed) : default_seed) - union.updateWithB64(sketchB, seed ? BigInt(seed) : default_seed) - return union.getResultB64(); - } finally { - if (union != null) union.delete(); - } + return Module.tupleUnionInt64(sketchA, sketchB, lg_k ? Number(lg_k) : default_lg_k, seed ? BigInt(seed) : default_seed, mode ? mode : ""); } catch (e) { + if (e.message != null) throw e; throw new Error(Module.getExceptionMessage(e)); } """; diff --git a/tuple/test/tuple_sketch_int_test.sql b/tuple/test/tuple_sketch_int_test.sql index 8712943..01917dd 100644 --- a/tuple/test/tuple_sketch_int_test.sql +++ b/tuple/test/tuple_sketch_int_test.sql @@ -17,13 +17,13 @@ * under the License. */ +# using defaults create or replace table $BQ_DATASET.tuple_sketch(sketch bytes); insert into $BQ_DATASET.tuple_sketch -(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch_seed($BQ_DATASET.theta_sketch_agg_string(cast(value as string)), 1, null) from unnest(GENERATE_ARRAY(1, 10000, 1)) as value); +(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch($BQ_DATASET.theta_sketch_agg_string(cast(value as string)), 1) from unnest(GENERATE_ARRAY(1, 10000, 1)) as value); insert into $BQ_DATASET.tuple_sketch -(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch_seed($BQ_DATASET.theta_sketch_agg_string(cast(value as string)), 1, null) from unnest(GENERATE_ARRAY(100000, 110000, 1)) as value); - +(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch($BQ_DATASET.theta_sketch_agg_string(cast(value as string)), 1) from unnest(GENERATE_ARRAY(100000, 110000, 1)) as value); # expected about 20000 select $BQ_DATASET.tuple_sketch_int64_get_estimate( @@ -37,6 +37,38 @@ select $BQ_DATASET.tuple_sketch_int64_to_string( drop table $BQ_DATASET.tuple_sketch; +# using full signatures +create or replace table $BQ_DATASET.tuple_sketch(sketch bytes); + +insert into $BQ_DATASET.tuple_sketch +(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch_seed( + $BQ_DATASET.theta_sketch_agg_string_lgk_seed_p(cast(value as string), STRUCT<BYTEINT, INT64, FLOAT64>(10, 111, 0.999)), + 1, + 111 +) from unnest(GENERATE_ARRAY(1, 10000, 1)) as value); +insert into $BQ_DATASET.tuple_sketch +(select $BQ_DATASET.tuple_sketch_int64_from_theta_sketch_seed( + $BQ_DATASET.theta_sketch_agg_string_lgk_seed_p(cast(value as string), STRUCT<BYTEINT, INT64, FLOAT64>(10, 111, 0.999)), + 1, + 111 +) from unnest(GENERATE_ARRAY(100000, 110000, 1)) as value); + +# expected about 20000 +select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( + $BQ_DATASET.tuple_sketch_int64_agg_union_lgk_seed_mode(sketch, STRUCT<BYTEINT, INT64, STRING>(10, 111, "NOP")), + 111 +) from $BQ_DATASET.tuple_sketch; + +# expected estimate about 20000 +select $BQ_DATASET.tuple_sketch_int64_to_string_seed( + $BQ_DATASET.tuple_sketch_int64_agg_union_lgk_seed_mode(sketch, STRUCT<BYTEINT, INT64, STRING>(10, 111, "NOP")), + 111 +) from $BQ_DATASET.tuple_sketch; + +drop table $BQ_DATASET.tuple_sketch; + + +# using defaluts # expected 5 select $BQ_DATASET.tuple_sketch_int64_get_estimate( $BQ_DATASET.tuple_sketch_int64_union( @@ -45,7 +77,7 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate( ) ); -# full signatures +# using full signatures # expected 5 select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( $BQ_DATASET.tuple_sketch_int64_union_lgk_seed_mode( @@ -58,6 +90,7 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( 111 ); +# using defaluts # expected 1 select $BQ_DATASET.tuple_sketch_int64_get_estimate( $BQ_DATASET.tuple_sketch_int64_intersection( @@ -66,7 +99,7 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate( ) ); -# full signatures +# using full signatures # expected 1 select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( $BQ_DATASET.tuple_sketch_int64_intersection_seed_mode( @@ -78,6 +111,7 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( 111 ); +# using defaluts # expected 2 select $BQ_DATASET.tuple_sketch_int64_get_estimate( $BQ_DATASET.tuple_sketch_int64_a_not_b( @@ -86,7 +120,7 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate( ) ); -# full signatures +# using full signatures # expected 2 select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( $BQ_DATASET.tuple_sketch_int64_a_not_b_seed( @@ -97,17 +131,39 @@ select $BQ_DATASET.tuple_sketch_int64_get_estimate_seed( 111 ); - +# using defaluts # expected 0.2 select $BQ_DATASET.tuple_sketch_int64_jaccard_similarity( (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from unnest(["a", "b", "c"]) as str), (select $BQ_DATASET.tuple_sketch_int64_agg_string(str, 1) from unnest(["c", "d", "e"]) as str) ); -#full signatures +# using full signatures # expected 0.2 select $BQ_DATASET.tuple_sketch_int64_jaccard_similarity_seed( (select $BQ_DATASET.tuple_sketch_int64_agg_string_lgk_seed_p_mode(str, 1, STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "NOP")) from unnest(["a", "b", "c"]) as str), (select $BQ_DATASET.tuple_sketch_int64_agg_string_lgk_seed_p_mode(str, 1, STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "NOP")) from unnest(["c", "d", "e"]) as str), 111 ); + +# using defaults +# expected 1 entry +select $BQ_DATASET.tuple_sketch_int64_to_string( + $BQ_DATASET.tuple_sketch_int64_filter_low_high( + $BQ_DATASET.tuple_sketch_int64_agg_string(key, 1), + 2, + 2 + ) +) from unnest(["a", "b", "c", "c"]) as key; + +# using full signatures +# expected 1 entry +select $BQ_DATASET.tuple_sketch_int64_to_string_seed( + $BQ_DATASET.tuple_sketch_int64_filter_low_high_seed( + $BQ_DATASET.tuple_sketch_int64_agg_string_lgk_seed_p_mode(key, 1, STRUCT<BYTEINT, INT64, FLOAT64, STRING>(10, 111, 0.999, "SUM")), + 2, + 2, + 111 + ), + 111 +) from unnest(["a", "b", "c", "c"]) as key; diff --git a/tuple/tuple_sketch_int64.cpp b/tuple/tuple_sketch_int64.cpp index 576f4f7..18ae010 100644 --- a/tuple/tuple_sketch_int64.cpp +++ b/tuple/tuple_sketch_int64.cpp @@ -26,8 +26,6 @@ #include <tuple_jaccard_similarity.hpp> #include <theta_sketch.hpp> -#include "../base64.hpp" - using Summary = uint64_t; using Update = uint64_t; @@ -118,39 +116,23 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) { ; emscripten::class_<compact_tuple_sketch_int64>("compact_tuple_sketch_int64") - .class_function("deserializeFromB64", emscripten::optional_override([](const std::string& b64, uint64_t seed) { - std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); - b64_decode(b64.data(), b64.size(), bytes.data()); - return new compact_tuple_sketch_int64(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) - .class_function("deserializeFromBinary", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { - return new compact_tuple_sketch_int64(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) - .class_function("convertThetaFromB64", emscripten::optional_override([](const std::string& b64, uint64_t value, uint64_t seed) { - std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); - b64_decode(b64.data(), b64.size(), bytes.data()); + .class_function("convertTheta", emscripten::optional_override([](const std::string& theta_sketch_bytes, uint64_t value, uint64_t seed) { // converting constructor does not currently take wrapped compact theta sketch - const auto sketch = datasketches::compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed); - return new compact_tuple_sketch_int64(sketch, value); - }), emscripten::allow_raw_pointers()) - .function("getEstimate", emscripten::optional_override([](const compact_tuple_sketch_int64& self) { - return self.get_estimate(); + const auto sketch = datasketches::compact_theta_sketch::deserialize(theta_sketch_bytes.data(), theta_sketch_bytes.size(), seed); + auto bytes = compact_tuple_sketch_int64(sketch, value).serialize(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) - .function("toString", emscripten::optional_override([](const compact_tuple_sketch_int64& self) { - return std::string(self.to_string()); + .class_function("getEstimate", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) { + return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_estimate(); })) - .function("serializeAsB64", emscripten::optional_override([](const compact_tuple_sketch_int64& self) { - auto bytes = self.serialize(); - std::vector<char> b64(b64_enc_len(bytes.size())); - b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); - return std::string(b64.data(), b64.size()); + .class_function("toString", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) { + return std::string(compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).to_string()); })) - .function("filterB64", emscripten::optional_override([](const compact_tuple_sketch_int64& self, int low, int high) { - auto sketch = self.filter([low, high](int v){return v >= low && v <= high;}); - auto bytes = sketch.serialize(); - std::vector<char> b64(b64_enc_len(bytes.size())); - b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); - return std::string(b64.data(), b64.size()); + .class_function("filterLowHigh", emscripten::optional_override([](const std::string& sketch_bytes, int low, int high, uint64_t seed) { + auto bytes = compact_tuple_sketch_int64::deserialize( + sketch_bytes.data(), sketch_bytes.size(), seed + ).filter([low, high](int v){return v >= low && v <= high;}).serialize(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) ; @@ -165,65 +147,42 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) { .function("updateWithBytes", emscripten::optional_override([](tuple_union_int64& self, const std::string& bytes, uint64_t seed) { self.update(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed)); }), emscripten::allow_raw_pointers()) - .function("updateWithB64", emscripten::optional_override([](tuple_union_int64& self, const std::string& b64, uint64_t seed) { - std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); - b64_decode(b64.data(), b64.size(), bytes.data()); - self.update(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) .function("getResultAsUint8Array", emscripten::optional_override([](tuple_union_int64& self) { auto bytes = self.get_result().serialize(); return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); })) - .function("getResultB64", emscripten::optional_override([](tuple_union_int64& self) { - auto bytes = self.get_result().serialize(); - std::vector<char> b64(b64_enc_len(bytes.size())); - b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); - return std::string(b64.data(), b64.size()); - })) ; - emscripten::class_<tuple_intersection_int64>("tuple_intersection_int64") - .constructor(emscripten::optional_override([](uint64_t seed, const std::string& mode_str) { - return new tuple_intersection_int64(seed, tuple_intersection_policy<Summary>(convert_mode(mode_str))); - })) - .function("updateWithB64", emscripten::optional_override([](tuple_intersection_int64& self, const std::string& b64, uint64_t seed) { - std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); - b64_decode(b64.data(), b64.size(), bytes.data()); - self.update(compact_tuple_sketch_int64::deserialize(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) - .function("getResultB64", emscripten::optional_override([](tuple_intersection_int64& self) { - auto bytes = self.get_result().serialize(); - std::vector<char> b64(b64_enc_len(bytes.size())); - b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); - return std::string(b64.data(), b64.size()); - })) - ; + emscripten::function("tupleUnionInt64", emscripten::optional_override([]( + const std::string& bytes1, const std::string& bytes2, uint8_t lg_k, uint64_t seed, const std::string& mode_str + ) { + const auto policy = tuple_union_policy<Summary>(convert_mode(mode_str)); + auto u = tuple_union_int64(tuple_union_int64::builder(policy).set_lg_k(lg_k).set_seed(seed).build()); + u.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed)); + u.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed)); + const auto bytes = u.get_result().serialize(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); + })); - emscripten::class_<tuple_a_not_b_int64>("tuple_a_not_b_int64") - .constructor(emscripten::optional_override([](uint64_t seed) { - return new tuple_a_not_b_int64(seed); - })) - .function("computeWithB64ReturnB64", emscripten::optional_override([](tuple_a_not_b_int64& self, - const std::string& b64_1, const std::string& b64_2, uint64_t seed) { - std::vector<char> bytes1(b64_dec_len(b64_1.data(), b64_1.size())); - b64_decode(b64_1.data(), b64_1.size(), bytes1.data()); - std::vector<char> bytes2(b64_dec_len(b64_2.data(), b64_2.size())); - b64_decode(b64_2.data(), b64_2.size(), bytes2.data()); - auto bytes = self.compute( - compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed), - compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed) - ).serialize(); - std::vector<char> b64(b64_enc_len(bytes.size())); - b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); - return std::string(b64.data(), b64.size()); - })) - ; + emscripten::function("tupleIntersectionInt64", emscripten::optional_override([]( + const std::string& bytes1, const std::string& bytes2, uint64_t seed, const std::string& mode_str + ) { + tuple_intersection_int64 intersection(seed, tuple_intersection_policy<Summary>(convert_mode(mode_str))); + intersection.update(compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed)); + intersection.update(compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed)); + const auto bytes = intersection.get_result().serialize(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); + })); + + emscripten::function("tupleAnotBInt64", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) { + auto bytes = tuple_a_not_b_int64(seed).compute( + compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed), + compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed) + ).serialize(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); + })); - emscripten::function("tupleInt64JaccardSimilarity", emscripten::optional_override([](const std::string& sketch1_b64, const std::string& sketch2_b64, uint64_t seed) { - std::vector<char> bytes1(b64_dec_len(sketch1_b64.data(), sketch1_b64.size())); - b64_decode(sketch1_b64.data(), sketch1_b64.size(), bytes1.data()); - std::vector<char> bytes2(b64_dec_len(sketch2_b64.data(), sketch2_b64.size())); - b64_decode(sketch2_b64.data(), sketch2_b64.size(), bytes2.data()); + emscripten::function("tupleInt64JaccardSimilarity", emscripten::optional_override([](const std::string& bytes1, const std::string& bytes2, uint64_t seed) { const auto arr = tuple_jaccard_similarity_int64::jaccard( compact_tuple_sketch_int64::deserialize(bytes1.data(), bytes1.size(), seed), compact_tuple_sketch_int64::deserialize(bytes2.data(), bytes2.size(), seed), --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
