This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch theta_docs in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit 195c07f6fffdf09d7b913059540ba0c857c71e67 Author: AlexanderSaydakov <[email protected]> AuthorDate: Fri Jan 31 18:40:06 2025 -0800 better wording --- theta/sqlx/theta_sketch_agg_int64_nop.sqlx | 140 +++++++++++++++++++++ theta/sqlx/theta_sketch_get_estimate.sqlx | 6 +- .../sqlx/theta_sketch_get_estimate_and_bounds.sqlx | 2 +- .../theta_sketch_get_estimate_and_bounds_seed.sqlx | 2 +- theta/sqlx/theta_sketch_get_estimate_seed.sqlx | 4 +- 5 files changed, 147 insertions(+), 7 deletions(-) diff --git a/theta/sqlx/theta_sketch_agg_int64_nop.sqlx b/theta/sqlx/theta_sketch_agg_int64_nop.sqlx new file mode 100644 index 0000000..4dd2bd3 --- /dev/null +++ b/theta/sqlx/theta_sketch_agg_int64_nop.sqlx @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true, tags: ["theta", "udfs"] } + +CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(value INT64, params STRUCT<lg_k BYTEINT, seed INT64, p FLOAT64> NOT AGGREGATE) +RETURNS BYTES +LANGUAGE js +OPTIONS ( + library=["${dataform.projectConfig.vars.jsBucket}/theta.mjs"], + description = '''Creates a sketch that represents the cardinality of the given INT64 column. + +Param value: the INT64 column of identifiers. 
+Param lg_k: the sketch accuracy/size parameter as a BYTEINT in the range [4, 26]. A NULL specifies the default of 12. +Param seed: the seed to be used by the underlying hash function. A NULL specifies the default of 9001. +Param p: up-front sampling probability. A NULL specifies the default of 1.0. +Returns: a Compact, Compressed Theta Sketch, as BYTES. + +For more information: + - https://datasketches.apache.org/docs/Theta/ThetaSketches.html + ''' +) AS R""" +import ModuleFactory from "${dataform.projectConfig.vars.jsBucket}/theta.mjs"; +var Module = await ModuleFactory(); +const default_lg_k = Number(Module.DEFAULT_LG_K); +const default_seed = BigInt(Module.DEFAULT_SEED); +const default_p = 1.0; + +// UDAF interface +export function initialState(params) { + return { + lg_k: params.lg_k == null ? default_lg_k : Number(params.lg_k), + seed: params.seed == null ? default_seed : BigInt(params.seed), + p: params.p == null ? default_p : params.p + }; +} + +export function aggregate(state, value) { + if (value == null) return; +// try { +// if (state.sketch == null) { +// state.sketch = new Module.update_theta_sketch(state.lg_k, state.seed, state.p); +// } +// state.sketch.updateInt64(value); +// } catch (e) { +// if (e.message != null) throw e; +// throw new Error(Module.getExceptionMessage(e)); +// } +} + +export function serialize(state) { + if (state.sketch == null && state.union == null) return state; // for transition deserialize-serialize + return state; +/* try { + // for prior transition deserialize-aggregate + // merge aggregated and serialized state + if (state.sketch != null) { + if (state.serialized != null) { + var u = null; + try { + u = new Module.theta_union(state.lg_k, state.seed); + u.updateWithUpdateSketch(state.sketch); + u.updateWithBytes(state.serialized, state.seed); + state.serialized = u.getResultAsUint8ArrayCompressed(); + } finally { + if (u != null) u.delete(); + } + } else { + state.serialized = 
state.sketch.serializeAsUint8ArrayCompressed(); + } + state.sketch.delete(); + delete state.sketch; + } else if (state.union != null) { + state.serialized = state.union.getResultAsUint8ArrayCompressed(); + state.union.delete(); + delete state.union; + } + return state; + } catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); + } finally { + if (state.sketch != null) state.sketch.delete(); + if (state.union != null) state.unon.delete(); + }*/ +} + +export function deserialize(state) { + return state; +} + +export function merge(state, other_state) { +/* try { + if (state.union == null) { + state.union = new Module.theta_union(state.lg_k, state.seed); + } + if (state.serialized != null) { + state.union.updateWithBytes(state.serialized, state.seed); + delete state.serialized; + } + if (other_state.serialized != null) { + state.union.updateWithBytes(other_state.serialized, state.seed); + delete other_state.serialized; + } + } catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); + }*/ +} + +export function finalize(state) { +// return serialize(state).serialized + var sketch = null; + try { + sketch = new Module.update_theta_sketch(state.lg_k, state.seed, state.p); + return sketch.serializeAsUint8ArrayCompressed(); + } catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); + } finally { + if (sketch != null) sketch.delete(); + } +} +"""; diff --git a/theta/sqlx/theta_sketch_get_estimate.sqlx b/theta/sqlx/theta_sketch_get_estimate.sqlx index ec0af76..2cdd52a 100644 --- a/theta/sqlx/theta_sketch_get_estimate.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate.sqlx @@ -22,11 +22,11 @@ config { hasOutput: true, tags: ["theta", "udfs"] } CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES) RETURNS FLOAT64 OPTIONS ( - description = '''Gets cardinality estimate and bounds from given sketch. 
- + description = '''Gets distinct count estimate from a given sketch. + Param sketch: The given sketch to query as BYTES. Defaults: seed = 9001. -Returns: a FLOAT64 value as the cardinality estimate. +Returns: distinct count estimate as FLOAT64. For more information: - https://datasketches.apache.org/docs/Theta/ThetaSketches.html diff --git a/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx b/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx index c2a9e68..12b1266 100644 --- a/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate_and_bounds.sqlx @@ -22,7 +22,7 @@ config { hasOutput: true, tags: ["theta", "udfs"] } CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, num_std_devs BYTEINT) RETURNS STRUCT<estimate FLOAT64, lower_bound FLOAT64, upper_bound FLOAT64> OPTIONS ( - description = '''Gets cardinality estimate and bounds from given sketch. + description = '''Gets distinct count estimate and bounds from a given sketch. Param sketch: The given sketch to query as BYTES. Param num_std_devs: The returned bounds will be based on the statistical confidence interval diff --git a/theta/sqlx/theta_sketch_get_estimate_and_bounds_seed.sqlx b/theta/sqlx/theta_sketch_get_estimate_and_bounds_seed.sqlx index 579edf4..3702e4b 100644 --- a/theta/sqlx/theta_sketch_get_estimate_and_bounds_seed.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate_and_bounds_seed.sqlx @@ -25,7 +25,7 @@ LANGUAGE js OPTIONS ( library=["${dataform.projectConfig.vars.jsBucket}/theta.js"], js_parameter_encoding_mode='STANDARD', - description = '''Gets cardinality estimate and bounds from given sketch. + description = '''Gets distinct count estimate and bounds from a given sketch. Param sketch: The given sketch to query as BYTES. 
Param num_std_devs: The returned bounds will be based on the statistical confidence interval diff --git a/theta/sqlx/theta_sketch_get_estimate_seed.sqlx b/theta/sqlx/theta_sketch_get_estimate_seed.sqlx index 78163cf..7abde98 100644 --- a/theta/sqlx/theta_sketch_get_estimate_seed.sqlx +++ b/theta/sqlx/theta_sketch_get_estimate_seed.sqlx @@ -25,11 +25,11 @@ LANGUAGE js OPTIONS ( library=["${dataform.projectConfig.vars.jsBucket}/theta.js"], js_parameter_encoding_mode='STANDARD', - description = '''Gets cardinality estimate and bounds from given sketch. + description = '''Gets distinct count estimate from a given sketch. Param sketch: The given sketch to query as BYTES. Param seed: This is used to confirm that the given sketch was configured with the correct seed. -Returns: a FLOAT64 value as the cardinality estimate. +Returns: distinct count estimate as FLOAT64. For more information: - https://datasketches.apache.org/docs/Theta/ThetaSketches.html --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
